Commit 39cc98f1f8aa949afeea89f424c7494b0785d7da
Committed by: Linus Torvalds
1 parent: d149e3b25d
Exists in: master and in 20 other branches
memcg: remove pointless next_mz nullification in mem_cgroup_soft_limit_reclaim()
next_mz is assigned NULL if __mem_cgroup_largest_soft_limit_node selects the same mz. This doesn't make much sense, as the variable is reassigned right at the start of the next loop iteration. The compiler will probably optimize it out, but it is a little confusing when reading the code.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
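The changed hunk sits further down in mem_cgroup_soft_limit_reclaim() and is not part of the excerpt shown below, so the following is only a minimal stand-alone sketch of the pattern the patch removes, not the kernel code itself: pick_largest() and the integer "node ids" are invented stand-ins for __mem_cgroup_largest_soft_limit_node() and the mem_cgroup_per_zone nodes. It shows why clearing next_mz just before the next loop pass reassigns it has no observable effect.

#include <stdio.h>

/*
 * Invented stand-in for __mem_cgroup_largest_soft_limit_node(): hands back
 * the "largest" node id, returning the same one a couple of times before a
 * different one shows up, the way the soft-limit tree can.
 */
static int picks[] = { 7, 7, 3 };
static unsigned int idx;

static int pick_largest(void)
{
	return picks[idx++];
}

int main(void)
{
	int mz = 7;		/* the node we just reclaimed from */
	int next_mz;

	do {
		next_mz = pick_largest();	/* reassigned on every pass */
		if (next_mz == mz) {
			printf("tree returned the same node, retry\n");
			/*
			 * next_mz = 0;
			 *
			 * This is the kind of nullification the patch drops:
			 * next_mz is overwritten by pick_largest() at the top
			 * of the next iteration, so clearing it here changes
			 * nothing.  (In the kernel, this branch also drops the
			 * duplicate css reference with css_put(); that part
			 * stays.)
			 */
		} else		/* a different node, or nothing left */
			break;
	} while (1);

	printf("next node to reclaim from: %d\n", next_mz);
	return 0;
}

With the assignment gone, the branch can collapse to a plain if/else without braces, which would be consistent with the "2 additions and 3 deletions" summary below.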
Showing 1 changed file with 2 additions and 3 deletions.
mm/memcontrol.c
1 | /* memcontrol.c - Memory Controller | 1 | /* memcontrol.c - Memory Controller |
2 | * | 2 | * |
3 | * Copyright IBM Corporation, 2007 | 3 | * Copyright IBM Corporation, 2007 |
4 | * Author Balbir Singh <balbir@linux.vnet.ibm.com> | 4 | * Author Balbir Singh <balbir@linux.vnet.ibm.com> |
5 | * | 5 | * |
6 | * Copyright 2007 OpenVZ SWsoft Inc | 6 | * Copyright 2007 OpenVZ SWsoft Inc |
7 | * Author: Pavel Emelianov <xemul@openvz.org> | 7 | * Author: Pavel Emelianov <xemul@openvz.org> |
8 | * | 8 | * |
9 | * Memory thresholds | 9 | * Memory thresholds |
10 | * Copyright (C) 2009 Nokia Corporation | 10 | * Copyright (C) 2009 Nokia Corporation |
11 | * Author: Kirill A. Shutemov | 11 | * Author: Kirill A. Shutemov |
12 | * | 12 | * |
13 | * This program is free software; you can redistribute it and/or modify | 13 | * This program is free software; you can redistribute it and/or modify |
14 | * it under the terms of the GNU General Public License as published by | 14 | * it under the terms of the GNU General Public License as published by |
15 | * the Free Software Foundation; either version 2 of the License, or | 15 | * the Free Software Foundation; either version 2 of the License, or |
16 | * (at your option) any later version. | 16 | * (at your option) any later version. |
17 | * | 17 | * |
18 | * This program is distributed in the hope that it will be useful, | 18 | * This program is distributed in the hope that it will be useful, |
19 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 19 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
20 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 20 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
21 | * GNU General Public License for more details. | 21 | * GNU General Public License for more details. |
22 | */ | 22 | */ |
23 | 23 | ||
24 | #include <linux/res_counter.h> | 24 | #include <linux/res_counter.h> |
25 | #include <linux/memcontrol.h> | 25 | #include <linux/memcontrol.h> |
26 | #include <linux/cgroup.h> | 26 | #include <linux/cgroup.h> |
27 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/hugetlb.h> | 28 | #include <linux/hugetlb.h> |
29 | #include <linux/pagemap.h> | 29 | #include <linux/pagemap.h> |
30 | #include <linux/smp.h> | 30 | #include <linux/smp.h> |
31 | #include <linux/page-flags.h> | 31 | #include <linux/page-flags.h> |
32 | #include <linux/backing-dev.h> | 32 | #include <linux/backing-dev.h> |
33 | #include <linux/bit_spinlock.h> | 33 | #include <linux/bit_spinlock.h> |
34 | #include <linux/rcupdate.h> | 34 | #include <linux/rcupdate.h> |
35 | #include <linux/limits.h> | 35 | #include <linux/limits.h> |
36 | #include <linux/mutex.h> | 36 | #include <linux/mutex.h> |
37 | #include <linux/rbtree.h> | 37 | #include <linux/rbtree.h> |
38 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
39 | #include <linux/swap.h> | 39 | #include <linux/swap.h> |
40 | #include <linux/swapops.h> | 40 | #include <linux/swapops.h> |
41 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
42 | #include <linux/eventfd.h> | 42 | #include <linux/eventfd.h> |
43 | #include <linux/sort.h> | 43 | #include <linux/sort.h> |
44 | #include <linux/fs.h> | 44 | #include <linux/fs.h> |
45 | #include <linux/seq_file.h> | 45 | #include <linux/seq_file.h> |
46 | #include <linux/vmalloc.h> | 46 | #include <linux/vmalloc.h> |
47 | #include <linux/mm_inline.h> | 47 | #include <linux/mm_inline.h> |
48 | #include <linux/page_cgroup.h> | 48 | #include <linux/page_cgroup.h> |
49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
50 | #include <linux/oom.h> | 50 | #include <linux/oom.h> |
51 | #include "internal.h" | 51 | #include "internal.h" |
52 | 52 | ||
53 | #include <asm/uaccess.h> | 53 | #include <asm/uaccess.h> |
54 | 54 | ||
55 | #include <trace/events/vmscan.h> | 55 | #include <trace/events/vmscan.h> |
56 | 56 | ||
57 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 57 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
58 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 58 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
59 | struct mem_cgroup *root_mem_cgroup __read_mostly; | 59 | struct mem_cgroup *root_mem_cgroup __read_mostly; |
60 | 60 | ||
61 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 61 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
62 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 62 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
63 | int do_swap_account __read_mostly; | 63 | int do_swap_account __read_mostly; |
64 | 64 | ||
65 | /* for remember boot option*/ | 65 | /* for remember boot option*/ |
66 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED | 66 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED |
67 | static int really_do_swap_account __initdata = 1; | 67 | static int really_do_swap_account __initdata = 1; |
68 | #else | 68 | #else |
69 | static int really_do_swap_account __initdata = 0; | 69 | static int really_do_swap_account __initdata = 0; |
70 | #endif | 70 | #endif |
71 | 71 | ||
72 | #else | 72 | #else |
73 | #define do_swap_account (0) | 73 | #define do_swap_account (0) |
74 | #endif | 74 | #endif |
75 | 75 | ||
76 | 76 | ||
77 | /* | 77 | /* |
78 | * Statistics for memory cgroup. | 78 | * Statistics for memory cgroup. |
79 | */ | 79 | */ |
80 | enum mem_cgroup_stat_index { | 80 | enum mem_cgroup_stat_index { |
81 | /* | 81 | /* |
82 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | 82 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. |
83 | */ | 83 | */ |
84 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 84 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
85 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 85 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
86 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 86 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
87 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 87 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
88 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ | 88 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ |
89 | MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ | 89 | MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ |
90 | MEM_CGROUP_STAT_NSTATS, | 90 | MEM_CGROUP_STAT_NSTATS, |
91 | }; | 91 | }; |
92 | 92 | ||
93 | enum mem_cgroup_events_index { | 93 | enum mem_cgroup_events_index { |
94 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ | 94 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ |
95 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ | 95 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ |
96 | MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ | 96 | MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ |
97 | MEM_CGROUP_EVENTS_NSTATS, | 97 | MEM_CGROUP_EVENTS_NSTATS, |
98 | }; | 98 | }; |
99 | /* | 99 | /* |
100 | * Per memcg event counter is incremented at every pagein/pageout. With THP, | 100 | * Per memcg event counter is incremented at every pagein/pageout. With THP, |
101 | * it will be incremated by the number of pages. This counter is used for | 101 | * it will be incremated by the number of pages. This counter is used for |
102 | * for trigger some periodic events. This is straightforward and better | 102 | * for trigger some periodic events. This is straightforward and better |
103 | * than using jiffies etc. to handle periodic memcg event. | 103 | * than using jiffies etc. to handle periodic memcg event. |
104 | */ | 104 | */ |
105 | enum mem_cgroup_events_target { | 105 | enum mem_cgroup_events_target { |
106 | MEM_CGROUP_TARGET_THRESH, | 106 | MEM_CGROUP_TARGET_THRESH, |
107 | MEM_CGROUP_TARGET_SOFTLIMIT, | 107 | MEM_CGROUP_TARGET_SOFTLIMIT, |
108 | MEM_CGROUP_NTARGETS, | 108 | MEM_CGROUP_NTARGETS, |
109 | }; | 109 | }; |
110 | #define THRESHOLDS_EVENTS_TARGET (128) | 110 | #define THRESHOLDS_EVENTS_TARGET (128) |
111 | #define SOFTLIMIT_EVENTS_TARGET (1024) | 111 | #define SOFTLIMIT_EVENTS_TARGET (1024) |
112 | 112 | ||
113 | struct mem_cgroup_stat_cpu { | 113 | struct mem_cgroup_stat_cpu { |
114 | long count[MEM_CGROUP_STAT_NSTATS]; | 114 | long count[MEM_CGROUP_STAT_NSTATS]; |
115 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; | 115 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; |
116 | unsigned long targets[MEM_CGROUP_NTARGETS]; | 116 | unsigned long targets[MEM_CGROUP_NTARGETS]; |
117 | }; | 117 | }; |
118 | 118 | ||
119 | /* | 119 | /* |
120 | * per-zone information in memory controller. | 120 | * per-zone information in memory controller. |
121 | */ | 121 | */ |
122 | struct mem_cgroup_per_zone { | 122 | struct mem_cgroup_per_zone { |
123 | /* | 123 | /* |
124 | * spin_lock to protect the per cgroup LRU | 124 | * spin_lock to protect the per cgroup LRU |
125 | */ | 125 | */ |
126 | struct list_head lists[NR_LRU_LISTS]; | 126 | struct list_head lists[NR_LRU_LISTS]; |
127 | unsigned long count[NR_LRU_LISTS]; | 127 | unsigned long count[NR_LRU_LISTS]; |
128 | 128 | ||
129 | struct zone_reclaim_stat reclaim_stat; | 129 | struct zone_reclaim_stat reclaim_stat; |
130 | struct rb_node tree_node; /* RB tree node */ | 130 | struct rb_node tree_node; /* RB tree node */ |
131 | unsigned long long usage_in_excess;/* Set to the value by which */ | 131 | unsigned long long usage_in_excess;/* Set to the value by which */ |
132 | /* the soft limit is exceeded*/ | 132 | /* the soft limit is exceeded*/ |
133 | bool on_tree; | 133 | bool on_tree; |
134 | struct mem_cgroup *mem; /* Back pointer, we cannot */ | 134 | struct mem_cgroup *mem; /* Back pointer, we cannot */ |
135 | /* use container_of */ | 135 | /* use container_of */ |
136 | }; | 136 | }; |
137 | /* Macro for accessing counter */ | 137 | /* Macro for accessing counter */ |
138 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | 138 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) |
139 | 139 | ||
140 | struct mem_cgroup_per_node { | 140 | struct mem_cgroup_per_node { |
141 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 141 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
142 | }; | 142 | }; |
143 | 143 | ||
144 | struct mem_cgroup_lru_info { | 144 | struct mem_cgroup_lru_info { |
145 | struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; | 145 | struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; |
146 | }; | 146 | }; |
147 | 147 | ||
148 | /* | 148 | /* |
149 | * Cgroups above their limits are maintained in a RB-Tree, independent of | 149 | * Cgroups above their limits are maintained in a RB-Tree, independent of |
150 | * their hierarchy representation | 150 | * their hierarchy representation |
151 | */ | 151 | */ |
152 | 152 | ||
153 | struct mem_cgroup_tree_per_zone { | 153 | struct mem_cgroup_tree_per_zone { |
154 | struct rb_root rb_root; | 154 | struct rb_root rb_root; |
155 | spinlock_t lock; | 155 | spinlock_t lock; |
156 | }; | 156 | }; |
157 | 157 | ||
158 | struct mem_cgroup_tree_per_node { | 158 | struct mem_cgroup_tree_per_node { |
159 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | 159 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; |
160 | }; | 160 | }; |
161 | 161 | ||
162 | struct mem_cgroup_tree { | 162 | struct mem_cgroup_tree { |
163 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | 163 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; |
164 | }; | 164 | }; |
165 | 165 | ||
166 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | 166 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; |
167 | 167 | ||
168 | struct mem_cgroup_threshold { | 168 | struct mem_cgroup_threshold { |
169 | struct eventfd_ctx *eventfd; | 169 | struct eventfd_ctx *eventfd; |
170 | u64 threshold; | 170 | u64 threshold; |
171 | }; | 171 | }; |
172 | 172 | ||
173 | /* For threshold */ | 173 | /* For threshold */ |
174 | struct mem_cgroup_threshold_ary { | 174 | struct mem_cgroup_threshold_ary { |
175 | /* An array index points to threshold just below usage. */ | 175 | /* An array index points to threshold just below usage. */ |
176 | int current_threshold; | 176 | int current_threshold; |
177 | /* Size of entries[] */ | 177 | /* Size of entries[] */ |
178 | unsigned int size; | 178 | unsigned int size; |
179 | /* Array of thresholds */ | 179 | /* Array of thresholds */ |
180 | struct mem_cgroup_threshold entries[0]; | 180 | struct mem_cgroup_threshold entries[0]; |
181 | }; | 181 | }; |
182 | 182 | ||
183 | struct mem_cgroup_thresholds { | 183 | struct mem_cgroup_thresholds { |
184 | /* Primary thresholds array */ | 184 | /* Primary thresholds array */ |
185 | struct mem_cgroup_threshold_ary *primary; | 185 | struct mem_cgroup_threshold_ary *primary; |
186 | /* | 186 | /* |
187 | * Spare threshold array. | 187 | * Spare threshold array. |
188 | * This is needed to make mem_cgroup_unregister_event() "never fail". | 188 | * This is needed to make mem_cgroup_unregister_event() "never fail". |
189 | * It must be able to store at least primary->size - 1 entries. | 189 | * It must be able to store at least primary->size - 1 entries. |
190 | */ | 190 | */ |
191 | struct mem_cgroup_threshold_ary *spare; | 191 | struct mem_cgroup_threshold_ary *spare; |
192 | }; | 192 | }; |
193 | 193 | ||
194 | /* for OOM */ | 194 | /* for OOM */ |
195 | struct mem_cgroup_eventfd_list { | 195 | struct mem_cgroup_eventfd_list { |
196 | struct list_head list; | 196 | struct list_head list; |
197 | struct eventfd_ctx *eventfd; | 197 | struct eventfd_ctx *eventfd; |
198 | }; | 198 | }; |
199 | 199 | ||
200 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | 200 | static void mem_cgroup_threshold(struct mem_cgroup *mem); |
201 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem); | 201 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem); |
202 | 202 | ||
203 | /* | 203 | /* |
204 | * The memory controller data structure. The memory controller controls both | 204 | * The memory controller data structure. The memory controller controls both |
205 | * page cache and RSS per cgroup. We would eventually like to provide | 205 | * page cache and RSS per cgroup. We would eventually like to provide |
206 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | 206 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
207 | * to help the administrator determine what knobs to tune. | 207 | * to help the administrator determine what knobs to tune. |
208 | * | 208 | * |
209 | * TODO: Add a water mark for the memory controller. Reclaim will begin when | 209 | * TODO: Add a water mark for the memory controller. Reclaim will begin when |
210 | * we hit the water mark. May be even add a low water mark, such that | 210 | * we hit the water mark. May be even add a low water mark, such that |
211 | * no reclaim occurs from a cgroup at it's low water mark, this is | 211 | * no reclaim occurs from a cgroup at it's low water mark, this is |
212 | * a feature that will be implemented much later in the future. | 212 | * a feature that will be implemented much later in the future. |
213 | */ | 213 | */ |
214 | struct mem_cgroup { | 214 | struct mem_cgroup { |
215 | struct cgroup_subsys_state css; | 215 | struct cgroup_subsys_state css; |
216 | /* | 216 | /* |
217 | * the counter to account for memory usage | 217 | * the counter to account for memory usage |
218 | */ | 218 | */ |
219 | struct res_counter res; | 219 | struct res_counter res; |
220 | /* | 220 | /* |
221 | * the counter to account for mem+swap usage. | 221 | * the counter to account for mem+swap usage. |
222 | */ | 222 | */ |
223 | struct res_counter memsw; | 223 | struct res_counter memsw; |
224 | /* | 224 | /* |
225 | * Per cgroup active and inactive list, similar to the | 225 | * Per cgroup active and inactive list, similar to the |
226 | * per zone LRU lists. | 226 | * per zone LRU lists. |
227 | */ | 227 | */ |
228 | struct mem_cgroup_lru_info info; | 228 | struct mem_cgroup_lru_info info; |
229 | /* | 229 | /* |
230 | * While reclaiming in a hierarchy, we cache the last child we | 230 | * While reclaiming in a hierarchy, we cache the last child we |
231 | * reclaimed from. | 231 | * reclaimed from. |
232 | */ | 232 | */ |
233 | int last_scanned_child; | 233 | int last_scanned_child; |
234 | /* | 234 | /* |
235 | * Should the accounting and control be hierarchical, per subtree? | 235 | * Should the accounting and control be hierarchical, per subtree? |
236 | */ | 236 | */ |
237 | bool use_hierarchy; | 237 | bool use_hierarchy; |
238 | atomic_t oom_lock; | 238 | atomic_t oom_lock; |
239 | atomic_t refcnt; | 239 | atomic_t refcnt; |
240 | 240 | ||
241 | unsigned int swappiness; | 241 | unsigned int swappiness; |
242 | /* OOM-Killer disable */ | 242 | /* OOM-Killer disable */ |
243 | int oom_kill_disable; | 243 | int oom_kill_disable; |
244 | 244 | ||
245 | /* set when res.limit == memsw.limit */ | 245 | /* set when res.limit == memsw.limit */ |
246 | bool memsw_is_minimum; | 246 | bool memsw_is_minimum; |
247 | 247 | ||
248 | /* protect arrays of thresholds */ | 248 | /* protect arrays of thresholds */ |
249 | struct mutex thresholds_lock; | 249 | struct mutex thresholds_lock; |
250 | 250 | ||
251 | /* thresholds for memory usage. RCU-protected */ | 251 | /* thresholds for memory usage. RCU-protected */ |
252 | struct mem_cgroup_thresholds thresholds; | 252 | struct mem_cgroup_thresholds thresholds; |
253 | 253 | ||
254 | /* thresholds for mem+swap usage. RCU-protected */ | 254 | /* thresholds for mem+swap usage. RCU-protected */ |
255 | struct mem_cgroup_thresholds memsw_thresholds; | 255 | struct mem_cgroup_thresholds memsw_thresholds; |
256 | 256 | ||
257 | /* For oom notifier event fd */ | 257 | /* For oom notifier event fd */ |
258 | struct list_head oom_notify; | 258 | struct list_head oom_notify; |
259 | 259 | ||
260 | /* | 260 | /* |
261 | * Should we move charges of a task when a task is moved into this | 261 | * Should we move charges of a task when a task is moved into this |
262 | * mem_cgroup ? And what type of charges should we move ? | 262 | * mem_cgroup ? And what type of charges should we move ? |
263 | */ | 263 | */ |
264 | unsigned long move_charge_at_immigrate; | 264 | unsigned long move_charge_at_immigrate; |
265 | /* | 265 | /* |
266 | * percpu counter. | 266 | * percpu counter. |
267 | */ | 267 | */ |
268 | struct mem_cgroup_stat_cpu *stat; | 268 | struct mem_cgroup_stat_cpu *stat; |
269 | /* | 269 | /* |
270 | * used when a cpu is offlined or other synchronizations | 270 | * used when a cpu is offlined or other synchronizations |
271 | * See mem_cgroup_read_stat(). | 271 | * See mem_cgroup_read_stat(). |
272 | */ | 272 | */ |
273 | struct mem_cgroup_stat_cpu nocpu_base; | 273 | struct mem_cgroup_stat_cpu nocpu_base; |
274 | spinlock_t pcp_counter_lock; | 274 | spinlock_t pcp_counter_lock; |
275 | }; | 275 | }; |
276 | 276 | ||
277 | /* Stuffs for move charges at task migration. */ | 277 | /* Stuffs for move charges at task migration. */ |
278 | /* | 278 | /* |
279 | * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a | 279 | * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a |
280 | * left-shifted bitmap of these types. | 280 | * left-shifted bitmap of these types. |
281 | */ | 281 | */ |
282 | enum move_type { | 282 | enum move_type { |
283 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | 283 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ |
284 | MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ | 284 | MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ |
285 | NR_MOVE_TYPE, | 285 | NR_MOVE_TYPE, |
286 | }; | 286 | }; |
287 | 287 | ||
288 | /* "mc" and its members are protected by cgroup_mutex */ | 288 | /* "mc" and its members are protected by cgroup_mutex */ |
289 | static struct move_charge_struct { | 289 | static struct move_charge_struct { |
290 | spinlock_t lock; /* for from, to */ | 290 | spinlock_t lock; /* for from, to */ |
291 | struct mem_cgroup *from; | 291 | struct mem_cgroup *from; |
292 | struct mem_cgroup *to; | 292 | struct mem_cgroup *to; |
293 | unsigned long precharge; | 293 | unsigned long precharge; |
294 | unsigned long moved_charge; | 294 | unsigned long moved_charge; |
295 | unsigned long moved_swap; | 295 | unsigned long moved_swap; |
296 | struct task_struct *moving_task; /* a task moving charges */ | 296 | struct task_struct *moving_task; /* a task moving charges */ |
297 | wait_queue_head_t waitq; /* a waitq for other context */ | 297 | wait_queue_head_t waitq; /* a waitq for other context */ |
298 | } mc = { | 298 | } mc = { |
299 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), | 299 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), |
300 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | 300 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), |
301 | }; | 301 | }; |
302 | 302 | ||
303 | static bool move_anon(void) | 303 | static bool move_anon(void) |
304 | { | 304 | { |
305 | return test_bit(MOVE_CHARGE_TYPE_ANON, | 305 | return test_bit(MOVE_CHARGE_TYPE_ANON, |
306 | &mc.to->move_charge_at_immigrate); | 306 | &mc.to->move_charge_at_immigrate); |
307 | } | 307 | } |
308 | 308 | ||
309 | static bool move_file(void) | 309 | static bool move_file(void) |
310 | { | 310 | { |
311 | return test_bit(MOVE_CHARGE_TYPE_FILE, | 311 | return test_bit(MOVE_CHARGE_TYPE_FILE, |
312 | &mc.to->move_charge_at_immigrate); | 312 | &mc.to->move_charge_at_immigrate); |
313 | } | 313 | } |
314 | 314 | ||
315 | /* | 315 | /* |
316 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 316 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
317 | * limit reclaim to prevent infinite loops, if they ever occur. | 317 | * limit reclaim to prevent infinite loops, if they ever occur. |
318 | */ | 318 | */ |
319 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) | 319 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) |
320 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) | 320 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) |
321 | 321 | ||
322 | enum charge_type { | 322 | enum charge_type { |
323 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 323 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
324 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 324 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
325 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | 325 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ |
326 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | 326 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ |
327 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ | 327 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ |
328 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ | 328 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ |
329 | NR_CHARGE_TYPE, | 329 | NR_CHARGE_TYPE, |
330 | }; | 330 | }; |
331 | 331 | ||
332 | /* for encoding cft->private value on file */ | 332 | /* for encoding cft->private value on file */ |
333 | #define _MEM (0) | 333 | #define _MEM (0) |
334 | #define _MEMSWAP (1) | 334 | #define _MEMSWAP (1) |
335 | #define _OOM_TYPE (2) | 335 | #define _OOM_TYPE (2) |
336 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | 336 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) |
337 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 337 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
338 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 338 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
339 | /* Used for OOM nofiier */ | 339 | /* Used for OOM nofiier */ |
340 | #define OOM_CONTROL (0) | 340 | #define OOM_CONTROL (0) |
341 | 341 | ||
342 | /* | 342 | /* |
343 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | 343 | * Reclaim flags for mem_cgroup_hierarchical_reclaim |
344 | */ | 344 | */ |
345 | #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 | 345 | #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 |
346 | #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) | 346 | #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) |
347 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 | 347 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 |
348 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) | 348 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) |
349 | #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 | 349 | #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 |
350 | #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) | 350 | #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) |
351 | 351 | ||
352 | static void mem_cgroup_get(struct mem_cgroup *mem); | 352 | static void mem_cgroup_get(struct mem_cgroup *mem); |
353 | static void mem_cgroup_put(struct mem_cgroup *mem); | 353 | static void mem_cgroup_put(struct mem_cgroup *mem); |
354 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 354 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
355 | static void drain_all_stock_async(void); | 355 | static void drain_all_stock_async(void); |
356 | 356 | ||
357 | static struct mem_cgroup_per_zone * | 357 | static struct mem_cgroup_per_zone * |
358 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 358 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
359 | { | 359 | { |
360 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 360 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; |
361 | } | 361 | } |
362 | 362 | ||
363 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | 363 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) |
364 | { | 364 | { |
365 | return &mem->css; | 365 | return &mem->css; |
366 | } | 366 | } |
367 | 367 | ||
368 | static struct mem_cgroup_per_zone * | 368 | static struct mem_cgroup_per_zone * |
369 | page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) | 369 | page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) |
370 | { | 370 | { |
371 | int nid = page_to_nid(page); | 371 | int nid = page_to_nid(page); |
372 | int zid = page_zonenum(page); | 372 | int zid = page_zonenum(page); |
373 | 373 | ||
374 | return mem_cgroup_zoneinfo(mem, nid, zid); | 374 | return mem_cgroup_zoneinfo(mem, nid, zid); |
375 | } | 375 | } |
376 | 376 | ||
377 | static struct mem_cgroup_tree_per_zone * | 377 | static struct mem_cgroup_tree_per_zone * |
378 | soft_limit_tree_node_zone(int nid, int zid) | 378 | soft_limit_tree_node_zone(int nid, int zid) |
379 | { | 379 | { |
380 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | 380 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; |
381 | } | 381 | } |
382 | 382 | ||
383 | static struct mem_cgroup_tree_per_zone * | 383 | static struct mem_cgroup_tree_per_zone * |
384 | soft_limit_tree_from_page(struct page *page) | 384 | soft_limit_tree_from_page(struct page *page) |
385 | { | 385 | { |
386 | int nid = page_to_nid(page); | 386 | int nid = page_to_nid(page); |
387 | int zid = page_zonenum(page); | 387 | int zid = page_zonenum(page); |
388 | 388 | ||
389 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | 389 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; |
390 | } | 390 | } |
391 | 391 | ||
392 | static void | 392 | static void |
393 | __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, | 393 | __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, |
394 | struct mem_cgroup_per_zone *mz, | 394 | struct mem_cgroup_per_zone *mz, |
395 | struct mem_cgroup_tree_per_zone *mctz, | 395 | struct mem_cgroup_tree_per_zone *mctz, |
396 | unsigned long long new_usage_in_excess) | 396 | unsigned long long new_usage_in_excess) |
397 | { | 397 | { |
398 | struct rb_node **p = &mctz->rb_root.rb_node; | 398 | struct rb_node **p = &mctz->rb_root.rb_node; |
399 | struct rb_node *parent = NULL; | 399 | struct rb_node *parent = NULL; |
400 | struct mem_cgroup_per_zone *mz_node; | 400 | struct mem_cgroup_per_zone *mz_node; |
401 | 401 | ||
402 | if (mz->on_tree) | 402 | if (mz->on_tree) |
403 | return; | 403 | return; |
404 | 404 | ||
405 | mz->usage_in_excess = new_usage_in_excess; | 405 | mz->usage_in_excess = new_usage_in_excess; |
406 | if (!mz->usage_in_excess) | 406 | if (!mz->usage_in_excess) |
407 | return; | 407 | return; |
408 | while (*p) { | 408 | while (*p) { |
409 | parent = *p; | 409 | parent = *p; |
410 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | 410 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, |
411 | tree_node); | 411 | tree_node); |
412 | if (mz->usage_in_excess < mz_node->usage_in_excess) | 412 | if (mz->usage_in_excess < mz_node->usage_in_excess) |
413 | p = &(*p)->rb_left; | 413 | p = &(*p)->rb_left; |
414 | /* | 414 | /* |
415 | * We can't avoid mem cgroups that are over their soft | 415 | * We can't avoid mem cgroups that are over their soft |
416 | * limit by the same amount | 416 | * limit by the same amount |
417 | */ | 417 | */ |
418 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) | 418 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) |
419 | p = &(*p)->rb_right; | 419 | p = &(*p)->rb_right; |
420 | } | 420 | } |
421 | rb_link_node(&mz->tree_node, parent, p); | 421 | rb_link_node(&mz->tree_node, parent, p); |
422 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | 422 | rb_insert_color(&mz->tree_node, &mctz->rb_root); |
423 | mz->on_tree = true; | 423 | mz->on_tree = true; |
424 | } | 424 | } |
425 | 425 | ||
426 | static void | 426 | static void |
427 | __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | 427 | __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, |
428 | struct mem_cgroup_per_zone *mz, | 428 | struct mem_cgroup_per_zone *mz, |
429 | struct mem_cgroup_tree_per_zone *mctz) | 429 | struct mem_cgroup_tree_per_zone *mctz) |
430 | { | 430 | { |
431 | if (!mz->on_tree) | 431 | if (!mz->on_tree) |
432 | return; | 432 | return; |
433 | rb_erase(&mz->tree_node, &mctz->rb_root); | 433 | rb_erase(&mz->tree_node, &mctz->rb_root); |
434 | mz->on_tree = false; | 434 | mz->on_tree = false; |
435 | } | 435 | } |
436 | 436 | ||
437 | static void | 437 | static void |
438 | mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | 438 | mem_cgroup_remove_exceeded(struct mem_cgroup *mem, |
439 | struct mem_cgroup_per_zone *mz, | 439 | struct mem_cgroup_per_zone *mz, |
440 | struct mem_cgroup_tree_per_zone *mctz) | 440 | struct mem_cgroup_tree_per_zone *mctz) |
441 | { | 441 | { |
442 | spin_lock(&mctz->lock); | 442 | spin_lock(&mctz->lock); |
443 | __mem_cgroup_remove_exceeded(mem, mz, mctz); | 443 | __mem_cgroup_remove_exceeded(mem, mz, mctz); |
444 | spin_unlock(&mctz->lock); | 444 | spin_unlock(&mctz->lock); |
445 | } | 445 | } |
446 | 446 | ||
447 | 447 | ||
448 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | 448 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) |
449 | { | 449 | { |
450 | unsigned long long excess; | 450 | unsigned long long excess; |
451 | struct mem_cgroup_per_zone *mz; | 451 | struct mem_cgroup_per_zone *mz; |
452 | struct mem_cgroup_tree_per_zone *mctz; | 452 | struct mem_cgroup_tree_per_zone *mctz; |
453 | int nid = page_to_nid(page); | 453 | int nid = page_to_nid(page); |
454 | int zid = page_zonenum(page); | 454 | int zid = page_zonenum(page); |
455 | mctz = soft_limit_tree_from_page(page); | 455 | mctz = soft_limit_tree_from_page(page); |
456 | 456 | ||
457 | /* | 457 | /* |
458 | * Necessary to update all ancestors when hierarchy is used. | 458 | * Necessary to update all ancestors when hierarchy is used. |
459 | * because their event counter is not touched. | 459 | * because their event counter is not touched. |
460 | */ | 460 | */ |
461 | for (; mem; mem = parent_mem_cgroup(mem)) { | 461 | for (; mem; mem = parent_mem_cgroup(mem)) { |
462 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | 462 | mz = mem_cgroup_zoneinfo(mem, nid, zid); |
463 | excess = res_counter_soft_limit_excess(&mem->res); | 463 | excess = res_counter_soft_limit_excess(&mem->res); |
464 | /* | 464 | /* |
465 | * We have to update the tree if mz is on RB-tree or | 465 | * We have to update the tree if mz is on RB-tree or |
466 | * mem is over its softlimit. | 466 | * mem is over its softlimit. |
467 | */ | 467 | */ |
468 | if (excess || mz->on_tree) { | 468 | if (excess || mz->on_tree) { |
469 | spin_lock(&mctz->lock); | 469 | spin_lock(&mctz->lock); |
470 | /* if on-tree, remove it */ | 470 | /* if on-tree, remove it */ |
471 | if (mz->on_tree) | 471 | if (mz->on_tree) |
472 | __mem_cgroup_remove_exceeded(mem, mz, mctz); | 472 | __mem_cgroup_remove_exceeded(mem, mz, mctz); |
473 | /* | 473 | /* |
474 | * Insert again. mz->usage_in_excess will be updated. | 474 | * Insert again. mz->usage_in_excess will be updated. |
475 | * If excess is 0, no tree ops. | 475 | * If excess is 0, no tree ops. |
476 | */ | 476 | */ |
477 | __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); | 477 | __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); |
478 | spin_unlock(&mctz->lock); | 478 | spin_unlock(&mctz->lock); |
479 | } | 479 | } |
480 | } | 480 | } |
481 | } | 481 | } |
482 | 482 | ||
483 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) | 483 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) |
484 | { | 484 | { |
485 | int node, zone; | 485 | int node, zone; |
486 | struct mem_cgroup_per_zone *mz; | 486 | struct mem_cgroup_per_zone *mz; |
487 | struct mem_cgroup_tree_per_zone *mctz; | 487 | struct mem_cgroup_tree_per_zone *mctz; |
488 | 488 | ||
489 | for_each_node_state(node, N_POSSIBLE) { | 489 | for_each_node_state(node, N_POSSIBLE) { |
490 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 490 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
491 | mz = mem_cgroup_zoneinfo(mem, node, zone); | 491 | mz = mem_cgroup_zoneinfo(mem, node, zone); |
492 | mctz = soft_limit_tree_node_zone(node, zone); | 492 | mctz = soft_limit_tree_node_zone(node, zone); |
493 | mem_cgroup_remove_exceeded(mem, mz, mctz); | 493 | mem_cgroup_remove_exceeded(mem, mz, mctz); |
494 | } | 494 | } |
495 | } | 495 | } |
496 | } | 496 | } |
497 | 497 | ||
498 | static struct mem_cgroup_per_zone * | 498 | static struct mem_cgroup_per_zone * |
499 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | 499 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) |
500 | { | 500 | { |
501 | struct rb_node *rightmost = NULL; | 501 | struct rb_node *rightmost = NULL; |
502 | struct mem_cgroup_per_zone *mz; | 502 | struct mem_cgroup_per_zone *mz; |
503 | 503 | ||
504 | retry: | 504 | retry: |
505 | mz = NULL; | 505 | mz = NULL; |
506 | rightmost = rb_last(&mctz->rb_root); | 506 | rightmost = rb_last(&mctz->rb_root); |
507 | if (!rightmost) | 507 | if (!rightmost) |
508 | goto done; /* Nothing to reclaim from */ | 508 | goto done; /* Nothing to reclaim from */ |
509 | 509 | ||
510 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | 510 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); |
511 | /* | 511 | /* |
512 | * Remove the node now but someone else can add it back, | 512 | * Remove the node now but someone else can add it back, |
513 | * we will to add it back at the end of reclaim to its correct | 513 | * we will to add it back at the end of reclaim to its correct |
514 | * position in the tree. | 514 | * position in the tree. |
515 | */ | 515 | */ |
516 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | 516 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); |
517 | if (!res_counter_soft_limit_excess(&mz->mem->res) || | 517 | if (!res_counter_soft_limit_excess(&mz->mem->res) || |
518 | !css_tryget(&mz->mem->css)) | 518 | !css_tryget(&mz->mem->css)) |
519 | goto retry; | 519 | goto retry; |
520 | done: | 520 | done: |
521 | return mz; | 521 | return mz; |
522 | } | 522 | } |
523 | 523 | ||
524 | static struct mem_cgroup_per_zone * | 524 | static struct mem_cgroup_per_zone * |
525 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | 525 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) |
526 | { | 526 | { |
527 | struct mem_cgroup_per_zone *mz; | 527 | struct mem_cgroup_per_zone *mz; |
528 | 528 | ||
529 | spin_lock(&mctz->lock); | 529 | spin_lock(&mctz->lock); |
530 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | 530 | mz = __mem_cgroup_largest_soft_limit_node(mctz); |
531 | spin_unlock(&mctz->lock); | 531 | spin_unlock(&mctz->lock); |
532 | return mz; | 532 | return mz; |
533 | } | 533 | } |
534 | 534 | ||
535 | /* | 535 | /* |
536 | * Implementation Note: reading percpu statistics for memcg. | 536 | * Implementation Note: reading percpu statistics for memcg. |
537 | * | 537 | * |
538 | * Both of vmstat[] and percpu_counter has threshold and do periodic | 538 | * Both of vmstat[] and percpu_counter has threshold and do periodic |
539 | * synchronization to implement "quick" read. There are trade-off between | 539 | * synchronization to implement "quick" read. There are trade-off between |
540 | * reading cost and precision of value. Then, we may have a chance to implement | 540 | * reading cost and precision of value. Then, we may have a chance to implement |
541 | * a periodic synchronizion of counter in memcg's counter. | 541 | * a periodic synchronizion of counter in memcg's counter. |
542 | * | 542 | * |
543 | * But this _read() function is used for user interface now. The user accounts | 543 | * But this _read() function is used for user interface now. The user accounts |
544 | * memory usage by memory cgroup and he _always_ requires exact value because | 544 | * memory usage by memory cgroup and he _always_ requires exact value because |
545 | * he accounts memory. Even if we provide quick-and-fuzzy read, we always | 545 | * he accounts memory. Even if we provide quick-and-fuzzy read, we always |
546 | * have to visit all online cpus and make sum. So, for now, unnecessary | 546 | * have to visit all online cpus and make sum. So, for now, unnecessary |
547 | * synchronization is not implemented. (just implemented for cpu hotplug) | 547 | * synchronization is not implemented. (just implemented for cpu hotplug) |
548 | * | 548 | * |
549 | * If there are kernel internal actions which can make use of some not-exact | 549 | * If there are kernel internal actions which can make use of some not-exact |
550 | * value, and reading all cpu value can be performance bottleneck in some | 550 | * value, and reading all cpu value can be performance bottleneck in some |
551 | * common workload, threashold and synchonization as vmstat[] should be | 551 | * common workload, threashold and synchonization as vmstat[] should be |
552 | * implemented. | 552 | * implemented. |
553 | */ | 553 | */ |
554 | static long mem_cgroup_read_stat(struct mem_cgroup *mem, | 554 | static long mem_cgroup_read_stat(struct mem_cgroup *mem, |
555 | enum mem_cgroup_stat_index idx) | 555 | enum mem_cgroup_stat_index idx) |
556 | { | 556 | { |
557 | long val = 0; | 557 | long val = 0; |
558 | int cpu; | 558 | int cpu; |
559 | 559 | ||
560 | get_online_cpus(); | 560 | get_online_cpus(); |
561 | for_each_online_cpu(cpu) | 561 | for_each_online_cpu(cpu) |
562 | val += per_cpu(mem->stat->count[idx], cpu); | 562 | val += per_cpu(mem->stat->count[idx], cpu); |
563 | #ifdef CONFIG_HOTPLUG_CPU | 563 | #ifdef CONFIG_HOTPLUG_CPU |
564 | spin_lock(&mem->pcp_counter_lock); | 564 | spin_lock(&mem->pcp_counter_lock); |
565 | val += mem->nocpu_base.count[idx]; | 565 | val += mem->nocpu_base.count[idx]; |
566 | spin_unlock(&mem->pcp_counter_lock); | 566 | spin_unlock(&mem->pcp_counter_lock); |
567 | #endif | 567 | #endif |
568 | put_online_cpus(); | 568 | put_online_cpus(); |
569 | return val; | 569 | return val; |
570 | } | 570 | } |
571 | 571 | ||
572 | static long mem_cgroup_local_usage(struct mem_cgroup *mem) | 572 | static long mem_cgroup_local_usage(struct mem_cgroup *mem) |
573 | { | 573 | { |
574 | long ret; | 574 | long ret; |
575 | 575 | ||
576 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | 576 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); |
577 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | 577 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); |
578 | return ret; | 578 | return ret; |
579 | } | 579 | } |
580 | 580 | ||
581 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | 581 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, |
582 | bool charge) | 582 | bool charge) |
583 | { | 583 | { |
584 | int val = (charge) ? 1 : -1; | 584 | int val = (charge) ? 1 : -1; |
585 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); | 585 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
586 | } | 586 | } |
587 | 587 | ||
588 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, | 588 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, |
589 | enum mem_cgroup_events_index idx) | 589 | enum mem_cgroup_events_index idx) |
590 | { | 590 | { |
591 | unsigned long val = 0; | 591 | unsigned long val = 0; |
592 | int cpu; | 592 | int cpu; |
593 | 593 | ||
594 | for_each_online_cpu(cpu) | 594 | for_each_online_cpu(cpu) |
595 | val += per_cpu(mem->stat->events[idx], cpu); | 595 | val += per_cpu(mem->stat->events[idx], cpu); |
596 | #ifdef CONFIG_HOTPLUG_CPU | 596 | #ifdef CONFIG_HOTPLUG_CPU |
597 | spin_lock(&mem->pcp_counter_lock); | 597 | spin_lock(&mem->pcp_counter_lock); |
598 | val += mem->nocpu_base.events[idx]; | 598 | val += mem->nocpu_base.events[idx]; |
599 | spin_unlock(&mem->pcp_counter_lock); | 599 | spin_unlock(&mem->pcp_counter_lock); |
600 | #endif | 600 | #endif |
601 | return val; | 601 | return val; |
602 | } | 602 | } |
603 | 603 | ||
604 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 604 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
605 | bool file, int nr_pages) | 605 | bool file, int nr_pages) |
606 | { | 606 | { |
607 | preempt_disable(); | 607 | preempt_disable(); |
608 | 608 | ||
609 | if (file) | 609 | if (file) |
610 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); | 610 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); |
611 | else | 611 | else |
612 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); | 612 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); |
613 | 613 | ||
614 | /* pagein of a big page is an event. So, ignore page size */ | 614 | /* pagein of a big page is an event. So, ignore page size */ |
615 | if (nr_pages > 0) | 615 | if (nr_pages > 0) |
616 | __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); | 616 | __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); |
617 | else { | 617 | else { |
618 | __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); | 618 | __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); |
619 | nr_pages = -nr_pages; /* for event */ | 619 | nr_pages = -nr_pages; /* for event */ |
620 | } | 620 | } |
621 | 621 | ||
622 | __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); | 622 | __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); |
623 | 623 | ||
624 | preempt_enable(); | 624 | preempt_enable(); |
625 | } | 625 | } |
626 | 626 | ||
627 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 627 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
628 | enum lru_list idx) | 628 | enum lru_list idx) |
629 | { | 629 | { |
630 | int nid, zid; | 630 | int nid, zid; |
631 | struct mem_cgroup_per_zone *mz; | 631 | struct mem_cgroup_per_zone *mz; |
632 | u64 total = 0; | 632 | u64 total = 0; |
633 | 633 | ||
634 | for_each_online_node(nid) | 634 | for_each_online_node(nid) |
635 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 635 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
636 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | 636 | mz = mem_cgroup_zoneinfo(mem, nid, zid); |
637 | total += MEM_CGROUP_ZSTAT(mz, idx); | 637 | total += MEM_CGROUP_ZSTAT(mz, idx); |
638 | } | 638 | } |
639 | return total; | 639 | return total; |
640 | } | 640 | } |
641 | 641 | ||
642 | static bool __memcg_event_check(struct mem_cgroup *mem, int target) | 642 | static bool __memcg_event_check(struct mem_cgroup *mem, int target) |
643 | { | 643 | { |
644 | unsigned long val, next; | 644 | unsigned long val, next; |
645 | 645 | ||
646 | val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); | 646 | val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); |
647 | next = this_cpu_read(mem->stat->targets[target]); | 647 | next = this_cpu_read(mem->stat->targets[target]); |
648 | /* from time_after() in jiffies.h */ | 648 | /* from time_after() in jiffies.h */ |
649 | return ((long)next - (long)val < 0); | 649 | return ((long)next - (long)val < 0); |
650 | } | 650 | } |
651 | 651 | ||
652 | static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) | 652 | static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) |
653 | { | 653 | { |
654 | unsigned long val, next; | 654 | unsigned long val, next; |
655 | 655 | ||
656 | val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); | 656 | val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); |
657 | 657 | ||
658 | switch (target) { | 658 | switch (target) { |
659 | case MEM_CGROUP_TARGET_THRESH: | 659 | case MEM_CGROUP_TARGET_THRESH: |
660 | next = val + THRESHOLDS_EVENTS_TARGET; | 660 | next = val + THRESHOLDS_EVENTS_TARGET; |
661 | break; | 661 | break; |
662 | case MEM_CGROUP_TARGET_SOFTLIMIT: | 662 | case MEM_CGROUP_TARGET_SOFTLIMIT: |
663 | next = val + SOFTLIMIT_EVENTS_TARGET; | 663 | next = val + SOFTLIMIT_EVENTS_TARGET; |
664 | break; | 664 | break; |
665 | default: | 665 | default: |
666 | return; | 666 | return; |
667 | } | 667 | } |
668 | 668 | ||
669 | this_cpu_write(mem->stat->targets[target], next); | 669 | this_cpu_write(mem->stat->targets[target], next); |
670 | } | 670 | } |
671 | 671 | ||
672 | /* | 672 | /* |
673 | * Check events in order. | 673 | * Check events in order. |
674 | * | 674 | * |
675 | */ | 675 | */ |
676 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | 676 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) |
677 | { | 677 | { |
678 | /* threshold event is triggered in finer grain than soft limit */ | 678 | /* threshold event is triggered in finer grain than soft limit */ |
679 | if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { | 679 | if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { |
680 | mem_cgroup_threshold(mem); | 680 | mem_cgroup_threshold(mem); |
681 | __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); | 681 | __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); |
682 | if (unlikely(__memcg_event_check(mem, | 682 | if (unlikely(__memcg_event_check(mem, |
683 | MEM_CGROUP_TARGET_SOFTLIMIT))){ | 683 | MEM_CGROUP_TARGET_SOFTLIMIT))){ |
684 | mem_cgroup_update_tree(mem, page); | 684 | mem_cgroup_update_tree(mem, page); |
685 | __mem_cgroup_target_update(mem, | 685 | __mem_cgroup_target_update(mem, |
686 | MEM_CGROUP_TARGET_SOFTLIMIT); | 686 | MEM_CGROUP_TARGET_SOFTLIMIT); |
687 | } | 687 | } |
688 | } | 688 | } |
689 | } | 689 | } |
690 | 690 | ||
691 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 691 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
692 | { | 692 | { |
693 | return container_of(cgroup_subsys_state(cont, | 693 | return container_of(cgroup_subsys_state(cont, |
694 | mem_cgroup_subsys_id), struct mem_cgroup, | 694 | mem_cgroup_subsys_id), struct mem_cgroup, |
695 | css); | 695 | css); |
696 | } | 696 | } |
697 | 697 | ||
698 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | 698 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
699 | { | 699 | { |
700 | /* | 700 | /* |
701 | * mm_update_next_owner() may clear mm->owner to NULL | 701 | * mm_update_next_owner() may clear mm->owner to NULL |
702 | * if it races with swapoff, page migration, etc. | 702 | * if it races with swapoff, page migration, etc. |
703 | * So this can be called with p == NULL. | 703 | * So this can be called with p == NULL. |
704 | */ | 704 | */ |
705 | if (unlikely(!p)) | 705 | if (unlikely(!p)) |
706 | return NULL; | 706 | return NULL; |
707 | 707 | ||
708 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | 708 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), |
709 | struct mem_cgroup, css); | 709 | struct mem_cgroup, css); |
710 | } | 710 | } |
711 | 711 | ||
712 | static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 712 | static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
713 | { | 713 | { |
714 | struct mem_cgroup *mem = NULL; | 714 | struct mem_cgroup *mem = NULL; |
715 | 715 | ||
716 | if (!mm) | 716 | if (!mm) |
717 | return NULL; | 717 | return NULL; |
718 | /* | 718 | /* |
719 | * Because we have no locks, mm->owner's may be being moved to other | 719 | * Because we have no locks, mm->owner's may be being moved to other |
720 | * cgroup. We use css_tryget() here even if this looks | 720 | * cgroup. We use css_tryget() here even if this looks |
721 | * pessimistic (rather than adding locks here). | 721 | * pessimistic (rather than adding locks here). |
722 | */ | 722 | */ |
723 | rcu_read_lock(); | 723 | rcu_read_lock(); |
724 | do { | 724 | do { |
725 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 725 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
726 | if (unlikely(!mem)) | 726 | if (unlikely(!mem)) |
727 | break; | 727 | break; |
728 | } while (!css_tryget(&mem->css)); | 728 | } while (!css_tryget(&mem->css)); |
729 | rcu_read_unlock(); | 729 | rcu_read_unlock(); |
730 | return mem; | 730 | return mem; |
731 | } | 731 | } |
732 | 732 | ||
733 | /* The caller has to guarantee "mem" exists before calling this */ | 733 | /* The caller has to guarantee "mem" exists before calling this */ |
734 | static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) | 734 | static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) |
735 | { | 735 | { |
736 | struct cgroup_subsys_state *css; | 736 | struct cgroup_subsys_state *css; |
737 | int found; | 737 | int found; |
738 | 738 | ||
739 | if (!mem) /* ROOT cgroup has the smallest ID */ | 739 | if (!mem) /* ROOT cgroup has the smallest ID */ |
740 | return root_mem_cgroup; /*css_put/get against root is ignored*/ | 740 | return root_mem_cgroup; /*css_put/get against root is ignored*/ |
741 | if (!mem->use_hierarchy) { | 741 | if (!mem->use_hierarchy) { |
742 | if (css_tryget(&mem->css)) | 742 | if (css_tryget(&mem->css)) |
743 | return mem; | 743 | return mem; |
744 | return NULL; | 744 | return NULL; |
745 | } | 745 | } |
746 | rcu_read_lock(); | 746 | rcu_read_lock(); |
747 | /* | 747 | /* |
748 | * searching a memory cgroup which has the smallest ID under given | 748 | * searching a memory cgroup which has the smallest ID under given |
749 | * ROOT cgroup. (ID >= 1) | 749 | * ROOT cgroup. (ID >= 1) |
750 | */ | 750 | */ |
751 | css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); | 751 | css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); |
752 | if (css && css_tryget(css)) | 752 | if (css && css_tryget(css)) |
753 | mem = container_of(css, struct mem_cgroup, css); | 753 | mem = container_of(css, struct mem_cgroup, css); |
754 | else | 754 | else |
755 | mem = NULL; | 755 | mem = NULL; |
756 | rcu_read_unlock(); | 756 | rcu_read_unlock(); |
757 | return mem; | 757 | return mem; |
758 | } | 758 | } |
759 | 759 | ||
760 | static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, | 760 | static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, |
761 | struct mem_cgroup *root, | 761 | struct mem_cgroup *root, |
762 | bool cond) | 762 | bool cond) |
763 | { | 763 | { |
764 | int nextid = css_id(&iter->css) + 1; | 764 | int nextid = css_id(&iter->css) + 1; |
765 | int found; | 765 | int found; |
766 | int hierarchy_used; | 766 | int hierarchy_used; |
767 | struct cgroup_subsys_state *css; | 767 | struct cgroup_subsys_state *css; |
768 | 768 | ||
769 | hierarchy_used = iter->use_hierarchy; | 769 | hierarchy_used = iter->use_hierarchy; |
770 | 770 | ||
771 | css_put(&iter->css); | 771 | css_put(&iter->css); |
772 | /* If no ROOT, walk all, ignore hierarchy */ | 772 | /* If no ROOT, walk all, ignore hierarchy */ |
773 | if (!cond || (root && !hierarchy_used)) | 773 | if (!cond || (root && !hierarchy_used)) |
774 | return NULL; | 774 | return NULL; |
775 | 775 | ||
776 | if (!root) | 776 | if (!root) |
777 | root = root_mem_cgroup; | 777 | root = root_mem_cgroup; |
778 | 778 | ||
779 | do { | 779 | do { |
780 | iter = NULL; | 780 | iter = NULL; |
781 | rcu_read_lock(); | 781 | rcu_read_lock(); |
782 | 782 | ||
783 | css = css_get_next(&mem_cgroup_subsys, nextid, | 783 | css = css_get_next(&mem_cgroup_subsys, nextid, |
784 | &root->css, &found); | 784 | &root->css, &found); |
785 | if (css && css_tryget(css)) | 785 | if (css && css_tryget(css)) |
786 | iter = container_of(css, struct mem_cgroup, css); | 786 | iter = container_of(css, struct mem_cgroup, css); |
787 | rcu_read_unlock(); | 787 | rcu_read_unlock(); |
788 | /* If css is NULL, no more cgroups will be found */ | 788 | /* If css is NULL, no more cgroups will be found */ |
789 | nextid = found + 1; | 789 | nextid = found + 1; |
790 | } while (css && !iter); | 790 | } while (css && !iter); |
791 | 791 | ||
792 | return iter; | 792 | return iter; |
793 | } | 793 | } |
794 | /* | 794 | /* |
795 | * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please | 795 | * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please |
796 | * be careful that "break" loop is not allowed. We have reference count. | 796 | * be careful that "break" loop is not allowed. We have reference count. |
797 | * Instead of that modify "cond" to be false and "continue" to exit the loop. | 797 | * Instead of that modify "cond" to be false and "continue" to exit the loop. |
798 | */ | 798 | */ |
799 | #define for_each_mem_cgroup_tree_cond(iter, root, cond) \ | 799 | #define for_each_mem_cgroup_tree_cond(iter, root, cond) \ |
800 | for (iter = mem_cgroup_start_loop(root);\ | 800 | for (iter = mem_cgroup_start_loop(root);\ |
801 | iter != NULL;\ | 801 | iter != NULL;\ |
802 | iter = mem_cgroup_get_next(iter, root, cond)) | 802 | iter = mem_cgroup_get_next(iter, root, cond)) |
803 | 803 | ||
804 | #define for_each_mem_cgroup_tree(iter, root) \ | 804 | #define for_each_mem_cgroup_tree(iter, root) \ |
805 | for_each_mem_cgroup_tree_cond(iter, root, true) | 805 | for_each_mem_cgroup_tree_cond(iter, root, true) |
806 | 806 | ||
807 | #define for_each_mem_cgroup_all(iter) \ | 807 | #define for_each_mem_cgroup_all(iter) \ |
808 | for_each_mem_cgroup_tree_cond(iter, NULL, true) | 808 | for_each_mem_cgroup_tree_cond(iter, NULL, true) |
809 | 809 | ||
810 | 810 | ||
811 | static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) | 811 | static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) |
812 | { | 812 | { |
813 | return (mem == root_mem_cgroup); | 813 | return (mem == root_mem_cgroup); |
814 | } | 814 | } |
815 | 815 | ||
816 | /* | 816 | /* |
817 | * Following LRU functions are allowed to be used without PCG_LOCK. | 817 | * Following LRU functions are allowed to be used without PCG_LOCK. |
818 | * These operations are called by the global LRU routines independently of memcg. | 818 | * These operations are called by the global LRU routines independently of memcg. |
819 | * What we have to take care of here is the validity of pc->mem_cgroup. | 819 | * What we have to take care of here is the validity of pc->mem_cgroup. |
820 | * | 820 | * |
821 | * Changes to pc->mem_cgroup happen when | 821 | * Changes to pc->mem_cgroup happen when |
822 | * 1. charge | 822 | * 1. charge |
823 | * 2. moving account | 823 | * 2. moving account |
824 | * In the typical case, "charge" is done before add-to-LRU. The exception is SwapCache, | 824 | * In the typical case, "charge" is done before add-to-LRU. The exception is SwapCache, |
825 | * which is added to the LRU before it is charged. | 825 | * which is added to the LRU before it is charged. |
826 | * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. | 826 | * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. |
827 | * When moving account, the page is not on LRU. It's isolated. | 827 | * When moving account, the page is not on LRU. It's isolated. |
828 | */ | 828 | */ |
829 | 829 | ||
830 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | 830 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) |
831 | { | 831 | { |
832 | struct page_cgroup *pc; | 832 | struct page_cgroup *pc; |
833 | struct mem_cgroup_per_zone *mz; | 833 | struct mem_cgroup_per_zone *mz; |
834 | 834 | ||
835 | if (mem_cgroup_disabled()) | 835 | if (mem_cgroup_disabled()) |
836 | return; | 836 | return; |
837 | pc = lookup_page_cgroup(page); | 837 | pc = lookup_page_cgroup(page); |
838 | /* can happen while we handle swapcache. */ | 838 | /* can happen while we handle swapcache. */ |
839 | if (!TestClearPageCgroupAcctLRU(pc)) | 839 | if (!TestClearPageCgroupAcctLRU(pc)) |
840 | return; | 840 | return; |
841 | VM_BUG_ON(!pc->mem_cgroup); | 841 | VM_BUG_ON(!pc->mem_cgroup); |
842 | /* | 842 | /* |
843 | * We don't check PCG_USED bit. It's cleared when the "page" is finally | 843 | * We don't check PCG_USED bit. It's cleared when the "page" is finally |
844 | * removed from global LRU. | 844 | * removed from global LRU. |
845 | */ | 845 | */ |
846 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | 846 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
847 | /* huge page split is done under lru_lock. so, we have no races. */ | 847 | /* huge page split is done under lru_lock. so, we have no races. */ |
848 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); | 848 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); |
849 | if (mem_cgroup_is_root(pc->mem_cgroup)) | 849 | if (mem_cgroup_is_root(pc->mem_cgroup)) |
850 | return; | 850 | return; |
851 | VM_BUG_ON(list_empty(&pc->lru)); | 851 | VM_BUG_ON(list_empty(&pc->lru)); |
852 | list_del_init(&pc->lru); | 852 | list_del_init(&pc->lru); |
853 | } | 853 | } |
854 | 854 | ||
855 | void mem_cgroup_del_lru(struct page *page) | 855 | void mem_cgroup_del_lru(struct page *page) |
856 | { | 856 | { |
857 | mem_cgroup_del_lru_list(page, page_lru(page)); | 857 | mem_cgroup_del_lru_list(page, page_lru(page)); |
858 | } | 858 | } |
859 | 859 | ||
860 | /* | 860 | /* |
861 | * Writeback is about to end against a page which has been marked for immediate | 861 | * Writeback is about to end against a page which has been marked for immediate |
862 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | 862 | * reclaim. If it still appears to be reclaimable, move it to the tail of the |
863 | * inactive list. | 863 | * inactive list. |
864 | */ | 864 | */ |
865 | void mem_cgroup_rotate_reclaimable_page(struct page *page) | 865 | void mem_cgroup_rotate_reclaimable_page(struct page *page) |
866 | { | 866 | { |
867 | struct mem_cgroup_per_zone *mz; | 867 | struct mem_cgroup_per_zone *mz; |
868 | struct page_cgroup *pc; | 868 | struct page_cgroup *pc; |
869 | enum lru_list lru = page_lru(page); | 869 | enum lru_list lru = page_lru(page); |
870 | 870 | ||
871 | if (mem_cgroup_disabled()) | 871 | if (mem_cgroup_disabled()) |
872 | return; | 872 | return; |
873 | 873 | ||
874 | pc = lookup_page_cgroup(page); | 874 | pc = lookup_page_cgroup(page); |
875 | /* unused or root page is not rotated. */ | 875 | /* unused or root page is not rotated. */ |
876 | if (!PageCgroupUsed(pc)) | 876 | if (!PageCgroupUsed(pc)) |
877 | return; | 877 | return; |
878 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | 878 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ |
879 | smp_rmb(); | 879 | smp_rmb(); |
880 | if (mem_cgroup_is_root(pc->mem_cgroup)) | 880 | if (mem_cgroup_is_root(pc->mem_cgroup)) |
881 | return; | 881 | return; |
882 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | 882 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
883 | list_move_tail(&pc->lru, &mz->lists[lru]); | 883 | list_move_tail(&pc->lru, &mz->lists[lru]); |
884 | } | 884 | } |
885 | 885 | ||
886 | void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | 886 | void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) |
887 | { | 887 | { |
888 | struct mem_cgroup_per_zone *mz; | 888 | struct mem_cgroup_per_zone *mz; |
889 | struct page_cgroup *pc; | 889 | struct page_cgroup *pc; |
890 | 890 | ||
891 | if (mem_cgroup_disabled()) | 891 | if (mem_cgroup_disabled()) |
892 | return; | 892 | return; |
893 | 893 | ||
894 | pc = lookup_page_cgroup(page); | 894 | pc = lookup_page_cgroup(page); |
895 | /* unused or root page is not rotated. */ | 895 | /* unused or root page is not rotated. */ |
896 | if (!PageCgroupUsed(pc)) | 896 | if (!PageCgroupUsed(pc)) |
897 | return; | 897 | return; |
898 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | 898 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ |
899 | smp_rmb(); | 899 | smp_rmb(); |
900 | if (mem_cgroup_is_root(pc->mem_cgroup)) | 900 | if (mem_cgroup_is_root(pc->mem_cgroup)) |
901 | return; | 901 | return; |
902 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | 902 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
903 | list_move(&pc->lru, &mz->lists[lru]); | 903 | list_move(&pc->lru, &mz->lists[lru]); |
904 | } | 904 | } |
905 | 905 | ||
906 | void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | 906 | void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) |
907 | { | 907 | { |
908 | struct page_cgroup *pc; | 908 | struct page_cgroup *pc; |
909 | struct mem_cgroup_per_zone *mz; | 909 | struct mem_cgroup_per_zone *mz; |
910 | 910 | ||
911 | if (mem_cgroup_disabled()) | 911 | if (mem_cgroup_disabled()) |
912 | return; | 912 | return; |
913 | pc = lookup_page_cgroup(page); | 913 | pc = lookup_page_cgroup(page); |
914 | VM_BUG_ON(PageCgroupAcctLRU(pc)); | 914 | VM_BUG_ON(PageCgroupAcctLRU(pc)); |
915 | if (!PageCgroupUsed(pc)) | 915 | if (!PageCgroupUsed(pc)) |
916 | return; | 916 | return; |
917 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | 917 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ |
918 | smp_rmb(); | 918 | smp_rmb(); |
919 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | 919 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
920 | /* huge page split is done under lru_lock. so, we have no races. */ | 920 | /* huge page split is done under lru_lock. so, we have no races. */ |
921 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); | 921 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); |
922 | SetPageCgroupAcctLRU(pc); | 922 | SetPageCgroupAcctLRU(pc); |
923 | if (mem_cgroup_is_root(pc->mem_cgroup)) | 923 | if (mem_cgroup_is_root(pc->mem_cgroup)) |
924 | return; | 924 | return; |
925 | list_add(&pc->lru, &mz->lists[lru]); | 925 | list_add(&pc->lru, &mz->lists[lru]); |
926 | } | 926 | } |
927 | 927 | ||
928 | /* | 928 | /* |
929 | * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed | 929 | * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed |
930 | * while it's linked to the LRU because the page may be reused after it's fully | 930 | * while it's linked to the LRU because the page may be reused after it's fully |
931 | * uncharged. To handle that, unlink page_cgroup from the LRU when charging it again. | 931 | * uncharged. To handle that, unlink page_cgroup from the LRU when charging it again. |
932 | * It's done under lock_page and it is expected that zone->lru_lock is never held. | 932 | * It's done under lock_page and it is expected that zone->lru_lock is never held. |
933 | */ | 933 | */ |
934 | static void mem_cgroup_lru_del_before_commit(struct page *page) | 934 | static void mem_cgroup_lru_del_before_commit(struct page *page) |
935 | { | 935 | { |
936 | unsigned long flags; | 936 | unsigned long flags; |
937 | struct zone *zone = page_zone(page); | 937 | struct zone *zone = page_zone(page); |
938 | struct page_cgroup *pc = lookup_page_cgroup(page); | 938 | struct page_cgroup *pc = lookup_page_cgroup(page); |
939 | 939 | ||
940 | /* | 940 | /* |
941 | * Doing this check without taking ->lru_lock seems wrong, but it | 941 | * Doing this check without taking ->lru_lock seems wrong, but it |
942 | * is safe, because if page_cgroup's USED bit is unset, the page | 942 | * is safe, because if page_cgroup's USED bit is unset, the page |
943 | * will not be added to any memcg's LRU. If page_cgroup's USED bit is | 943 | * will not be added to any memcg's LRU. If page_cgroup's USED bit is |
944 | * set, the commit after this will fail, anyway. | 944 | * set, the commit after this will fail, anyway. |
945 | * All of this charge/uncharge is done under some mutual exclusion. | 945 | * All of this charge/uncharge is done under some mutual exclusion. |
946 | * So, we don't need to take care of changes in the USED bit. | 946 | * So, we don't need to take care of changes in the USED bit. |
947 | */ | 947 | */ |
948 | if (likely(!PageLRU(page))) | 948 | if (likely(!PageLRU(page))) |
949 | return; | 949 | return; |
950 | 950 | ||
951 | spin_lock_irqsave(&zone->lru_lock, flags); | 951 | spin_lock_irqsave(&zone->lru_lock, flags); |
952 | /* | 952 | /* |
953 | * Forget old LRU when this page_cgroup is *not* used. This Used bit | 953 | * Forget old LRU when this page_cgroup is *not* used. This Used bit |
954 | * is guarded by lock_page() because the page is SwapCache. | 954 | * is guarded by lock_page() because the page is SwapCache. |
955 | */ | 955 | */ |
956 | if (!PageCgroupUsed(pc)) | 956 | if (!PageCgroupUsed(pc)) |
957 | mem_cgroup_del_lru_list(page, page_lru(page)); | 957 | mem_cgroup_del_lru_list(page, page_lru(page)); |
958 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 958 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
959 | } | 959 | } |
960 | 960 | ||
961 | static void mem_cgroup_lru_add_after_commit(struct page *page) | 961 | static void mem_cgroup_lru_add_after_commit(struct page *page) |
962 | { | 962 | { |
963 | unsigned long flags; | 963 | unsigned long flags; |
964 | struct zone *zone = page_zone(page); | 964 | struct zone *zone = page_zone(page); |
965 | struct page_cgroup *pc = lookup_page_cgroup(page); | 965 | struct page_cgroup *pc = lookup_page_cgroup(page); |
966 | 966 | ||
967 | /* take care of the case where the page is added to the LRU while we commit it */ | 967 | /* take care of the case where the page is added to the LRU while we commit it */ |
968 | if (likely(!PageLRU(page))) | 968 | if (likely(!PageLRU(page))) |
969 | return; | 969 | return; |
970 | spin_lock_irqsave(&zone->lru_lock, flags); | 970 | spin_lock_irqsave(&zone->lru_lock, flags); |
971 | /* link when the page is linked to LRU but page_cgroup isn't */ | 971 | /* link when the page is linked to LRU but page_cgroup isn't */ |
972 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) | 972 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) |
973 | mem_cgroup_add_lru_list(page, page_lru(page)); | 973 | mem_cgroup_add_lru_list(page, page_lru(page)); |
974 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 974 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
975 | } | 975 | } |
976 | 976 | ||
977 | 977 | ||
978 | void mem_cgroup_move_lists(struct page *page, | 978 | void mem_cgroup_move_lists(struct page *page, |
979 | enum lru_list from, enum lru_list to) | 979 | enum lru_list from, enum lru_list to) |
980 | { | 980 | { |
981 | if (mem_cgroup_disabled()) | 981 | if (mem_cgroup_disabled()) |
982 | return; | 982 | return; |
983 | mem_cgroup_del_lru_list(page, from); | 983 | mem_cgroup_del_lru_list(page, from); |
984 | mem_cgroup_add_lru_list(page, to); | 984 | mem_cgroup_add_lru_list(page, to); |
985 | } | 985 | } |
986 | 986 | ||
987 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 987 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
988 | { | 988 | { |
989 | int ret; | 989 | int ret; |
990 | struct mem_cgroup *curr = NULL; | 990 | struct mem_cgroup *curr = NULL; |
991 | struct task_struct *p; | 991 | struct task_struct *p; |
992 | 992 | ||
993 | p = find_lock_task_mm(task); | 993 | p = find_lock_task_mm(task); |
994 | if (!p) | 994 | if (!p) |
995 | return 0; | 995 | return 0; |
996 | curr = try_get_mem_cgroup_from_mm(p->mm); | 996 | curr = try_get_mem_cgroup_from_mm(p->mm); |
997 | task_unlock(p); | 997 | task_unlock(p); |
998 | if (!curr) | 998 | if (!curr) |
999 | return 0; | 999 | return 0; |
1000 | /* | 1000 | /* |
1001 | * We should check use_hierarchy of "mem", not "curr", because checking | 1001 | * We should check use_hierarchy of "mem", not "curr", because checking |
1002 | * use_hierarchy of "curr" here would make this function return true if hierarchy is | 1002 | * use_hierarchy of "curr" here would make this function return true if hierarchy is |
1003 | * enabled in "curr" and "curr" is a child of "mem" in the *cgroup* | 1003 | * enabled in "curr" and "curr" is a child of "mem" in the *cgroup* |
1004 | * hierarchy (even if use_hierarchy is disabled in "mem"). | 1004 | * hierarchy (even if use_hierarchy is disabled in "mem"). |
1005 | */ | 1005 | */ |
1006 | if (mem->use_hierarchy) | 1006 | if (mem->use_hierarchy) |
1007 | ret = css_is_ancestor(&curr->css, &mem->css); | 1007 | ret = css_is_ancestor(&curr->css, &mem->css); |
1008 | else | 1008 | else |
1009 | ret = (curr == mem); | 1009 | ret = (curr == mem); |
1010 | css_put(&curr->css); | 1010 | css_put(&curr->css); |
1011 | return ret; | 1011 | return ret; |
1012 | } | 1012 | } |
1013 | 1013 | ||
1014 | static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) | 1014 | static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) |
1015 | { | 1015 | { |
1016 | unsigned long active; | 1016 | unsigned long active; |
1017 | unsigned long inactive; | 1017 | unsigned long inactive; |
1018 | unsigned long gb; | 1018 | unsigned long gb; |
1019 | unsigned long inactive_ratio; | 1019 | unsigned long inactive_ratio; |
1020 | 1020 | ||
1021 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); | 1021 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); |
1022 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); | 1022 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); |
1023 | 1023 | ||
1024 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | 1024 | gb = (inactive + active) >> (30 - PAGE_SHIFT); |
1025 | if (gb) | 1025 | if (gb) |
1026 | inactive_ratio = int_sqrt(10 * gb); | 1026 | inactive_ratio = int_sqrt(10 * gb); |
1027 | else | 1027 | else |
1028 | inactive_ratio = 1; | 1028 | inactive_ratio = 1; |
1029 | 1029 | ||
1030 | if (present_pages) { | 1030 | if (present_pages) { |
1031 | present_pages[0] = inactive; | 1031 | present_pages[0] = inactive; |
1032 | present_pages[1] = active; | 1032 | present_pages[1] = active; |
1033 | } | 1033 | } |
1034 | 1034 | ||
1035 | return inactive_ratio; | 1035 | return inactive_ratio; |
1036 | } | 1036 | } |
1037 | 1037 | ||
1038 | int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) | 1038 | int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) |
1039 | { | 1039 | { |
1040 | unsigned long active; | 1040 | unsigned long active; |
1041 | unsigned long inactive; | 1041 | unsigned long inactive; |
1042 | unsigned long present_pages[2]; | 1042 | unsigned long present_pages[2]; |
1043 | unsigned long inactive_ratio; | 1043 | unsigned long inactive_ratio; |
1044 | 1044 | ||
1045 | inactive_ratio = calc_inactive_ratio(memcg, present_pages); | 1045 | inactive_ratio = calc_inactive_ratio(memcg, present_pages); |
1046 | 1046 | ||
1047 | inactive = present_pages[0]; | 1047 | inactive = present_pages[0]; |
1048 | active = present_pages[1]; | 1048 | active = present_pages[1]; |
1049 | 1049 | ||
1050 | if (inactive * inactive_ratio < active) | 1050 | if (inactive * inactive_ratio < active) |
1051 | return 1; | 1051 | return 1; |
1052 | 1052 | ||
1053 | return 0; | 1053 | return 0; |
1054 | } | 1054 | } |
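[Editor's note: worked example of the two functions above, assuming 4KB pages (PAGE_SHIFT == 12). With about 4GB of anonymous memory in the cgroup, gb = (inactive + active) >> 18 = 4, so inactive_ratio = int_sqrt(10 * 4) = 6; mem_cgroup_inactive_anon_is_low() then reports the inactive anon list as low whenever inactive * 6 < active.]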
1055 | 1055 | ||
1056 | int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) | 1056 | int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) |
1057 | { | 1057 | { |
1058 | unsigned long active; | 1058 | unsigned long active; |
1059 | unsigned long inactive; | 1059 | unsigned long inactive; |
1060 | 1060 | ||
1061 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); | 1061 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); |
1062 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); | 1062 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); |
1063 | 1063 | ||
1064 | return (active > inactive); | 1064 | return (active > inactive); |
1065 | } | 1065 | } |
1066 | 1066 | ||
1067 | unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, | 1067 | unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, |
1068 | struct zone *zone, | 1068 | struct zone *zone, |
1069 | enum lru_list lru) | 1069 | enum lru_list lru) |
1070 | { | 1070 | { |
1071 | int nid = zone_to_nid(zone); | 1071 | int nid = zone_to_nid(zone); |
1072 | int zid = zone_idx(zone); | 1072 | int zid = zone_idx(zone); |
1073 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); | 1073 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
1074 | 1074 | ||
1075 | return MEM_CGROUP_ZSTAT(mz, lru); | 1075 | return MEM_CGROUP_ZSTAT(mz, lru); |
1076 | } | 1076 | } |
1077 | 1077 | ||
1078 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, | 1078 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, |
1079 | struct zone *zone) | 1079 | struct zone *zone) |
1080 | { | 1080 | { |
1081 | int nid = zone_to_nid(zone); | 1081 | int nid = zone_to_nid(zone); |
1082 | int zid = zone_idx(zone); | 1082 | int zid = zone_idx(zone); |
1083 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); | 1083 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
1084 | 1084 | ||
1085 | return &mz->reclaim_stat; | 1085 | return &mz->reclaim_stat; |
1086 | } | 1086 | } |
1087 | 1087 | ||
1088 | struct zone_reclaim_stat * | 1088 | struct zone_reclaim_stat * |
1089 | mem_cgroup_get_reclaim_stat_from_page(struct page *page) | 1089 | mem_cgroup_get_reclaim_stat_from_page(struct page *page) |
1090 | { | 1090 | { |
1091 | struct page_cgroup *pc; | 1091 | struct page_cgroup *pc; |
1092 | struct mem_cgroup_per_zone *mz; | 1092 | struct mem_cgroup_per_zone *mz; |
1093 | 1093 | ||
1094 | if (mem_cgroup_disabled()) | 1094 | if (mem_cgroup_disabled()) |
1095 | return NULL; | 1095 | return NULL; |
1096 | 1096 | ||
1097 | pc = lookup_page_cgroup(page); | 1097 | pc = lookup_page_cgroup(page); |
1098 | if (!PageCgroupUsed(pc)) | 1098 | if (!PageCgroupUsed(pc)) |
1099 | return NULL; | 1099 | return NULL; |
1100 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | 1100 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ |
1101 | smp_rmb(); | 1101 | smp_rmb(); |
1102 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | 1102 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
1103 | return &mz->reclaim_stat; | 1103 | return &mz->reclaim_stat; |
1104 | } | 1104 | } |
1105 | 1105 | ||
1106 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | 1106 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, |
1107 | struct list_head *dst, | 1107 | struct list_head *dst, |
1108 | unsigned long *scanned, int order, | 1108 | unsigned long *scanned, int order, |
1109 | int mode, struct zone *z, | 1109 | int mode, struct zone *z, |
1110 | struct mem_cgroup *mem_cont, | 1110 | struct mem_cgroup *mem_cont, |
1111 | int active, int file) | 1111 | int active, int file) |
1112 | { | 1112 | { |
1113 | unsigned long nr_taken = 0; | 1113 | unsigned long nr_taken = 0; |
1114 | struct page *page; | 1114 | struct page *page; |
1115 | unsigned long scan; | 1115 | unsigned long scan; |
1116 | LIST_HEAD(pc_list); | 1116 | LIST_HEAD(pc_list); |
1117 | struct list_head *src; | 1117 | struct list_head *src; |
1118 | struct page_cgroup *pc, *tmp; | 1118 | struct page_cgroup *pc, *tmp; |
1119 | int nid = zone_to_nid(z); | 1119 | int nid = zone_to_nid(z); |
1120 | int zid = zone_idx(z); | 1120 | int zid = zone_idx(z); |
1121 | struct mem_cgroup_per_zone *mz; | 1121 | struct mem_cgroup_per_zone *mz; |
1122 | int lru = LRU_FILE * file + active; | 1122 | int lru = LRU_FILE * file + active; |
1123 | int ret; | 1123 | int ret; |
1124 | 1124 | ||
1125 | BUG_ON(!mem_cont); | 1125 | BUG_ON(!mem_cont); |
1126 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 1126 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); |
1127 | src = &mz->lists[lru]; | 1127 | src = &mz->lists[lru]; |
1128 | 1128 | ||
1129 | scan = 0; | 1129 | scan = 0; |
1130 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | 1130 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { |
1131 | if (scan >= nr_to_scan) | 1131 | if (scan >= nr_to_scan) |
1132 | break; | 1132 | break; |
1133 | 1133 | ||
1134 | if (unlikely(!PageCgroupUsed(pc))) | 1134 | if (unlikely(!PageCgroupUsed(pc))) |
1135 | continue; | 1135 | continue; |
1136 | 1136 | ||
1137 | page = lookup_cgroup_page(pc); | 1137 | page = lookup_cgroup_page(pc); |
1138 | 1138 | ||
1139 | if (unlikely(!PageLRU(page))) | 1139 | if (unlikely(!PageLRU(page))) |
1140 | continue; | 1140 | continue; |
1141 | 1141 | ||
1142 | scan++; | 1142 | scan++; |
1143 | ret = __isolate_lru_page(page, mode, file); | 1143 | ret = __isolate_lru_page(page, mode, file); |
1144 | switch (ret) { | 1144 | switch (ret) { |
1145 | case 0: | 1145 | case 0: |
1146 | list_move(&page->lru, dst); | 1146 | list_move(&page->lru, dst); |
1147 | mem_cgroup_del_lru(page); | 1147 | mem_cgroup_del_lru(page); |
1148 | nr_taken += hpage_nr_pages(page); | 1148 | nr_taken += hpage_nr_pages(page); |
1149 | break; | 1149 | break; |
1150 | case -EBUSY: | 1150 | case -EBUSY: |
1151 | /* we don't affect global LRU but rotate in our LRU */ | 1151 | /* we don't affect global LRU but rotate in our LRU */ |
1152 | mem_cgroup_rotate_lru_list(page, page_lru(page)); | 1152 | mem_cgroup_rotate_lru_list(page, page_lru(page)); |
1153 | break; | 1153 | break; |
1154 | default: | 1154 | default: |
1155 | break; | 1155 | break; |
1156 | } | 1156 | } |
1157 | } | 1157 | } |
1158 | 1158 | ||
1159 | *scanned = scan; | 1159 | *scanned = scan; |
1160 | 1160 | ||
1161 | trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, | 1161 | trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, |
1162 | 0, 0, 0, mode); | 1162 | 0, 0, 0, mode); |
1163 | 1163 | ||
1164 | return nr_taken; | 1164 | return nr_taken; |
1165 | } | 1165 | } |
1166 | 1166 | ||
1167 | #define mem_cgroup_from_res_counter(counter, member) \ | 1167 | #define mem_cgroup_from_res_counter(counter, member) \ |
1168 | container_of(counter, struct mem_cgroup, member) | 1168 | container_of(counter, struct mem_cgroup, member) |
1169 | 1169 | ||
1170 | /** | 1170 | /** |
1171 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup | 1171 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup |
1172 | * @mem: the memory cgroup | 1172 | * @mem: the memory cgroup |
1173 | * | 1173 | * |
1174 | * Returns the maximum amount of memory @mem can be charged with, in | 1174 | * Returns the maximum amount of memory @mem can be charged with, in |
1175 | * pages. | 1175 | * pages. |
1176 | */ | 1176 | */ |
1177 | static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) | 1177 | static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) |
1178 | { | 1178 | { |
1179 | unsigned long long margin; | 1179 | unsigned long long margin; |
1180 | 1180 | ||
1181 | margin = res_counter_margin(&mem->res); | 1181 | margin = res_counter_margin(&mem->res); |
1182 | if (do_swap_account) | 1182 | if (do_swap_account) |
1183 | margin = min(margin, res_counter_margin(&mem->memsw)); | 1183 | margin = min(margin, res_counter_margin(&mem->memsw)); |
1184 | return margin >> PAGE_SHIFT; | 1184 | return margin >> PAGE_SHIFT; |
1185 | } | 1185 | } |
1186 | 1186 | ||
1187 | static unsigned int get_swappiness(struct mem_cgroup *memcg) | 1187 | static unsigned int get_swappiness(struct mem_cgroup *memcg) |
1188 | { | 1188 | { |
1189 | struct cgroup *cgrp = memcg->css.cgroup; | 1189 | struct cgroup *cgrp = memcg->css.cgroup; |
1190 | 1190 | ||
1191 | /* root ? */ | 1191 | /* root ? */ |
1192 | if (cgrp->parent == NULL) | 1192 | if (cgrp->parent == NULL) |
1193 | return vm_swappiness; | 1193 | return vm_swappiness; |
1194 | 1194 | ||
1195 | return memcg->swappiness; | 1195 | return memcg->swappiness; |
1196 | } | 1196 | } |
1197 | 1197 | ||
1198 | static void mem_cgroup_start_move(struct mem_cgroup *mem) | 1198 | static void mem_cgroup_start_move(struct mem_cgroup *mem) |
1199 | { | 1199 | { |
1200 | int cpu; | 1200 | int cpu; |
1201 | 1201 | ||
1202 | get_online_cpus(); | 1202 | get_online_cpus(); |
1203 | spin_lock(&mem->pcp_counter_lock); | 1203 | spin_lock(&mem->pcp_counter_lock); |
1204 | for_each_online_cpu(cpu) | 1204 | for_each_online_cpu(cpu) |
1205 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; | 1205 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; |
1206 | mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; | 1206 | mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; |
1207 | spin_unlock(&mem->pcp_counter_lock); | 1207 | spin_unlock(&mem->pcp_counter_lock); |
1208 | put_online_cpus(); | 1208 | put_online_cpus(); |
1209 | 1209 | ||
1210 | synchronize_rcu(); | 1210 | synchronize_rcu(); |
1211 | } | 1211 | } |
1212 | 1212 | ||
1213 | static void mem_cgroup_end_move(struct mem_cgroup *mem) | 1213 | static void mem_cgroup_end_move(struct mem_cgroup *mem) |
1214 | { | 1214 | { |
1215 | int cpu; | 1215 | int cpu; |
1216 | 1216 | ||
1217 | if (!mem) | 1217 | if (!mem) |
1218 | return; | 1218 | return; |
1219 | get_online_cpus(); | 1219 | get_online_cpus(); |
1220 | spin_lock(&mem->pcp_counter_lock); | 1220 | spin_lock(&mem->pcp_counter_lock); |
1221 | for_each_online_cpu(cpu) | 1221 | for_each_online_cpu(cpu) |
1222 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; | 1222 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; |
1223 | mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; | 1223 | mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; |
1224 | spin_unlock(&mem->pcp_counter_lock); | 1224 | spin_unlock(&mem->pcp_counter_lock); |
1225 | put_online_cpus(); | 1225 | put_online_cpus(); |
1226 | } | 1226 | } |
1227 | /* | 1227 | /* |
1228 | * Two routines for checking whether "mem" is under move_account() or not. | 1228 | * Two routines for checking whether "mem" is under move_account() or not. |
1229 | * | 1229 | * |
1230 | * mem_cgroup_stealed() - checks whether a cgroup is mc.from or not. This is used | 1230 | * mem_cgroup_stealed() - checks whether a cgroup is mc.from or not. This is used |
1231 | * for avoiding race in accounting. If true, | 1231 | * for avoiding race in accounting. If true, |
1232 | * pc->mem_cgroup may be overwritten. | 1232 | * pc->mem_cgroup may be overwritten. |
1233 | * | 1233 | * |
1234 | * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to or | 1234 | * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to or |
1235 | * under the hierarchy of moving cgroups. This is for | 1235 | * under the hierarchy of moving cgroups. This is for |
1236 | * waiting at high memory pressure caused by "move". | 1236 | * waiting at high memory pressure caused by "move". |
1237 | */ | 1237 | */ |
1238 | 1238 | ||
1239 | static bool mem_cgroup_stealed(struct mem_cgroup *mem) | 1239 | static bool mem_cgroup_stealed(struct mem_cgroup *mem) |
1240 | { | 1240 | { |
1241 | VM_BUG_ON(!rcu_read_lock_held()); | 1241 | VM_BUG_ON(!rcu_read_lock_held()); |
1242 | return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; | 1242 | return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; |
1243 | } | 1243 | } |
1244 | 1244 | ||
1245 | static bool mem_cgroup_under_move(struct mem_cgroup *mem) | 1245 | static bool mem_cgroup_under_move(struct mem_cgroup *mem) |
1246 | { | 1246 | { |
1247 | struct mem_cgroup *from; | 1247 | struct mem_cgroup *from; |
1248 | struct mem_cgroup *to; | 1248 | struct mem_cgroup *to; |
1249 | bool ret = false; | 1249 | bool ret = false; |
1250 | /* | 1250 | /* |
1251 | * Unlike task_move routines, we access mc.to, mc.from not under | 1251 | * Unlike task_move routines, we access mc.to, mc.from not under |
1252 | * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. | 1252 | * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. |
1253 | */ | 1253 | */ |
1254 | spin_lock(&mc.lock); | 1254 | spin_lock(&mc.lock); |
1255 | from = mc.from; | 1255 | from = mc.from; |
1256 | to = mc.to; | 1256 | to = mc.to; |
1257 | if (!from) | 1257 | if (!from) |
1258 | goto unlock; | 1258 | goto unlock; |
1259 | if (from == mem || to == mem | 1259 | if (from == mem || to == mem |
1260 | || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) | 1260 | || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) |
1261 | || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) | 1261 | || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) |
1262 | ret = true; | 1262 | ret = true; |
1263 | unlock: | 1263 | unlock: |
1264 | spin_unlock(&mc.lock); | 1264 | spin_unlock(&mc.lock); |
1265 | return ret; | 1265 | return ret; |
1266 | } | 1266 | } |
1267 | 1267 | ||
1268 | static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) | 1268 | static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) |
1269 | { | 1269 | { |
1270 | if (mc.moving_task && current != mc.moving_task) { | 1270 | if (mc.moving_task && current != mc.moving_task) { |
1271 | if (mem_cgroup_under_move(mem)) { | 1271 | if (mem_cgroup_under_move(mem)) { |
1272 | DEFINE_WAIT(wait); | 1272 | DEFINE_WAIT(wait); |
1273 | prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); | 1273 | prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); |
1274 | /* moving charge context might have finished. */ | 1274 | /* moving charge context might have finished. */ |
1275 | if (mc.moving_task) | 1275 | if (mc.moving_task) |
1276 | schedule(); | 1276 | schedule(); |
1277 | finish_wait(&mc.waitq, &wait); | 1277 | finish_wait(&mc.waitq, &wait); |
1278 | return true; | 1278 | return true; |
1279 | } | 1279 | } |
1280 | } | 1280 | } |
1281 | return false; | 1281 | return false; |
1282 | } | 1282 | } |
1283 | 1283 | ||
1284 | /** | 1284 | /** |
1285 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. | 1285 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. |
1286 | * @memcg: The memory cgroup that went over limit | 1286 | * @memcg: The memory cgroup that went over limit |
1287 | * @p: Task that is going to be killed | 1287 | * @p: Task that is going to be killed |
1288 | * | 1288 | * |
1289 | * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is | 1289 | * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is |
1290 | * enabled | 1290 | * enabled |
1291 | */ | 1291 | */ |
1292 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | 1292 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) |
1293 | { | 1293 | { |
1294 | struct cgroup *task_cgrp; | 1294 | struct cgroup *task_cgrp; |
1295 | struct cgroup *mem_cgrp; | 1295 | struct cgroup *mem_cgrp; |
1296 | /* | 1296 | /* |
1297 | * Need a buffer in BSS, can't rely on allocations. The code relies | 1297 | * Need a buffer in BSS, can't rely on allocations. The code relies |
1298 | * on the assumption that OOM is serialized for memory controller. | 1298 | * on the assumption that OOM is serialized for memory controller. |
1299 | * If this assumption is broken, revisit this code. | 1299 | * If this assumption is broken, revisit this code. |
1300 | */ | 1300 | */ |
1301 | static char memcg_name[PATH_MAX]; | 1301 | static char memcg_name[PATH_MAX]; |
1302 | int ret; | 1302 | int ret; |
1303 | 1303 | ||
1304 | if (!memcg || !p) | 1304 | if (!memcg || !p) |
1305 | return; | 1305 | return; |
1306 | 1306 | ||
1307 | 1307 | ||
1308 | rcu_read_lock(); | 1308 | rcu_read_lock(); |
1309 | 1309 | ||
1310 | mem_cgrp = memcg->css.cgroup; | 1310 | mem_cgrp = memcg->css.cgroup; |
1311 | task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); | 1311 | task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); |
1312 | 1312 | ||
1313 | ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); | 1313 | ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); |
1314 | if (ret < 0) { | 1314 | if (ret < 0) { |
1315 | /* | 1315 | /* |
1316 | * Unfortunately, we are unable to convert to a useful name, | 1316 | * Unfortunately, we are unable to convert to a useful name, |
1317 | * but we'll still print out the usage information. | 1317 | * but we'll still print out the usage information. |
1318 | */ | 1318 | */ |
1319 | rcu_read_unlock(); | 1319 | rcu_read_unlock(); |
1320 | goto done; | 1320 | goto done; |
1321 | } | 1321 | } |
1322 | rcu_read_unlock(); | 1322 | rcu_read_unlock(); |
1323 | 1323 | ||
1324 | printk(KERN_INFO "Task in %s killed", memcg_name); | 1324 | printk(KERN_INFO "Task in %s killed", memcg_name); |
1325 | 1325 | ||
1326 | rcu_read_lock(); | 1326 | rcu_read_lock(); |
1327 | ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); | 1327 | ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); |
1328 | if (ret < 0) { | 1328 | if (ret < 0) { |
1329 | rcu_read_unlock(); | 1329 | rcu_read_unlock(); |
1330 | goto done; | 1330 | goto done; |
1331 | } | 1331 | } |
1332 | rcu_read_unlock(); | 1332 | rcu_read_unlock(); |
1333 | 1333 | ||
1334 | /* | 1334 | /* |
1335 | * Continues from above, so we don't need a KERN_ level | 1335 | * Continues from above, so we don't need a KERN_ level |
1336 | */ | 1336 | */ |
1337 | printk(KERN_CONT " as a result of limit of %s\n", memcg_name); | 1337 | printk(KERN_CONT " as a result of limit of %s\n", memcg_name); |
1338 | done: | 1338 | done: |
1339 | 1339 | ||
1340 | printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", | 1340 | printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", |
1341 | res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, | 1341 | res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, |
1342 | res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, | 1342 | res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, |
1343 | res_counter_read_u64(&memcg->res, RES_FAILCNT)); | 1343 | res_counter_read_u64(&memcg->res, RES_FAILCNT)); |
1344 | printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " | 1344 | printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " |
1345 | "failcnt %llu\n", | 1345 | "failcnt %llu\n", |
1346 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, | 1346 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, |
1347 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, | 1347 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, |
1348 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); | 1348 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); |
1349 | } | 1349 | } |
1350 | 1350 | ||
1351 | /* | 1351 | /* |
1352 | * This function returns the number of memcgs under the hierarchy tree. Returns | 1352 | * This function returns the number of memcgs under the hierarchy tree. Returns |
1353 | * 1 (self count) if there are no children. | 1353 | * 1 (self count) if there are no children. |
1354 | */ | 1354 | */ |
1355 | static int mem_cgroup_count_children(struct mem_cgroup *mem) | 1355 | static int mem_cgroup_count_children(struct mem_cgroup *mem) |
1356 | { | 1356 | { |
1357 | int num = 0; | 1357 | int num = 0; |
1358 | struct mem_cgroup *iter; | 1358 | struct mem_cgroup *iter; |
1359 | 1359 | ||
1360 | for_each_mem_cgroup_tree(iter, mem) | 1360 | for_each_mem_cgroup_tree(iter, mem) |
1361 | num++; | 1361 | num++; |
1362 | return num; | 1362 | return num; |
1363 | } | 1363 | } |
1364 | 1364 | ||
1365 | /* | 1365 | /* |
1366 | * Return the memory (and swap, if configured) limit for a memcg. | 1366 | * Return the memory (and swap, if configured) limit for a memcg. |
1367 | */ | 1367 | */ |
1368 | u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | 1368 | u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) |
1369 | { | 1369 | { |
1370 | u64 limit; | 1370 | u64 limit; |
1371 | u64 memsw; | 1371 | u64 memsw; |
1372 | 1372 | ||
1373 | limit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 1373 | limit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
1374 | limit += total_swap_pages << PAGE_SHIFT; | 1374 | limit += total_swap_pages << PAGE_SHIFT; |
1375 | 1375 | ||
1376 | memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 1376 | memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
1377 | /* | 1377 | /* |
1378 | * If memsw is finite and limits the amount of swap space available | 1378 | * If memsw is finite and limits the amount of swap space available |
1379 | * to this memcg, return that limit. | 1379 | * to this memcg, return that limit. |
1380 | */ | 1380 | */ |
1381 | return min(limit, memsw); | 1381 | return min(limit, memsw); |
1382 | } | 1382 | } |
1383 | 1383 | ||
1384 | /* | 1384 | /* |
1385 | * Visit the first child (need not be the first child as per the ordering | 1385 | * Visit the first child (need not be the first child as per the ordering |
1386 | * of the cgroup list, since we track last_scanned_child) of @mem and use | 1386 | * of the cgroup list, since we track last_scanned_child) of @mem and use |
1387 | * that to reclaim free pages from. | 1387 | * that to reclaim free pages from. |
1388 | */ | 1388 | */ |
1389 | static struct mem_cgroup * | 1389 | static struct mem_cgroup * |
1390 | mem_cgroup_select_victim(struct mem_cgroup *root_mem) | 1390 | mem_cgroup_select_victim(struct mem_cgroup *root_mem) |
1391 | { | 1391 | { |
1392 | struct mem_cgroup *ret = NULL; | 1392 | struct mem_cgroup *ret = NULL; |
1393 | struct cgroup_subsys_state *css; | 1393 | struct cgroup_subsys_state *css; |
1394 | int nextid, found; | 1394 | int nextid, found; |
1395 | 1395 | ||
1396 | if (!root_mem->use_hierarchy) { | 1396 | if (!root_mem->use_hierarchy) { |
1397 | css_get(&root_mem->css); | 1397 | css_get(&root_mem->css); |
1398 | ret = root_mem; | 1398 | ret = root_mem; |
1399 | } | 1399 | } |
1400 | 1400 | ||
1401 | while (!ret) { | 1401 | while (!ret) { |
1402 | rcu_read_lock(); | 1402 | rcu_read_lock(); |
1403 | nextid = root_mem->last_scanned_child + 1; | 1403 | nextid = root_mem->last_scanned_child + 1; |
1404 | css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, | 1404 | css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, |
1405 | &found); | 1405 | &found); |
1406 | if (css && css_tryget(css)) | 1406 | if (css && css_tryget(css)) |
1407 | ret = container_of(css, struct mem_cgroup, css); | 1407 | ret = container_of(css, struct mem_cgroup, css); |
1408 | 1408 | ||
1409 | rcu_read_unlock(); | 1409 | rcu_read_unlock(); |
1410 | /* Updates scanning parameter */ | 1410 | /* Updates scanning parameter */ |
1411 | if (!css) { | 1411 | if (!css) { |
1412 | /* this means start scan from ID:1 */ | 1412 | /* this means start scan from ID:1 */ |
1413 | root_mem->last_scanned_child = 0; | 1413 | root_mem->last_scanned_child = 0; |
1414 | } else | 1414 | } else |
1415 | root_mem->last_scanned_child = found; | 1415 | root_mem->last_scanned_child = found; |
1416 | } | 1416 | } |
1417 | 1417 | ||
1418 | return ret; | 1418 | return ret; |
1419 | } | 1419 | } |
1420 | 1420 | ||
1421 | /* | 1421 | /* |
1422 | * Scan the hierarchy if needed to reclaim memory. We remember the last child | 1422 | * Scan the hierarchy if needed to reclaim memory. We remember the last child |
1423 | * we reclaimed from, so that we don't end up penalizing one child extensively | 1423 | * we reclaimed from, so that we don't end up penalizing one child extensively |
1424 | * based on its position in the children list. | 1424 | * based on its position in the children list. |
1425 | * | 1425 | * |
1426 | * root_mem is the original ancestor that we've been reclaiming from. | 1426 | * root_mem is the original ancestor that we've been reclaiming from. |
1427 | * | 1427 | * |
1428 | * We give up and return to the caller when we visit root_mem twice. | 1428 | * We give up and return to the caller when we visit root_mem twice. |
1429 | * (other groups can be removed while we're walking....) | 1429 | * (other groups can be removed while we're walking....) |
1430 | * | 1430 | * |
1431 | * If shrink==true, this returns immediately to avoid freeing too much. | 1431 | * If shrink==true, this returns immediately to avoid freeing too much. |
1432 | */ | 1432 | */ |
1433 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | 1433 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, |
1434 | struct zone *zone, | 1434 | struct zone *zone, |
1435 | gfp_t gfp_mask, | 1435 | gfp_t gfp_mask, |
1436 | unsigned long reclaim_options, | 1436 | unsigned long reclaim_options, |
1437 | unsigned long *total_scanned) | 1437 | unsigned long *total_scanned) |
1438 | { | 1438 | { |
1439 | struct mem_cgroup *victim; | 1439 | struct mem_cgroup *victim; |
1440 | int ret, total = 0; | 1440 | int ret, total = 0; |
1441 | int loop = 0; | 1441 | int loop = 0; |
1442 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; | 1442 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; |
1443 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; | 1443 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; |
1444 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; | 1444 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; |
1445 | unsigned long excess; | 1445 | unsigned long excess; |
1446 | unsigned long nr_scanned; | 1446 | unsigned long nr_scanned; |
1447 | 1447 | ||
1448 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; | 1448 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; |
1449 | 1449 | ||
1450 | /* If memsw_is_minimum==1, swap-out is of no use. */ | 1450 | /* If memsw_is_minimum==1, swap-out is of no use. */ |
1451 | if (root_mem->memsw_is_minimum) | 1451 | if (root_mem->memsw_is_minimum) |
1452 | noswap = true; | 1452 | noswap = true; |
1453 | 1453 | ||
1454 | while (1) { | 1454 | while (1) { |
1455 | victim = mem_cgroup_select_victim(root_mem); | 1455 | victim = mem_cgroup_select_victim(root_mem); |
1456 | if (victim == root_mem) { | 1456 | if (victim == root_mem) { |
1457 | loop++; | 1457 | loop++; |
1458 | if (loop >= 1) | 1458 | if (loop >= 1) |
1459 | drain_all_stock_async(); | 1459 | drain_all_stock_async(); |
1460 | if (loop >= 2) { | 1460 | if (loop >= 2) { |
1461 | /* | 1461 | /* |
1462 | * If we have not been able to reclaim | 1462 | * If we have not been able to reclaim |
1463 | * anything, it might be because there are | 1463 | * anything, it might be because there are |
1464 | * no reclaimable pages under this hierarchy | 1464 | * no reclaimable pages under this hierarchy |
1465 | */ | 1465 | */ |
1466 | if (!check_soft || !total) { | 1466 | if (!check_soft || !total) { |
1467 | css_put(&victim->css); | 1467 | css_put(&victim->css); |
1468 | break; | 1468 | break; |
1469 | } | 1469 | } |
1470 | /* | 1470 | /* |
1471 | * We want to do more targeted reclaim. | 1471 | * We want to do more targeted reclaim. |
1472 | * excess >> 2 is not too large, so we don't | 1472 | * excess >> 2 is not too large, so we don't |
1473 | * reclaim too much, and not too small, so we don't keep | 1473 | * reclaim too much, and not too small, so we don't keep |
1474 | * coming back to reclaim from this cgroup. | 1474 | * coming back to reclaim from this cgroup. |
1475 | */ | 1475 | */ |
1476 | if (total >= (excess >> 2) || | 1476 | if (total >= (excess >> 2) || |
1477 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { | 1477 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { |
1478 | css_put(&victim->css); | 1478 | css_put(&victim->css); |
1479 | break; | 1479 | break; |
1480 | } | 1480 | } |
1481 | } | 1481 | } |
1482 | } | 1482 | } |
1483 | if (!mem_cgroup_local_usage(victim)) { | 1483 | if (!mem_cgroup_local_usage(victim)) { |
1484 | /* this cgroup's local usage == 0 */ | 1484 | /* this cgroup's local usage == 0 */ |
1485 | css_put(&victim->css); | 1485 | css_put(&victim->css); |
1486 | continue; | 1486 | continue; |
1487 | } | 1487 | } |
1488 | /* we use swappiness of local cgroup */ | 1488 | /* we use swappiness of local cgroup */ |
1489 | if (check_soft) { | 1489 | if (check_soft) { |
1490 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, | 1490 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, |
1491 | noswap, get_swappiness(victim), zone, | 1491 | noswap, get_swappiness(victim), zone, |
1492 | &nr_scanned); | 1492 | &nr_scanned); |
1493 | *total_scanned += nr_scanned; | 1493 | *total_scanned += nr_scanned; |
1494 | } else | 1494 | } else |
1495 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | 1495 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, |
1496 | noswap, get_swappiness(victim)); | 1496 | noswap, get_swappiness(victim)); |
1497 | css_put(&victim->css); | 1497 | css_put(&victim->css); |
1498 | /* | 1498 | /* |
1499 | * When shrinking usage, we can't check whether we should stop here or | 1499 | * When shrinking usage, we can't check whether we should stop here or |
1500 | * reclaim more; it depends on the callers. last_scanned_child | 1500 | * reclaim more; it depends on the callers. last_scanned_child |
1501 | * will work well enough for keeping fairness under the tree. | 1501 | * will work well enough for keeping fairness under the tree. |
1502 | */ | 1502 | */ |
1503 | if (shrink) | 1503 | if (shrink) |
1504 | return ret; | 1504 | return ret; |
1505 | total += ret; | 1505 | total += ret; |
1506 | if (check_soft) { | 1506 | if (check_soft) { |
1507 | if (!res_counter_soft_limit_excess(&root_mem->res)) | 1507 | if (!res_counter_soft_limit_excess(&root_mem->res)) |
1508 | return total; | 1508 | return total; |
1509 | } else if (mem_cgroup_margin(root_mem)) | 1509 | } else if (mem_cgroup_margin(root_mem)) |
1510 | return 1 + total; | 1510 | return 1 + total; |
1511 | } | 1511 | } |
1512 | return total; | 1512 | return total; |
1513 | } | 1513 | } |
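[Editor's note: illustrative reading of the loop above. Once the walk has returned to root_mem at least twice, soft-limit (check_soft) reclaim gives up as soon as total >= excess >> 2, i.e. once roughly a quarter of the soft-limit excess (in pages) has been reclaimed from the subtree, or after more than MEM_CGROUP_MAX_RECLAIM_LOOPS passes, whichever comes first.]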
1514 | 1514 | ||
1515 | /* | 1515 | /* |
1516 | * Check whether the OOM killer is already running under our hierarchy. | 1516 | * Check whether the OOM killer is already running under our hierarchy. |
1517 | * If someone is already running it, return false. | 1517 | * If someone is already running it, return false. |
1518 | */ | 1518 | */ |
1519 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | 1519 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) |
1520 | { | 1520 | { |
1521 | int x, lock_count = 0; | 1521 | int x, lock_count = 0; |
1522 | struct mem_cgroup *iter; | 1522 | struct mem_cgroup *iter; |
1523 | 1523 | ||
1524 | for_each_mem_cgroup_tree(iter, mem) { | 1524 | for_each_mem_cgroup_tree(iter, mem) { |
1525 | x = atomic_inc_return(&iter->oom_lock); | 1525 | x = atomic_inc_return(&iter->oom_lock); |
1526 | lock_count = max(x, lock_count); | 1526 | lock_count = max(x, lock_count); |
1527 | } | 1527 | } |
1528 | 1528 | ||
1529 | if (lock_count == 1) | 1529 | if (lock_count == 1) |
1530 | return true; | 1530 | return true; |
1531 | return false; | 1531 | return false; |
1532 | } | 1532 | } |
1533 | 1533 | ||
1534 | static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) | 1534 | static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) |
1535 | { | 1535 | { |
1536 | struct mem_cgroup *iter; | 1536 | struct mem_cgroup *iter; |
1537 | 1537 | ||
1538 | /* | 1538 | /* |
1539 | * When a new child is created while the hierarchy is under oom, | 1539 | * When a new child is created while the hierarchy is under oom, |
1540 | * mem_cgroup_oom_lock() may not be called. We have to use | 1540 | * mem_cgroup_oom_lock() may not be called. We have to use |
1541 | * atomic_add_unless() here. | 1541 | * atomic_add_unless() here. |
1542 | */ | 1542 | */ |
1543 | for_each_mem_cgroup_tree(iter, mem) | 1543 | for_each_mem_cgroup_tree(iter, mem) |
1544 | atomic_add_unless(&iter->oom_lock, -1, 0); | 1544 | atomic_add_unless(&iter->oom_lock, -1, 0); |
1545 | return 0; | 1545 | return 0; |
1546 | } | 1546 | } |
1547 | 1547 | ||
1548 | 1548 | ||
1549 | static DEFINE_MUTEX(memcg_oom_mutex); | 1549 | static DEFINE_MUTEX(memcg_oom_mutex); |
1550 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1550 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
1551 | 1551 | ||
1552 | struct oom_wait_info { | 1552 | struct oom_wait_info { |
1553 | struct mem_cgroup *mem; | 1553 | struct mem_cgroup *mem; |
1554 | wait_queue_t wait; | 1554 | wait_queue_t wait; |
1555 | }; | 1555 | }; |
1556 | 1556 | ||
1557 | static int memcg_oom_wake_function(wait_queue_t *wait, | 1557 | static int memcg_oom_wake_function(wait_queue_t *wait, |
1558 | unsigned mode, int sync, void *arg) | 1558 | unsigned mode, int sync, void *arg) |
1559 | { | 1559 | { |
1560 | struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; | 1560 | struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; |
1561 | struct oom_wait_info *oom_wait_info; | 1561 | struct oom_wait_info *oom_wait_info; |
1562 | 1562 | ||
1563 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | 1563 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); |
1564 | 1564 | ||
1565 | if (oom_wait_info->mem == wake_mem) | 1565 | if (oom_wait_info->mem == wake_mem) |
1566 | goto wakeup; | 1566 | goto wakeup; |
1567 | /* if no hierarchy, no match */ | 1567 | /* if no hierarchy, no match */ |
1568 | if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) | 1568 | if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) |
1569 | return 0; | 1569 | return 0; |
1570 | /* | 1570 | /* |
1571 | * Both oom_wait_info->mem and wake_mem are stable under us, | 1571 | * Both oom_wait_info->mem and wake_mem are stable under us, |
1572 | * so we can use css_is_ancestor() without taking care of RCU. | 1572 | * so we can use css_is_ancestor() without taking care of RCU. |
1573 | */ | 1573 | */ |
1574 | if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && | 1574 | if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && |
1575 | !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) | 1575 | !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) |
1576 | return 0; | 1576 | return 0; |
1577 | 1577 | ||
1578 | wakeup: | 1578 | wakeup: |
1579 | return autoremove_wake_function(wait, mode, sync, arg); | 1579 | return autoremove_wake_function(wait, mode, sync, arg); |
1580 | } | 1580 | } |
1581 | 1581 | ||
1582 | static void memcg_wakeup_oom(struct mem_cgroup *mem) | 1582 | static void memcg_wakeup_oom(struct mem_cgroup *mem) |
1583 | { | 1583 | { |
1584 | /* for filtering, pass "mem" as argument. */ | 1584 | /* for filtering, pass "mem" as argument. */ |
1585 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); | 1585 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); |
1586 | } | 1586 | } |
1587 | 1587 | ||
1588 | static void memcg_oom_recover(struct mem_cgroup *mem) | 1588 | static void memcg_oom_recover(struct mem_cgroup *mem) |
1589 | { | 1589 | { |
1590 | if (mem && atomic_read(&mem->oom_lock)) | 1590 | if (mem && atomic_read(&mem->oom_lock)) |
1591 | memcg_wakeup_oom(mem); | 1591 | memcg_wakeup_oom(mem); |
1592 | } | 1592 | } |
1593 | 1593 | ||
1594 | /* | 1594 | /* |
1595 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 1595 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. |
1596 | */ | 1596 | */ |
1597 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | 1597 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) |
1598 | { | 1598 | { |
1599 | struct oom_wait_info owait; | 1599 | struct oom_wait_info owait; |
1600 | bool locked, need_to_kill; | 1600 | bool locked, need_to_kill; |
1601 | 1601 | ||
1602 | owait.mem = mem; | 1602 | owait.mem = mem; |
1603 | owait.wait.flags = 0; | 1603 | owait.wait.flags = 0; |
1604 | owait.wait.func = memcg_oom_wake_function; | 1604 | owait.wait.func = memcg_oom_wake_function; |
1605 | owait.wait.private = current; | 1605 | owait.wait.private = current; |
1606 | INIT_LIST_HEAD(&owait.wait.task_list); | 1606 | INIT_LIST_HEAD(&owait.wait.task_list); |
1607 | need_to_kill = true; | 1607 | need_to_kill = true; |
1608 | /* At first, try to OOM-lock the hierarchy under mem. */ | 1608 | /* At first, try to OOM-lock the hierarchy under mem. */ |
1609 | mutex_lock(&memcg_oom_mutex); | 1609 | mutex_lock(&memcg_oom_mutex); |
1610 | locked = mem_cgroup_oom_lock(mem); | 1610 | locked = mem_cgroup_oom_lock(mem); |
1611 | /* | 1611 | /* |
1612 | * Even if signal_pending(), we can't quit charge() loop without | 1612 | * Even if signal_pending(), we can't quit charge() loop without |
1613 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | 1613 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL |
1614 | * under OOM is always welcome, so use TASK_KILLABLE here. | 1614 | * under OOM is always welcome, so use TASK_KILLABLE here. |
1615 | */ | 1615 | */ |
1616 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 1616 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
1617 | if (!locked || mem->oom_kill_disable) | 1617 | if (!locked || mem->oom_kill_disable) |
1618 | need_to_kill = false; | 1618 | need_to_kill = false; |
1619 | if (locked) | 1619 | if (locked) |
1620 | mem_cgroup_oom_notify(mem); | 1620 | mem_cgroup_oom_notify(mem); |
1621 | mutex_unlock(&memcg_oom_mutex); | 1621 | mutex_unlock(&memcg_oom_mutex); |
1622 | 1622 | ||
1623 | if (need_to_kill) { | 1623 | if (need_to_kill) { |
1624 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1624 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1625 | mem_cgroup_out_of_memory(mem, mask); | 1625 | mem_cgroup_out_of_memory(mem, mask); |
1626 | } else { | 1626 | } else { |
1627 | schedule(); | 1627 | schedule(); |
1628 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1628 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1629 | } | 1629 | } |
1630 | mutex_lock(&memcg_oom_mutex); | 1630 | mutex_lock(&memcg_oom_mutex); |
1631 | mem_cgroup_oom_unlock(mem); | 1631 | mem_cgroup_oom_unlock(mem); |
1632 | memcg_wakeup_oom(mem); | 1632 | memcg_wakeup_oom(mem); |
1633 | mutex_unlock(&memcg_oom_mutex); | 1633 | mutex_unlock(&memcg_oom_mutex); |
1634 | 1634 | ||
1635 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 1635 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) |
1636 | return false; | 1636 | return false; |
1637 | /* Give a chance to the dying process */ | 1637 | /* Give a chance to the dying process */ |
1638 | schedule_timeout(1); | 1638 | schedule_timeout(1); |
1639 | return true; | 1639 | return true; |
1640 | } | 1640 | } |
1641 | 1641 | ||
1642 | /* | 1642 | /* |
1643 | * Currently used to update mapped file statistics, but the routine can be | 1643 | * Currently used to update mapped file statistics, but the routine can be |
1644 | * generalized to update other statistics as well. | 1644 | * generalized to update other statistics as well. |
1645 | * | 1645 | * |
1646 | * Notes: Race condition | 1646 | * Notes: Race condition |
1647 | * | 1647 | * |
1648 | * We usually use page_cgroup_lock() for accessing page_cgroup members, but | 1648 | * We usually use page_cgroup_lock() for accessing page_cgroup members, but |
1649 | * it tends to be costly. Under some conditions, we don't need | 1649 | * it tends to be costly. Under some conditions, we don't need |
1650 | * to do so _always_. | 1650 | * to do so _always_. |
1651 | * | 1651 | * |
1652 | * Considering "charge", lock_page_cgroup() is not required because all | 1652 | * Considering "charge", lock_page_cgroup() is not required because all |
1653 | * file-stat operations happen after a page is attached to radix-tree. There | 1653 | * file-stat operations happen after a page is attached to radix-tree. There |
1654 | * is no race with "charge". | 1654 | * is no race with "charge". |
1655 | * | 1655 | * |
1656 | * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup | 1656 | * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup |
1657 | * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even | 1657 | * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even |
1658 | * if there is a race with "uncharge". Statistics are properly handled | 1658 | * if there is a race with "uncharge". Statistics are properly handled |
1659 | * by flags. | 1659 | * by flags. |
1660 | * | 1660 | * |
1661 | * Considering "move", this is the only case where we see a race. To make the | 1661 | * Considering "move", this is the only case where we see a race. To make the |
1662 | * race small, we check the MEM_CGROUP_ON_MOVE percpu value and detect whether | 1662 | * race small, we check the MEM_CGROUP_ON_MOVE percpu value and detect whether |
1663 | * there is a possibility of a race condition. If there is, we take a lock. | 1663 | * there is a possibility of a race condition. If there is, we take a lock. |
1664 | */ | 1664 | */ |
1665 | 1665 | ||
1666 | void mem_cgroup_update_page_stat(struct page *page, | 1666 | void mem_cgroup_update_page_stat(struct page *page, |
1667 | enum mem_cgroup_page_stat_item idx, int val) | 1667 | enum mem_cgroup_page_stat_item idx, int val) |
1668 | { | 1668 | { |
1669 | struct mem_cgroup *mem; | 1669 | struct mem_cgroup *mem; |
1670 | struct page_cgroup *pc = lookup_page_cgroup(page); | 1670 | struct page_cgroup *pc = lookup_page_cgroup(page); |
1671 | bool need_unlock = false; | 1671 | bool need_unlock = false; |
1672 | unsigned long uninitialized_var(flags); | 1672 | unsigned long uninitialized_var(flags); |
1673 | 1673 | ||
1674 | if (unlikely(!pc)) | 1674 | if (unlikely(!pc)) |
1675 | return; | 1675 | return; |
1676 | 1676 | ||
1677 | rcu_read_lock(); | 1677 | rcu_read_lock(); |
1678 | mem = pc->mem_cgroup; | 1678 | mem = pc->mem_cgroup; |
1679 | if (unlikely(!mem || !PageCgroupUsed(pc))) | 1679 | if (unlikely(!mem || !PageCgroupUsed(pc))) |
1680 | goto out; | 1680 | goto out; |
1681 | /* pc->mem_cgroup is unstable ? */ | 1681 | /* pc->mem_cgroup is unstable ? */ |
1682 | if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { | 1682 | if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { |
1683 | /* take a lock to safely access pc->mem_cgroup */ | 1683 | /* take a lock to safely access pc->mem_cgroup */ |
1684 | move_lock_page_cgroup(pc, &flags); | 1684 | move_lock_page_cgroup(pc, &flags); |
1685 | need_unlock = true; | 1685 | need_unlock = true; |
1686 | mem = pc->mem_cgroup; | 1686 | mem = pc->mem_cgroup; |
1687 | if (!mem || !PageCgroupUsed(pc)) | 1687 | if (!mem || !PageCgroupUsed(pc)) |
1688 | goto out; | 1688 | goto out; |
1689 | } | 1689 | } |
1690 | 1690 | ||
1691 | switch (idx) { | 1691 | switch (idx) { |
1692 | case MEMCG_NR_FILE_MAPPED: | 1692 | case MEMCG_NR_FILE_MAPPED: |
1693 | if (val > 0) | 1693 | if (val > 0) |
1694 | SetPageCgroupFileMapped(pc); | 1694 | SetPageCgroupFileMapped(pc); |
1695 | else if (!page_mapped(page)) | 1695 | else if (!page_mapped(page)) |
1696 | ClearPageCgroupFileMapped(pc); | 1696 | ClearPageCgroupFileMapped(pc); |
1697 | idx = MEM_CGROUP_STAT_FILE_MAPPED; | 1697 | idx = MEM_CGROUP_STAT_FILE_MAPPED; |
1698 | break; | 1698 | break; |
1699 | default: | 1699 | default: |
1700 | BUG(); | 1700 | BUG(); |
1701 | } | 1701 | } |
1702 | 1702 | ||
1703 | this_cpu_add(mem->stat->count[idx], val); | 1703 | this_cpu_add(mem->stat->count[idx], val); |
1704 | 1704 | ||
1705 | out: | 1705 | out: |
1706 | if (unlikely(need_unlock)) | 1706 | if (unlikely(need_unlock)) |
1707 | move_unlock_page_cgroup(pc, &flags); | 1707 | move_unlock_page_cgroup(pc, &flags); |
1708 | rcu_read_unlock(); | 1708 | rcu_read_unlock(); |
1709 | return; | 1709 | return; |
1710 | } | 1710 | } |
1711 | EXPORT_SYMBOL(mem_cgroup_update_page_stat); | 1711 | EXPORT_SYMBOL(mem_cgroup_update_page_stat); |
1712 | 1712 | ||
1713 | /* | 1713 | /* |
1714 | * size of first charge trial. "32" comes from vmscan.c's magic value. | 1714 | * size of first charge trial. "32" comes from vmscan.c's magic value. |
1715 | * TODO: it may be necessary to use bigger numbers on big iron. | 1715 | * TODO: it may be necessary to use bigger numbers on big iron. |
1716 | */ | 1716 | */ |
1717 | #define CHARGE_BATCH 32U | 1717 | #define CHARGE_BATCH 32U |
1718 | struct memcg_stock_pcp { | 1718 | struct memcg_stock_pcp { |
1719 | struct mem_cgroup *cached; /* this is never the root cgroup */ | 1719 | struct mem_cgroup *cached; /* this is never the root cgroup */ |
1720 | unsigned int nr_pages; | 1720 | unsigned int nr_pages; |
1721 | struct work_struct work; | 1721 | struct work_struct work; |
1722 | }; | 1722 | }; |
1723 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | 1723 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); |
1724 | static atomic_t memcg_drain_count; | 1724 | static atomic_t memcg_drain_count; |
1725 | 1725 | ||
1726 | /* | 1726 | /* |
1727 | * Try to consume stocked charge on this cpu. If successful, one page is | 1727 | * Try to consume stocked charge on this cpu. If successful, one page is |
1728 | * consumed from the local stock and true is returned. If the stock is 0 or | 1728 | * consumed from the local stock and true is returned. If the stock is 0 or |
1729 | * holds charges from a cgroup which is not the current target, false is | 1729 | * holds charges from a cgroup which is not the current target, false is |
1730 | * returned. This stock will be refilled. | 1730 | * returned. This stock will be refilled. |
1731 | */ | 1731 | */ |
1732 | static bool consume_stock(struct mem_cgroup *mem) | 1732 | static bool consume_stock(struct mem_cgroup *mem) |
1733 | { | 1733 | { |
1734 | struct memcg_stock_pcp *stock; | 1734 | struct memcg_stock_pcp *stock; |
1735 | bool ret = true; | 1735 | bool ret = true; |
1736 | 1736 | ||
1737 | stock = &get_cpu_var(memcg_stock); | 1737 | stock = &get_cpu_var(memcg_stock); |
1738 | if (mem == stock->cached && stock->nr_pages) | 1738 | if (mem == stock->cached && stock->nr_pages) |
1739 | stock->nr_pages--; | 1739 | stock->nr_pages--; |
1740 | else /* need to call res_counter_charge */ | 1740 | else /* need to call res_counter_charge */ |
1741 | ret = false; | 1741 | ret = false; |
1742 | put_cpu_var(memcg_stock); | 1742 | put_cpu_var(memcg_stock); |
1743 | return ret; | 1743 | return ret; |
1744 | } | 1744 | } |
1745 | 1745 | ||
1746 | /* | 1746 | /* |
1747 | * Return stocks cached in the percpu area to the res_counter and reset the cached information. | 1747 | * Return stocks cached in the percpu area to the res_counter and reset the cached information. |
1748 | */ | 1748 | */ |
1749 | static void drain_stock(struct memcg_stock_pcp *stock) | 1749 | static void drain_stock(struct memcg_stock_pcp *stock) |
1750 | { | 1750 | { |
1751 | struct mem_cgroup *old = stock->cached; | 1751 | struct mem_cgroup *old = stock->cached; |
1752 | 1752 | ||
1753 | if (stock->nr_pages) { | 1753 | if (stock->nr_pages) { |
1754 | unsigned long bytes = stock->nr_pages * PAGE_SIZE; | 1754 | unsigned long bytes = stock->nr_pages * PAGE_SIZE; |
1755 | 1755 | ||
1756 | res_counter_uncharge(&old->res, bytes); | 1756 | res_counter_uncharge(&old->res, bytes); |
1757 | if (do_swap_account) | 1757 | if (do_swap_account) |
1758 | res_counter_uncharge(&old->memsw, bytes); | 1758 | res_counter_uncharge(&old->memsw, bytes); |
1759 | stock->nr_pages = 0; | 1759 | stock->nr_pages = 0; |
1760 | } | 1760 | } |
1761 | stock->cached = NULL; | 1761 | stock->cached = NULL; |
1762 | } | 1762 | } |
1763 | 1763 | ||
1764 | /* | 1764 | /* |
1765 | * This must be called with preemption disabled or by a thread which is | 1765 | * This must be called with preemption disabled or by a thread which is |
1766 | * pinned to the local cpu. | 1766 | * pinned to the local cpu. |
1767 | */ | 1767 | */ |
1768 | static void drain_local_stock(struct work_struct *dummy) | 1768 | static void drain_local_stock(struct work_struct *dummy) |
1769 | { | 1769 | { |
1770 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | 1770 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); |
1771 | drain_stock(stock); | 1771 | drain_stock(stock); |
1772 | } | 1772 | } |
1773 | 1773 | ||
1774 | /* | 1774 | /* |
1775 | * Cache charges(val) from the res_counter into the local per-cpu area. | 1775 | * Cache charges(val) from the res_counter into the local per-cpu area. |
1776 | * They will be consumed by the consume_stock() function later. | 1776 | * They will be consumed by the consume_stock() function later. |
1777 | */ | 1777 | */ |
1778 | static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) | 1778 | static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) |
1779 | { | 1779 | { |
1780 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | 1780 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); |
1781 | 1781 | ||
1782 | if (stock->cached != mem) { /* reset if necessary */ | 1782 | if (stock->cached != mem) { /* reset if necessary */ |
1783 | drain_stock(stock); | 1783 | drain_stock(stock); |
1784 | stock->cached = mem; | 1784 | stock->cached = mem; |
1785 | } | 1785 | } |
1786 | stock->nr_pages += nr_pages; | 1786 | stock->nr_pages += nr_pages; |
1787 | put_cpu_var(memcg_stock); | 1787 | put_cpu_var(memcg_stock); |
1788 | } | 1788 | } |
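/*
 * Illustrative userspace sketch (not kernel code) of the batching idea
 * behind consume_stock()/refill_stock() above: a charge of CHARGE_BATCH
 * pages is taken from the shared counter once, and later single-page
 * charges are served from the local cache. All names below (shared_usage,
 * stock_nr_pages, charge_one_page, ...) are invented for the example;
 * only CHARGE_BATCH mirrors the define above, and locking/per-cpu data
 * are deliberately left out.
 */
#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH 32U

static unsigned long shared_usage;   /* stands in for the res_counter */
static unsigned int stock_nr_pages;  /* stands in for memcg_stock_pcp */

static bool stock_consume(void)
{
	if (stock_nr_pages) {
		stock_nr_pages--;     /* fast path: no shared counter update */
		return true;
	}
	return false;
}

static void charge_one_page(void)
{
	if (stock_consume())
		return;
	/* slow path: charge a whole batch once, keep the surplus locally */
	shared_usage += CHARGE_BATCH;
	stock_nr_pages += CHARGE_BATCH - 1;
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		charge_one_page();
	/* 100 pages charged with only 4 updates of the shared counter */
	printf("shared_usage=%lu pages, stock=%u pages\n",
	       shared_usage, stock_nr_pages);
	return 0;
}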
1789 | 1789 | ||
1790 | /* | 1790 | /* |
1791 | * Tries to drain stocked charges on other cpus. This function is asynchronous | 1791 | * Tries to drain stocked charges on other cpus. This function is asynchronous |
1792 | * and just schedules a work item per cpu for draining locally on each cpu. | 1792 | * and just schedules a work item per cpu for draining locally on each cpu. |
1793 | * Callers can expect some charges to come back to the res_counter later but | 1793 | * Callers can expect some charges to come back to the res_counter later but |
1794 | * cannot wait for it. | 1794 | * cannot wait for it. |
1795 | */ | 1795 | */ |
1796 | static void drain_all_stock_async(void) | 1796 | static void drain_all_stock_async(void) |
1797 | { | 1797 | { |
1798 | int cpu; | 1798 | int cpu; |
1799 | /* This function is for scheduling "drain" in an asynchronous way. | 1799 | /* This function is for scheduling "drain" in an asynchronous way. |
1800 | * The result of "drain" is not directly handled by callers. So, | 1800 | * The result of "drain" is not directly handled by callers. So, |
1801 | * if someone is already calling drain, we don't have to call it again. | 1801 | * if someone is already calling drain, we don't have to call it again. |
1802 | * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch it | 1802 | * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch it |
1803 | * if there is a race. We just do a loose check here. | 1803 | * if there is a race. We just do a loose check here. |
1804 | */ | 1804 | */ |
1805 | if (atomic_read(&memcg_drain_count)) | 1805 | if (atomic_read(&memcg_drain_count)) |
1806 | return; | 1806 | return; |
1807 | /* Notify other cpus that system-wide "drain" is running */ | 1807 | /* Notify other cpus that system-wide "drain" is running */ |
1808 | atomic_inc(&memcg_drain_count); | 1808 | atomic_inc(&memcg_drain_count); |
1809 | get_online_cpus(); | 1809 | get_online_cpus(); |
1810 | for_each_online_cpu(cpu) { | 1810 | for_each_online_cpu(cpu) { |
1811 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | 1811 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); |
1812 | schedule_work_on(cpu, &stock->work); | 1812 | schedule_work_on(cpu, &stock->work); |
1813 | } | 1813 | } |
1814 | put_online_cpus(); | 1814 | put_online_cpus(); |
1815 | atomic_dec(&memcg_drain_count); | 1815 | atomic_dec(&memcg_drain_count); |
1816 | /* We don't wait for flush_work */ | 1816 | /* We don't wait for flush_work */ |
1817 | } | 1817 | } |
1818 | 1818 | ||
1819 | /* This is a synchronous drain interface. */ | 1819 | /* This is a synchronous drain interface. */ |
1820 | static void drain_all_stock_sync(void) | 1820 | static void drain_all_stock_sync(void) |
1821 | { | 1821 | { |
1822 | /* called when force_empty is called */ | 1822 | /* called when force_empty is called */ |
1823 | atomic_inc(&memcg_drain_count); | 1823 | atomic_inc(&memcg_drain_count); |
1824 | schedule_on_each_cpu(drain_local_stock); | 1824 | schedule_on_each_cpu(drain_local_stock); |
1825 | atomic_dec(&memcg_drain_count); | 1825 | atomic_dec(&memcg_drain_count); |
1826 | } | 1826 | } |
1827 | 1827 | ||
1828 | /* | 1828 | /* |
1829 | * This function drains the percpu counter values from a DEAD cpu and | 1829 | * This function drains the percpu counter values from a DEAD cpu and |
1830 | * moves them to the local cpu. Note that this function can be preempted. | 1830 | * moves them to the local cpu. Note that this function can be preempted. |
1831 | */ | 1831 | */ |
1832 | static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) | 1832 | static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) |
1833 | { | 1833 | { |
1834 | int i; | 1834 | int i; |
1835 | 1835 | ||
1836 | spin_lock(&mem->pcp_counter_lock); | 1836 | spin_lock(&mem->pcp_counter_lock); |
1837 | for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { | 1837 | for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { |
1838 | long x = per_cpu(mem->stat->count[i], cpu); | 1838 | long x = per_cpu(mem->stat->count[i], cpu); |
1839 | 1839 | ||
1840 | per_cpu(mem->stat->count[i], cpu) = 0; | 1840 | per_cpu(mem->stat->count[i], cpu) = 0; |
1841 | mem->nocpu_base.count[i] += x; | 1841 | mem->nocpu_base.count[i] += x; |
1842 | } | 1842 | } |
1843 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { | 1843 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { |
1844 | unsigned long x = per_cpu(mem->stat->events[i], cpu); | 1844 | unsigned long x = per_cpu(mem->stat->events[i], cpu); |
1845 | 1845 | ||
1846 | per_cpu(mem->stat->events[i], cpu) = 0; | 1846 | per_cpu(mem->stat->events[i], cpu) = 0; |
1847 | mem->nocpu_base.events[i] += x; | 1847 | mem->nocpu_base.events[i] += x; |
1848 | } | 1848 | } |
1849 | /* need to clear ON_MOVE value, works as a kind of lock. */ | 1849 | /* need to clear ON_MOVE value, works as a kind of lock. */ |
1850 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; | 1850 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; |
1851 | spin_unlock(&mem->pcp_counter_lock); | 1851 | spin_unlock(&mem->pcp_counter_lock); |
1852 | } | 1852 | } |
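/*
 * Illustrative userspace sketch (not kernel code) of the fold-up done by
 * mem_cgroup_drain_pcp_counter() above: when a cpu goes away, its per-cpu
 * deltas are added into a "nocpu" base so the total stays correct. The
 * plain arrays and names below (percpu_count, nocpu_base, drain_dead_cpu,
 * read_stat) are invented stand-ins for the example only.
 */
#include <stdio.h>

#define NR_CPUS 4
#define NSTATS  2

static long percpu_count[NR_CPUS][NSTATS];
static long nocpu_base[NSTATS];

static void drain_dead_cpu(int cpu)
{
	for (int i = 0; i < NSTATS; i++) {
		nocpu_base[i] += percpu_count[cpu][i];
		percpu_count[cpu][i] = 0;	/* dead cpu keeps no residue */
	}
}

static long read_stat(int idx)
{
	long sum = nocpu_base[idx];

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += percpu_count[cpu][idx];
	return sum;
}

int main(void)
{
	percpu_count[0][0] = 5;
	percpu_count[3][0] = 7;
	drain_dead_cpu(3);			/* cpu 3 "goes offline" */
	printf("stat[0] = %ld\n", read_stat(0));	/* still 12 */
	return 0;
}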
1853 | 1853 | ||
1854 | static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) | 1854 | static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) |
1855 | { | 1855 | { |
1856 | int idx = MEM_CGROUP_ON_MOVE; | 1856 | int idx = MEM_CGROUP_ON_MOVE; |
1857 | 1857 | ||
1858 | spin_lock(&mem->pcp_counter_lock); | 1858 | spin_lock(&mem->pcp_counter_lock); |
1859 | per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; | 1859 | per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; |
1860 | spin_unlock(&mem->pcp_counter_lock); | 1860 | spin_unlock(&mem->pcp_counter_lock); |
1861 | } | 1861 | } |
1862 | 1862 | ||
1863 | static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, | 1863 | static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, |
1864 | unsigned long action, | 1864 | unsigned long action, |
1865 | void *hcpu) | 1865 | void *hcpu) |
1866 | { | 1866 | { |
1867 | int cpu = (unsigned long)hcpu; | 1867 | int cpu = (unsigned long)hcpu; |
1868 | struct memcg_stock_pcp *stock; | 1868 | struct memcg_stock_pcp *stock; |
1869 | struct mem_cgroup *iter; | 1869 | struct mem_cgroup *iter; |
1870 | 1870 | ||
1871 | if ((action == CPU_ONLINE)) { | 1871 | if ((action == CPU_ONLINE)) { |
1872 | for_each_mem_cgroup_all(iter) | 1872 | for_each_mem_cgroup_all(iter) |
1873 | synchronize_mem_cgroup_on_move(iter, cpu); | 1873 | synchronize_mem_cgroup_on_move(iter, cpu); |
1874 | return NOTIFY_OK; | 1874 | return NOTIFY_OK; |
1875 | } | 1875 | } |
1876 | 1876 | ||
1877 | if ((action != CPU_DEAD) && (action != CPU_DEAD_FROZEN)) | 1877 | if ((action != CPU_DEAD) && (action != CPU_DEAD_FROZEN)) |
1878 | return NOTIFY_OK; | 1878 | return NOTIFY_OK; |
1879 | 1879 | ||
1880 | for_each_mem_cgroup_all(iter) | 1880 | for_each_mem_cgroup_all(iter) |
1881 | mem_cgroup_drain_pcp_counter(iter, cpu); | 1881 | mem_cgroup_drain_pcp_counter(iter, cpu); |
1882 | 1882 | ||
1883 | stock = &per_cpu(memcg_stock, cpu); | 1883 | stock = &per_cpu(memcg_stock, cpu); |
1884 | drain_stock(stock); | 1884 | drain_stock(stock); |
1885 | return NOTIFY_OK; | 1885 | return NOTIFY_OK; |
1886 | } | 1886 | } |
1887 | 1887 | ||
1888 | 1888 | ||
1889 | /* See __mem_cgroup_try_charge() for details */ | 1889 | /* See __mem_cgroup_try_charge() for details */ |
1890 | enum { | 1890 | enum { |
1891 | CHARGE_OK, /* success */ | 1891 | CHARGE_OK, /* success */ |
1892 | CHARGE_RETRY, /* need to retry but retry is not bad */ | 1892 | CHARGE_RETRY, /* need to retry but retry is not bad */ |
1893 | CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ | 1893 | CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ |
1894 | CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and not enough res. */ | 1894 | CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and not enough res. */ |
1895 | CHARGE_OOM_DIE, /* the current task is killed because of OOM */ | 1895 | CHARGE_OOM_DIE, /* the current task is killed because of OOM */ |
1896 | }; | 1896 | }; |
1897 | 1897 | ||
1898 | static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | 1898 | static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, |
1899 | unsigned int nr_pages, bool oom_check) | 1899 | unsigned int nr_pages, bool oom_check) |
1900 | { | 1900 | { |
1901 | unsigned long csize = nr_pages * PAGE_SIZE; | 1901 | unsigned long csize = nr_pages * PAGE_SIZE; |
1902 | struct mem_cgroup *mem_over_limit; | 1902 | struct mem_cgroup *mem_over_limit; |
1903 | struct res_counter *fail_res; | 1903 | struct res_counter *fail_res; |
1904 | unsigned long flags = 0; | 1904 | unsigned long flags = 0; |
1905 | int ret; | 1905 | int ret; |
1906 | 1906 | ||
1907 | ret = res_counter_charge(&mem->res, csize, &fail_res); | 1907 | ret = res_counter_charge(&mem->res, csize, &fail_res); |
1908 | 1908 | ||
1909 | if (likely(!ret)) { | 1909 | if (likely(!ret)) { |
1910 | if (!do_swap_account) | 1910 | if (!do_swap_account) |
1911 | return CHARGE_OK; | 1911 | return CHARGE_OK; |
1912 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); | 1912 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); |
1913 | if (likely(!ret)) | 1913 | if (likely(!ret)) |
1914 | return CHARGE_OK; | 1914 | return CHARGE_OK; |
1915 | 1915 | ||
1916 | res_counter_uncharge(&mem->res, csize); | 1916 | res_counter_uncharge(&mem->res, csize); |
1917 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | 1917 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); |
1918 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 1918 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1919 | } else | 1919 | } else |
1920 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); | 1920 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); |
1921 | /* | 1921 | /* |
1922 | * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch | 1922 | * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch |
1923 | * of regular pages (CHARGE_BATCH), or a single regular page (1). | 1923 | * of regular pages (CHARGE_BATCH), or a single regular page (1). |
1924 | * | 1924 | * |
1925 | * Never reclaim on behalf of optional batching, retry with a | 1925 | * Never reclaim on behalf of optional batching, retry with a |
1926 | * single page instead. | 1926 | * single page instead. |
1927 | */ | 1927 | */ |
1928 | if (nr_pages == CHARGE_BATCH) | 1928 | if (nr_pages == CHARGE_BATCH) |
1929 | return CHARGE_RETRY; | 1929 | return CHARGE_RETRY; |
1930 | 1930 | ||
1931 | if (!(gfp_mask & __GFP_WAIT)) | 1931 | if (!(gfp_mask & __GFP_WAIT)) |
1932 | return CHARGE_WOULDBLOCK; | 1932 | return CHARGE_WOULDBLOCK; |
1933 | 1933 | ||
1934 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, | 1934 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, |
1935 | gfp_mask, flags, NULL); | 1935 | gfp_mask, flags, NULL); |
1936 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) | 1936 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) |
1937 | return CHARGE_RETRY; | 1937 | return CHARGE_RETRY; |
1938 | /* | 1938 | /* |
1939 | * Even though the limit is exceeded at this point, reclaim | 1939 | * Even though the limit is exceeded at this point, reclaim |
1940 | * may have been able to free some pages. Retry the charge | 1940 | * may have been able to free some pages. Retry the charge |
1941 | * before killing the task. | 1941 | * before killing the task. |
1942 | * | 1942 | * |
1943 | * Only for regular pages, though: huge pages are rather | 1943 | * Only for regular pages, though: huge pages are rather |
1944 | * unlikely to succeed so close to the limit, and we fall back | 1944 | * unlikely to succeed so close to the limit, and we fall back |
1945 | * to regular pages anyway in case of failure. | 1945 | * to regular pages anyway in case of failure. |
1946 | */ | 1946 | */ |
1947 | if (nr_pages == 1 && ret) | 1947 | if (nr_pages == 1 && ret) |
1948 | return CHARGE_RETRY; | 1948 | return CHARGE_RETRY; |
1949 | 1949 | ||
1950 | /* | 1950 | /* |
1951 | * At task move, charge accounts can be doubly counted. So, it's | 1951 | * At task move, charge accounts can be doubly counted. So, it's |
1952 | * better to wait until the end of task_move if something is going on. | 1952 | * better to wait until the end of task_move if something is going on. |
1953 | */ | 1953 | */ |
1954 | if (mem_cgroup_wait_acct_move(mem_over_limit)) | 1954 | if (mem_cgroup_wait_acct_move(mem_over_limit)) |
1955 | return CHARGE_RETRY; | 1955 | return CHARGE_RETRY; |
1956 | 1956 | ||
1957 | /* If we don't need to call the oom-killer at all, return immediately */ | 1957 | /* If we don't need to call the oom-killer at all, return immediately */ |
1958 | if (!oom_check) | 1958 | if (!oom_check) |
1959 | return CHARGE_NOMEM; | 1959 | return CHARGE_NOMEM; |
1960 | /* check OOM */ | 1960 | /* check OOM */ |
1961 | if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) | 1961 | if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) |
1962 | return CHARGE_OOM_DIE; | 1962 | return CHARGE_OOM_DIE; |
1963 | 1963 | ||
1964 | return CHARGE_RETRY; | 1964 | return CHARGE_RETRY; |
1965 | } | 1965 | } |
1966 | 1966 | ||
1967 | /* | 1967 | /* |
1968 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 1968 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
1969 | * oom-killer can be invoked. | 1969 | * oom-killer can be invoked. |
1970 | */ | 1970 | */ |
1971 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1971 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
1972 | gfp_t gfp_mask, | 1972 | gfp_t gfp_mask, |
1973 | unsigned int nr_pages, | 1973 | unsigned int nr_pages, |
1974 | struct mem_cgroup **memcg, | 1974 | struct mem_cgroup **memcg, |
1975 | bool oom) | 1975 | bool oom) |
1976 | { | 1976 | { |
1977 | unsigned int batch = max(CHARGE_BATCH, nr_pages); | 1977 | unsigned int batch = max(CHARGE_BATCH, nr_pages); |
1978 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1978 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1979 | struct mem_cgroup *mem = NULL; | 1979 | struct mem_cgroup *mem = NULL; |
1980 | int ret; | 1980 | int ret; |
1981 | 1981 | ||
1982 | /* | 1982 | /* |
1983 | * Unlike the global VM's OOM-kill, we're not in a system-level memory | 1983 | * Unlike the global VM's OOM-kill, we're not in a system-level memory |
1984 | * shortage. So, allow a dying process to go ahead in addition to a | 1984 | * shortage. So, allow a dying process to go ahead in addition to a |
1985 | * MEMDIE process. | 1985 | * MEMDIE process. |
1986 | */ | 1986 | */ |
1987 | if (unlikely(test_thread_flag(TIF_MEMDIE) | 1987 | if (unlikely(test_thread_flag(TIF_MEMDIE) |
1988 | || fatal_signal_pending(current))) | 1988 | || fatal_signal_pending(current))) |
1989 | goto bypass; | 1989 | goto bypass; |
1990 | 1990 | ||
1991 | /* | 1991 | /* |
1992 | * We always charge the cgroup the mm_struct belongs to. | 1992 | * We always charge the cgroup the mm_struct belongs to. |
1993 | * The mm_struct's mem_cgroup changes on task migration if the | 1993 | * The mm_struct's mem_cgroup changes on task migration if the |
1994 | * thread group leader migrates. It's possible that mm is not | 1994 | * thread group leader migrates. It's possible that mm is not |
1995 | * set, if so charge the init_mm (happens for pagecache usage). | 1995 | * set, if so charge the init_mm (happens for pagecache usage). |
1996 | */ | 1996 | */ |
1997 | if (!*memcg && !mm) | 1997 | if (!*memcg && !mm) |
1998 | goto bypass; | 1998 | goto bypass; |
1999 | again: | 1999 | again: |
2000 | if (*memcg) { /* css should be a valid one */ | 2000 | if (*memcg) { /* css should be a valid one */ |
2001 | mem = *memcg; | 2001 | mem = *memcg; |
2002 | VM_BUG_ON(css_is_removed(&mem->css)); | 2002 | VM_BUG_ON(css_is_removed(&mem->css)); |
2003 | if (mem_cgroup_is_root(mem)) | 2003 | if (mem_cgroup_is_root(mem)) |
2004 | goto done; | 2004 | goto done; |
2005 | if (nr_pages == 1 && consume_stock(mem)) | 2005 | if (nr_pages == 1 && consume_stock(mem)) |
2006 | goto done; | 2006 | goto done; |
2007 | css_get(&mem->css); | 2007 | css_get(&mem->css); |
2008 | } else { | 2008 | } else { |
2009 | struct task_struct *p; | 2009 | struct task_struct *p; |
2010 | 2010 | ||
2011 | rcu_read_lock(); | 2011 | rcu_read_lock(); |
2012 | p = rcu_dereference(mm->owner); | 2012 | p = rcu_dereference(mm->owner); |
2013 | /* | 2013 | /* |
2014 | * Because we don't have task_lock(), "p" can exit. | 2014 | * Because we don't have task_lock(), "p" can exit. |
2015 | * In that case, "mem" can point to root or p can be NULL due to a | 2015 | * In that case, "mem" can point to root or p can be NULL due to a |
2016 | * race with swapoff. Then, we have a small risk of mis-accounting. | 2016 | * race with swapoff. Then, we have a small risk of mis-accounting. |
2017 | * But this kind of mis-accounting by a race always happens because | 2017 | * But this kind of mis-accounting by a race always happens because |
2018 | * we don't have cgroup_mutex(). It's overkill and we allow that | 2018 | * we don't have cgroup_mutex(). It's overkill and we allow that |
2019 | * small race here. | 2019 | * small race here. |
2020 | * (*) swapoff etc. will charge against the mm_struct, not against the | 2020 | * (*) swapoff etc. will charge against the mm_struct, not against the |
2021 | * task_struct. So, mm->owner can be NULL. | 2021 | * task_struct. So, mm->owner can be NULL. |
2022 | */ | 2022 | */ |
2023 | mem = mem_cgroup_from_task(p); | 2023 | mem = mem_cgroup_from_task(p); |
2024 | if (!mem || mem_cgroup_is_root(mem)) { | 2024 | if (!mem || mem_cgroup_is_root(mem)) { |
2025 | rcu_read_unlock(); | 2025 | rcu_read_unlock(); |
2026 | goto done; | 2026 | goto done; |
2027 | } | 2027 | } |
2028 | if (nr_pages == 1 && consume_stock(mem)) { | 2028 | if (nr_pages == 1 && consume_stock(mem)) { |
2029 | /* | 2029 | /* |
2030 | * It seems dangerous to access memcg without css_get(). | 2030 | * It seems dangerous to access memcg without css_get(). |
2031 | * But considering how consume_stock works, it's not | 2031 | * But considering how consume_stock works, it's not |
2032 | * necessary. If consume_stock succeeds, some charges | 2032 | * necessary. If consume_stock succeeds, some charges |
2033 | * from this memcg are cached on this cpu. So, we | 2033 | * from this memcg are cached on this cpu. So, we |
2034 | * don't need to call css_get()/css_tryget() before | 2034 | * don't need to call css_get()/css_tryget() before |
2035 | * calling consume_stock(). | 2035 | * calling consume_stock(). |
2036 | */ | 2036 | */ |
2037 | rcu_read_unlock(); | 2037 | rcu_read_unlock(); |
2038 | goto done; | 2038 | goto done; |
2039 | } | 2039 | } |
2040 | /* after here, we may be blocked. we need to get refcnt */ | 2040 | /* after here, we may be blocked. we need to get refcnt */ |
2041 | if (!css_tryget(&mem->css)) { | 2041 | if (!css_tryget(&mem->css)) { |
2042 | rcu_read_unlock(); | 2042 | rcu_read_unlock(); |
2043 | goto again; | 2043 | goto again; |
2044 | } | 2044 | } |
2045 | rcu_read_unlock(); | 2045 | rcu_read_unlock(); |
2046 | } | 2046 | } |
2047 | 2047 | ||
2048 | do { | 2048 | do { |
2049 | bool oom_check; | 2049 | bool oom_check; |
2050 | 2050 | ||
2051 | /* If killed, bypass charge */ | 2051 | /* If killed, bypass charge */ |
2052 | if (fatal_signal_pending(current)) { | 2052 | if (fatal_signal_pending(current)) { |
2053 | css_put(&mem->css); | 2053 | css_put(&mem->css); |
2054 | goto bypass; | 2054 | goto bypass; |
2055 | } | 2055 | } |
2056 | 2056 | ||
2057 | oom_check = false; | 2057 | oom_check = false; |
2058 | if (oom && !nr_oom_retries) { | 2058 | if (oom && !nr_oom_retries) { |
2059 | oom_check = true; | 2059 | oom_check = true; |
2060 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2060 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2061 | } | 2061 | } |
2062 | 2062 | ||
2063 | ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); | 2063 | ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); |
2064 | switch (ret) { | 2064 | switch (ret) { |
2065 | case CHARGE_OK: | 2065 | case CHARGE_OK: |
2066 | break; | 2066 | break; |
2067 | case CHARGE_RETRY: /* not in OOM situation but retry */ | 2067 | case CHARGE_RETRY: /* not in OOM situation but retry */ |
2068 | batch = nr_pages; | 2068 | batch = nr_pages; |
2069 | css_put(&mem->css); | 2069 | css_put(&mem->css); |
2070 | mem = NULL; | 2070 | mem = NULL; |
2071 | goto again; | 2071 | goto again; |
2072 | case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ | 2072 | case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ |
2073 | css_put(&mem->css); | 2073 | css_put(&mem->css); |
2074 | goto nomem; | 2074 | goto nomem; |
2075 | case CHARGE_NOMEM: /* OOM routine works */ | 2075 | case CHARGE_NOMEM: /* OOM routine works */ |
2076 | if (!oom) { | 2076 | if (!oom) { |
2077 | css_put(&mem->css); | 2077 | css_put(&mem->css); |
2078 | goto nomem; | 2078 | goto nomem; |
2079 | } | 2079 | } |
2080 | /* If oom, we never return -ENOMEM */ | 2080 | /* If oom, we never return -ENOMEM */ |
2081 | nr_oom_retries--; | 2081 | nr_oom_retries--; |
2082 | break; | 2082 | break; |
2083 | case CHARGE_OOM_DIE: /* Killed by OOM Killer */ | 2083 | case CHARGE_OOM_DIE: /* Killed by OOM Killer */ |
2084 | css_put(&mem->css); | 2084 | css_put(&mem->css); |
2085 | goto bypass; | 2085 | goto bypass; |
2086 | } | 2086 | } |
2087 | } while (ret != CHARGE_OK); | 2087 | } while (ret != CHARGE_OK); |
2088 | 2088 | ||
2089 | if (batch > nr_pages) | 2089 | if (batch > nr_pages) |
2090 | refill_stock(mem, batch - nr_pages); | 2090 | refill_stock(mem, batch - nr_pages); |
2091 | css_put(&mem->css); | 2091 | css_put(&mem->css); |
2092 | done: | 2092 | done: |
2093 | *memcg = mem; | 2093 | *memcg = mem; |
2094 | return 0; | 2094 | return 0; |
2095 | nomem: | 2095 | nomem: |
2096 | *memcg = NULL; | 2096 | *memcg = NULL; |
2097 | return -ENOMEM; | 2097 | return -ENOMEM; |
2098 | bypass: | 2098 | bypass: |
2099 | *memcg = NULL; | 2099 | *memcg = NULL; |
2100 | return 0; | 2100 | return 0; |
2101 | } | 2101 | } |
2102 | 2102 | ||
2103 | /* | 2103 | /* |
2104 | * Sometimes we have to undo a charge we got by try_charge(). | 2104 | * Sometimes we have to undo a charge we got by try_charge(). |
2105 | * This function is for that: it does the uncharge and puts the css refcnt | 2105 | * This function is for that: it does the uncharge and puts the css refcnt |
2106 | * gotten by try_charge(). | 2106 | * gotten by try_charge(). |
2107 | */ | 2107 | */ |
2108 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, | 2108 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, |
2109 | unsigned int nr_pages) | 2109 | unsigned int nr_pages) |
2110 | { | 2110 | { |
2111 | if (!mem_cgroup_is_root(mem)) { | 2111 | if (!mem_cgroup_is_root(mem)) { |
2112 | unsigned long bytes = nr_pages * PAGE_SIZE; | 2112 | unsigned long bytes = nr_pages * PAGE_SIZE; |
2113 | 2113 | ||
2114 | res_counter_uncharge(&mem->res, bytes); | 2114 | res_counter_uncharge(&mem->res, bytes); |
2115 | if (do_swap_account) | 2115 | if (do_swap_account) |
2116 | res_counter_uncharge(&mem->memsw, bytes); | 2116 | res_counter_uncharge(&mem->memsw, bytes); |
2117 | } | 2117 | } |
2118 | } | 2118 | } |
2119 | 2119 | ||
2120 | /* | 2120 | /* |
2121 | * A helper function to get a mem_cgroup from an ID. Must be called under | 2121 | * A helper function to get a mem_cgroup from an ID. Must be called under |
2122 | * rcu_read_lock(). The caller must check css_is_removed() or the like if | 2122 | * rcu_read_lock(). The caller must check css_is_removed() or the like if |
2123 | * that is a concern. (Dropping a refcnt from swap can be called against a | 2123 | * that is a concern. (Dropping a refcnt from swap can be called against a |
2124 | * removed memcg.) | 2124 | * removed memcg.) |
2125 | */ | 2125 | */ |
2126 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | 2126 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) |
2127 | { | 2127 | { |
2128 | struct cgroup_subsys_state *css; | 2128 | struct cgroup_subsys_state *css; |
2129 | 2129 | ||
2130 | /* ID 0 is unused ID */ | 2130 | /* ID 0 is unused ID */ |
2131 | if (!id) | 2131 | if (!id) |
2132 | return NULL; | 2132 | return NULL; |
2133 | css = css_lookup(&mem_cgroup_subsys, id); | 2133 | css = css_lookup(&mem_cgroup_subsys, id); |
2134 | if (!css) | 2134 | if (!css) |
2135 | return NULL; | 2135 | return NULL; |
2136 | return container_of(css, struct mem_cgroup, css); | 2136 | return container_of(css, struct mem_cgroup, css); |
2137 | } | 2137 | } |
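/*
 * Illustrative userspace sketch (not kernel code) of the container_of()
 * step used by mem_cgroup_lookup() above: given a pointer to the embedded
 * css member, recover the enclosing mem_cgroup. The offsetof-based macro
 * below is a simplified stand-in for the kernel's container_of(), and the
 * two structs are reduced to what this example needs.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct cgroup_subsys_state { int refcnt; };

struct mem_cgroup {
	int id;
	struct cgroup_subsys_state css;
};

int main(void)
{
	struct mem_cgroup memcg = { .id = 42 };
	struct cgroup_subsys_state *css = &memcg.css;
	struct mem_cgroup *back = container_of(css, struct mem_cgroup, css);

	printf("id=%d, recovered the same object: %d\n",
	       back->id, back == &memcg);
	return 0;
}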
2138 | 2138 | ||
2139 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | 2139 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
2140 | { | 2140 | { |
2141 | struct mem_cgroup *mem = NULL; | 2141 | struct mem_cgroup *mem = NULL; |
2142 | struct page_cgroup *pc; | 2142 | struct page_cgroup *pc; |
2143 | unsigned short id; | 2143 | unsigned short id; |
2144 | swp_entry_t ent; | 2144 | swp_entry_t ent; |
2145 | 2145 | ||
2146 | VM_BUG_ON(!PageLocked(page)); | 2146 | VM_BUG_ON(!PageLocked(page)); |
2147 | 2147 | ||
2148 | pc = lookup_page_cgroup(page); | 2148 | pc = lookup_page_cgroup(page); |
2149 | lock_page_cgroup(pc); | 2149 | lock_page_cgroup(pc); |
2150 | if (PageCgroupUsed(pc)) { | 2150 | if (PageCgroupUsed(pc)) { |
2151 | mem = pc->mem_cgroup; | 2151 | mem = pc->mem_cgroup; |
2152 | if (mem && !css_tryget(&mem->css)) | 2152 | if (mem && !css_tryget(&mem->css)) |
2153 | mem = NULL; | 2153 | mem = NULL; |
2154 | } else if (PageSwapCache(page)) { | 2154 | } else if (PageSwapCache(page)) { |
2155 | ent.val = page_private(page); | 2155 | ent.val = page_private(page); |
2156 | id = lookup_swap_cgroup(ent); | 2156 | id = lookup_swap_cgroup(ent); |
2157 | rcu_read_lock(); | 2157 | rcu_read_lock(); |
2158 | mem = mem_cgroup_lookup(id); | 2158 | mem = mem_cgroup_lookup(id); |
2159 | if (mem && !css_tryget(&mem->css)) | 2159 | if (mem && !css_tryget(&mem->css)) |
2160 | mem = NULL; | 2160 | mem = NULL; |
2161 | rcu_read_unlock(); | 2161 | rcu_read_unlock(); |
2162 | } | 2162 | } |
2163 | unlock_page_cgroup(pc); | 2163 | unlock_page_cgroup(pc); |
2164 | return mem; | 2164 | return mem; |
2165 | } | 2165 | } |
2166 | 2166 | ||
2167 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | 2167 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, |
2168 | struct page *page, | 2168 | struct page *page, |
2169 | unsigned int nr_pages, | 2169 | unsigned int nr_pages, |
2170 | struct page_cgroup *pc, | 2170 | struct page_cgroup *pc, |
2171 | enum charge_type ctype) | 2171 | enum charge_type ctype) |
2172 | { | 2172 | { |
2173 | lock_page_cgroup(pc); | 2173 | lock_page_cgroup(pc); |
2174 | if (unlikely(PageCgroupUsed(pc))) { | 2174 | if (unlikely(PageCgroupUsed(pc))) { |
2175 | unlock_page_cgroup(pc); | 2175 | unlock_page_cgroup(pc); |
2176 | __mem_cgroup_cancel_charge(mem, nr_pages); | 2176 | __mem_cgroup_cancel_charge(mem, nr_pages); |
2177 | return; | 2177 | return; |
2178 | } | 2178 | } |
2179 | /* | 2179 | /* |
2180 | * we don't need page_cgroup_lock for tail pages, because they are not | 2180 | * we don't need page_cgroup_lock for tail pages, because they are not |
2181 | * accessed by any other context at this point. | 2181 | * accessed by any other context at this point. |
2182 | */ | 2182 | */ |
2183 | pc->mem_cgroup = mem; | 2183 | pc->mem_cgroup = mem; |
2184 | /* | 2184 | /* |
2185 | * We access a page_cgroup asynchronously without lock_page_cgroup(). | 2185 | * We access a page_cgroup asynchronously without lock_page_cgroup(). |
2186 | * Especially when a page_cgroup is taken from a page, pc->mem_cgroup | 2186 | * Especially when a page_cgroup is taken from a page, pc->mem_cgroup |
2187 | * is accessed after testing USED bit. To make pc->mem_cgroup visible | 2187 | * is accessed after testing USED bit. To make pc->mem_cgroup visible |
2188 | * before USED bit, we need memory barrier here. | 2188 | * before USED bit, we need memory barrier here. |
2189 | * See mem_cgroup_add_lru_list(), etc. | 2189 | * See mem_cgroup_add_lru_list(), etc. |
2190 | */ | 2190 | */ |
2191 | smp_wmb(); | 2191 | smp_wmb(); |
2192 | switch (ctype) { | 2192 | switch (ctype) { |
2193 | case MEM_CGROUP_CHARGE_TYPE_CACHE: | 2193 | case MEM_CGROUP_CHARGE_TYPE_CACHE: |
2194 | case MEM_CGROUP_CHARGE_TYPE_SHMEM: | 2194 | case MEM_CGROUP_CHARGE_TYPE_SHMEM: |
2195 | SetPageCgroupCache(pc); | 2195 | SetPageCgroupCache(pc); |
2196 | SetPageCgroupUsed(pc); | 2196 | SetPageCgroupUsed(pc); |
2197 | break; | 2197 | break; |
2198 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 2198 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: |
2199 | ClearPageCgroupCache(pc); | 2199 | ClearPageCgroupCache(pc); |
2200 | SetPageCgroupUsed(pc); | 2200 | SetPageCgroupUsed(pc); |
2201 | break; | 2201 | break; |
2202 | default: | 2202 | default: |
2203 | break; | 2203 | break; |
2204 | } | 2204 | } |
2205 | 2205 | ||
2206 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); | 2206 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); |
2207 | unlock_page_cgroup(pc); | 2207 | unlock_page_cgroup(pc); |
2208 | /* | 2208 | /* |
2209 | * "charge_statistics" updated event counter. Then, check it. | 2209 | * "charge_statistics" updated event counter. Then, check it. |
2210 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 2210 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
2211 | * if they exceeds softlimit. | 2211 | * if they exceeds softlimit. |
2212 | */ | 2212 | */ |
2213 | memcg_check_events(mem, page); | 2213 | memcg_check_events(mem, page); |
2214 | } | 2214 | } |
2215 | 2215 | ||
2216 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2216 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2217 | 2217 | ||
2218 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ | 2218 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ |
2219 | (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) | 2219 | (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) |
2220 | /* | 2220 | /* |
2221 | * Because tail pages are not marked as "used", set the flag here. We're under | 2221 | * Because tail pages are not marked as "used", set the flag here. We're under |
2222 | * zone->lru_lock, 'splitting on pmd' and compound_lock. | 2222 | * zone->lru_lock, 'splitting on pmd' and compound_lock. |
2223 | */ | 2223 | */ |
2224 | void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) | 2224 | void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) |
2225 | { | 2225 | { |
2226 | struct page_cgroup *head_pc = lookup_page_cgroup(head); | 2226 | struct page_cgroup *head_pc = lookup_page_cgroup(head); |
2227 | struct page_cgroup *tail_pc = lookup_page_cgroup(tail); | 2227 | struct page_cgroup *tail_pc = lookup_page_cgroup(tail); |
2228 | unsigned long flags; | 2228 | unsigned long flags; |
2229 | 2229 | ||
2230 | if (mem_cgroup_disabled()) | 2230 | if (mem_cgroup_disabled()) |
2231 | return; | 2231 | return; |
2232 | /* | 2232 | /* |
2233 | * We have no races with charge/uncharge but will have races with | 2233 | * We have no races with charge/uncharge but will have races with |
2234 | * page state accounting. | 2234 | * page state accounting. |
2235 | */ | 2235 | */ |
2236 | move_lock_page_cgroup(head_pc, &flags); | 2236 | move_lock_page_cgroup(head_pc, &flags); |
2237 | 2237 | ||
2238 | tail_pc->mem_cgroup = head_pc->mem_cgroup; | 2238 | tail_pc->mem_cgroup = head_pc->mem_cgroup; |
2239 | smp_wmb(); /* see __commit_charge() */ | 2239 | smp_wmb(); /* see __commit_charge() */ |
2240 | if (PageCgroupAcctLRU(head_pc)) { | 2240 | if (PageCgroupAcctLRU(head_pc)) { |
2241 | enum lru_list lru; | 2241 | enum lru_list lru; |
2242 | struct mem_cgroup_per_zone *mz; | 2242 | struct mem_cgroup_per_zone *mz; |
2243 | 2243 | ||
2244 | /* | 2244 | /* |
2245 | * LRU flags cannot be copied because we need to add the tail | 2245 | * LRU flags cannot be copied because we need to add the tail |
2246 | * page to the LRU by a generic call and our hook will be called. | 2246 | * page to the LRU by a generic call and our hook will be called. |
2247 | * We hold lru_lock, so reduce the counter directly. | 2247 | * We hold lru_lock, so reduce the counter directly. |
2248 | */ | 2248 | */ |
2249 | lru = page_lru(head); | 2249 | lru = page_lru(head); |
2250 | mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); | 2250 | mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); |
2251 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 2251 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
2252 | } | 2252 | } |
2253 | tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; | 2253 | tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; |
2254 | move_unlock_page_cgroup(head_pc, &flags); | 2254 | move_unlock_page_cgroup(head_pc, &flags); |
2255 | } | 2255 | } |
2256 | #endif | 2256 | #endif |
2257 | 2257 | ||
2258 | /** | 2258 | /** |
2259 | * mem_cgroup_move_account - move account of the page | 2259 | * mem_cgroup_move_account - move account of the page |
2260 | * @page: the page | 2260 | * @page: the page |
2261 | * @nr_pages: number of regular pages (>1 for huge pages) | 2261 | * @nr_pages: number of regular pages (>1 for huge pages) |
2262 | * @pc: page_cgroup of the page. | 2262 | * @pc: page_cgroup of the page. |
2263 | * @from: mem_cgroup which the page is moved from. | 2263 | * @from: mem_cgroup which the page is moved from. |
2264 | * @to: mem_cgroup which the page is moved to. @from != @to. | 2264 | * @to: mem_cgroup which the page is moved to. @from != @to. |
2265 | * @uncharge: whether we should call uncharge and css_put against @from. | 2265 | * @uncharge: whether we should call uncharge and css_put against @from. |
2266 | * | 2266 | * |
2267 | * The caller must confirm the following. | 2267 | * The caller must confirm the following. |
2268 | * - page is not on LRU (isolate_page() is useful.) | 2268 | * - page is not on LRU (isolate_page() is useful.) |
2269 | * - compound_lock is held when nr_pages > 1 | 2269 | * - compound_lock is held when nr_pages > 1 |
2270 | * | 2270 | * |
2271 | * This function doesn't do "charge" nor css_get to the new cgroup. It should | 2271 | * This function doesn't do "charge" nor css_get to the new cgroup. It should |
2272 | * be done by the caller (__mem_cgroup_try_charge would be useful). If | 2272 | * be done by the caller (__mem_cgroup_try_charge would be useful). If |
2273 | * @uncharge is true, this function does "uncharge" from the old cgroup, but | 2273 | * @uncharge is true, this function does "uncharge" from the old cgroup, but |
2274 | * it doesn't if @uncharge is false, so the caller should do "uncharge". | 2274 | * it doesn't if @uncharge is false, so the caller should do "uncharge". |
2275 | */ | 2275 | */ |
2276 | static int mem_cgroup_move_account(struct page *page, | 2276 | static int mem_cgroup_move_account(struct page *page, |
2277 | unsigned int nr_pages, | 2277 | unsigned int nr_pages, |
2278 | struct page_cgroup *pc, | 2278 | struct page_cgroup *pc, |
2279 | struct mem_cgroup *from, | 2279 | struct mem_cgroup *from, |
2280 | struct mem_cgroup *to, | 2280 | struct mem_cgroup *to, |
2281 | bool uncharge) | 2281 | bool uncharge) |
2282 | { | 2282 | { |
2283 | unsigned long flags; | 2283 | unsigned long flags; |
2284 | int ret; | 2284 | int ret; |
2285 | 2285 | ||
2286 | VM_BUG_ON(from == to); | 2286 | VM_BUG_ON(from == to); |
2287 | VM_BUG_ON(PageLRU(page)); | 2287 | VM_BUG_ON(PageLRU(page)); |
2288 | /* | 2288 | /* |
2289 | * The page is isolated from the LRU. So, the collapse function | 2289 | * The page is isolated from the LRU. So, the collapse function |
2290 | * will not handle this page. But page splitting can happen. | 2290 | * will not handle this page. But page splitting can happen. |
2291 | * Do this check under compound_page_lock(). The caller should | 2291 | * Do this check under compound_page_lock(). The caller should |
2292 | * hold it. | 2292 | * hold it. |
2293 | */ | 2293 | */ |
2294 | ret = -EBUSY; | 2294 | ret = -EBUSY; |
2295 | if (nr_pages > 1 && !PageTransHuge(page)) | 2295 | if (nr_pages > 1 && !PageTransHuge(page)) |
2296 | goto out; | 2296 | goto out; |
2297 | 2297 | ||
2298 | lock_page_cgroup(pc); | 2298 | lock_page_cgroup(pc); |
2299 | 2299 | ||
2300 | ret = -EINVAL; | 2300 | ret = -EINVAL; |
2301 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) | 2301 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) |
2302 | goto unlock; | 2302 | goto unlock; |
2303 | 2303 | ||
2304 | move_lock_page_cgroup(pc, &flags); | 2304 | move_lock_page_cgroup(pc, &flags); |
2305 | 2305 | ||
2306 | if (PageCgroupFileMapped(pc)) { | 2306 | if (PageCgroupFileMapped(pc)) { |
2307 | /* Update mapped_file data for mem_cgroup */ | 2307 | /* Update mapped_file data for mem_cgroup */ |
2308 | preempt_disable(); | 2308 | preempt_disable(); |
2309 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 2309 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
2310 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 2310 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
2311 | preempt_enable(); | 2311 | preempt_enable(); |
2312 | } | 2312 | } |
2313 | mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); | 2313 | mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); |
2314 | if (uncharge) | 2314 | if (uncharge) |
2315 | /* This is not "cancel", but cancel_charge does all we need. */ | 2315 | /* This is not "cancel", but cancel_charge does all we need. */ |
2316 | __mem_cgroup_cancel_charge(from, nr_pages); | 2316 | __mem_cgroup_cancel_charge(from, nr_pages); |
2317 | 2317 | ||
2318 | /* caller should have done css_get */ | 2318 | /* caller should have done css_get */ |
2319 | pc->mem_cgroup = to; | 2319 | pc->mem_cgroup = to; |
2320 | mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); | 2320 | mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); |
2321 | /* | 2321 | /* |
2322 | * We charge against "to", which may not have any tasks. Then, "to" | 2322 | * We charge against "to", which may not have any tasks. Then, "to" |
2323 | * can be under rmdir(). But in the current implementation, the callers of | 2323 | * can be under rmdir(). But in the current implementation, the callers of |
2324 | * this function are just force_empty() and move charge, so it's | 2324 | * this function are just force_empty() and move charge, so it's |
2325 | * guaranteed that "to" is never removed. So, we don't check the rmdir | 2325 | * guaranteed that "to" is never removed. So, we don't check the rmdir |
2326 | * status here. | 2326 | * status here. |
2327 | */ | 2327 | */ |
2328 | move_unlock_page_cgroup(pc, &flags); | 2328 | move_unlock_page_cgroup(pc, &flags); |
2329 | ret = 0; | 2329 | ret = 0; |
2330 | unlock: | 2330 | unlock: |
2331 | unlock_page_cgroup(pc); | 2331 | unlock_page_cgroup(pc); |
2332 | /* | 2332 | /* |
2333 | * check events | 2333 | * check events |
2334 | */ | 2334 | */ |
2335 | memcg_check_events(to, page); | 2335 | memcg_check_events(to, page); |
2336 | memcg_check_events(from, page); | 2336 | memcg_check_events(from, page); |
2337 | out: | 2337 | out: |
2338 | return ret; | 2338 | return ret; |
2339 | } | 2339 | } |
2340 | 2340 | ||
2341 | /* | 2341 | /* |
2342 | * move charges to its parent. | 2342 | * move charges to its parent. |
2343 | */ | 2343 | */ |
2344 | 2344 | ||
2345 | static int mem_cgroup_move_parent(struct page *page, | 2345 | static int mem_cgroup_move_parent(struct page *page, |
2346 | struct page_cgroup *pc, | 2346 | struct page_cgroup *pc, |
2347 | struct mem_cgroup *child, | 2347 | struct mem_cgroup *child, |
2348 | gfp_t gfp_mask) | 2348 | gfp_t gfp_mask) |
2349 | { | 2349 | { |
2350 | struct cgroup *cg = child->css.cgroup; | 2350 | struct cgroup *cg = child->css.cgroup; |
2351 | struct cgroup *pcg = cg->parent; | 2351 | struct cgroup *pcg = cg->parent; |
2352 | struct mem_cgroup *parent; | 2352 | struct mem_cgroup *parent; |
2353 | unsigned int nr_pages; | 2353 | unsigned int nr_pages; |
2354 | unsigned long uninitialized_var(flags); | 2354 | unsigned long uninitialized_var(flags); |
2355 | int ret; | 2355 | int ret; |
2356 | 2356 | ||
2357 | /* Is ROOT ? */ | 2357 | /* Is ROOT ? */ |
2358 | if (!pcg) | 2358 | if (!pcg) |
2359 | return -EINVAL; | 2359 | return -EINVAL; |
2360 | 2360 | ||
2361 | ret = -EBUSY; | 2361 | ret = -EBUSY; |
2362 | if (!get_page_unless_zero(page)) | 2362 | if (!get_page_unless_zero(page)) |
2363 | goto out; | 2363 | goto out; |
2364 | if (isolate_lru_page(page)) | 2364 | if (isolate_lru_page(page)) |
2365 | goto put; | 2365 | goto put; |
2366 | 2366 | ||
2367 | nr_pages = hpage_nr_pages(page); | 2367 | nr_pages = hpage_nr_pages(page); |
2368 | 2368 | ||
2369 | parent = mem_cgroup_from_cont(pcg); | 2369 | parent = mem_cgroup_from_cont(pcg); |
2370 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); | 2370 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); |
2371 | if (ret || !parent) | 2371 | if (ret || !parent) |
2372 | goto put_back; | 2372 | goto put_back; |
2373 | 2373 | ||
2374 | if (nr_pages > 1) | 2374 | if (nr_pages > 1) |
2375 | flags = compound_lock_irqsave(page); | 2375 | flags = compound_lock_irqsave(page); |
2376 | 2376 | ||
2377 | ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); | 2377 | ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); |
2378 | if (ret) | 2378 | if (ret) |
2379 | __mem_cgroup_cancel_charge(parent, nr_pages); | 2379 | __mem_cgroup_cancel_charge(parent, nr_pages); |
2380 | 2380 | ||
2381 | if (nr_pages > 1) | 2381 | if (nr_pages > 1) |
2382 | compound_unlock_irqrestore(page, flags); | 2382 | compound_unlock_irqrestore(page, flags); |
2383 | put_back: | 2383 | put_back: |
2384 | putback_lru_page(page); | 2384 | putback_lru_page(page); |
2385 | put: | 2385 | put: |
2386 | put_page(page); | 2386 | put_page(page); |
2387 | out: | 2387 | out: |
2388 | return ret; | 2388 | return ret; |
2389 | } | 2389 | } |
2390 | 2390 | ||
2391 | /* | 2391 | /* |
2392 | * Charge the memory controller for page usage. | 2392 | * Charge the memory controller for page usage. |
2393 | * Return | 2393 | * Return |
2394 | * 0 if the charge was successful | 2394 | * 0 if the charge was successful |
2395 | * < 0 if the cgroup is over its limit | 2395 | * < 0 if the cgroup is over its limit |
2396 | */ | 2396 | */ |
2397 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | 2397 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, |
2398 | gfp_t gfp_mask, enum charge_type ctype) | 2398 | gfp_t gfp_mask, enum charge_type ctype) |
2399 | { | 2399 | { |
2400 | struct mem_cgroup *mem = NULL; | 2400 | struct mem_cgroup *mem = NULL; |
2401 | unsigned int nr_pages = 1; | 2401 | unsigned int nr_pages = 1; |
2402 | struct page_cgroup *pc; | 2402 | struct page_cgroup *pc; |
2403 | bool oom = true; | 2403 | bool oom = true; |
2404 | int ret; | 2404 | int ret; |
2405 | 2405 | ||
2406 | if (PageTransHuge(page)) { | 2406 | if (PageTransHuge(page)) { |
2407 | nr_pages <<= compound_order(page); | 2407 | nr_pages <<= compound_order(page); |
2408 | VM_BUG_ON(!PageTransHuge(page)); | 2408 | VM_BUG_ON(!PageTransHuge(page)); |
2409 | /* | 2409 | /* |
2410 | * Never OOM-kill a process for a huge page. The | 2410 | * Never OOM-kill a process for a huge page. The |
2411 | * fault handler will fall back to regular pages. | 2411 | * fault handler will fall back to regular pages. |
2412 | */ | 2412 | */ |
2413 | oom = false; | 2413 | oom = false; |
2414 | } | 2414 | } |
2415 | 2415 | ||
2416 | pc = lookup_page_cgroup(page); | 2416 | pc = lookup_page_cgroup(page); |
2417 | BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ | 2417 | BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ |
2418 | 2418 | ||
2419 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); | 2419 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); |
2420 | if (ret || !mem) | 2420 | if (ret || !mem) |
2421 | return ret; | 2421 | return ret; |
2422 | 2422 | ||
2423 | __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); | 2423 | __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); |
2424 | return 0; | 2424 | return 0; |
2425 | } | 2425 | } |
2426 | 2426 | ||
2427 | int mem_cgroup_newpage_charge(struct page *page, | 2427 | int mem_cgroup_newpage_charge(struct page *page, |
2428 | struct mm_struct *mm, gfp_t gfp_mask) | 2428 | struct mm_struct *mm, gfp_t gfp_mask) |
2429 | { | 2429 | { |
2430 | if (mem_cgroup_disabled()) | 2430 | if (mem_cgroup_disabled()) |
2431 | return 0; | 2431 | return 0; |
2432 | /* | 2432 | /* |
2433 | * If already mapped, we don't have to account. | 2433 | * If already mapped, we don't have to account. |
2434 | * If page cache, page->mapping has address_space. | 2434 | * If page cache, page->mapping has address_space. |
2435 | * But page->mapping may have an out-of-use anon_vma pointer; | 2435 | * But page->mapping may have an out-of-use anon_vma pointer; |
2436 | * detect it by a PageAnon() check. A newly-mapped anon page's page->mapping | 2436 | * detect it by a PageAnon() check. A newly-mapped anon page's page->mapping |
2437 | * is NULL. | 2437 | * is NULL. |
2438 | */ | 2438 | */ |
2439 | if (page_mapped(page) || (page->mapping && !PageAnon(page))) | 2439 | if (page_mapped(page) || (page->mapping && !PageAnon(page))) |
2440 | return 0; | 2440 | return 0; |
2441 | if (unlikely(!mm)) | 2441 | if (unlikely(!mm)) |
2442 | mm = &init_mm; | 2442 | mm = &init_mm; |
2443 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 2443 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
2444 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2444 | MEM_CGROUP_CHARGE_TYPE_MAPPED); |
2445 | } | 2445 | } |
2446 | 2446 | ||
2447 | static void | 2447 | static void |
2448 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | 2448 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, |
2449 | enum charge_type ctype); | 2449 | enum charge_type ctype); |
2450 | 2450 | ||
2451 | static void | 2451 | static void |
2452 | __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, | 2452 | __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, |
2453 | enum charge_type ctype) | 2453 | enum charge_type ctype) |
2454 | { | 2454 | { |
2455 | struct page_cgroup *pc = lookup_page_cgroup(page); | 2455 | struct page_cgroup *pc = lookup_page_cgroup(page); |
2456 | /* | 2456 | /* |
2457 | * In some cases (SwapCache, FUSE(splice_buf->radixtree)), the page | 2457 | * In some cases (SwapCache, FUSE(splice_buf->radixtree)), the page |
2458 | * is already on the LRU. It means the page may be on some other | 2458 | * is already on the LRU. It means the page may be on some other |
2459 | * page_cgroup's LRU. Take care of it. | 2459 | * page_cgroup's LRU. Take care of it. |
2460 | */ | 2460 | */ |
2461 | mem_cgroup_lru_del_before_commit(page); | 2461 | mem_cgroup_lru_del_before_commit(page); |
2462 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); | 2462 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); |
2463 | mem_cgroup_lru_add_after_commit(page); | 2463 | mem_cgroup_lru_add_after_commit(page); |
2464 | return; | 2464 | return; |
2465 | } | 2465 | } |
2466 | 2466 | ||
2467 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 2467 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
2468 | gfp_t gfp_mask) | 2468 | gfp_t gfp_mask) |
2469 | { | 2469 | { |
2470 | struct mem_cgroup *mem = NULL; | 2470 | struct mem_cgroup *mem = NULL; |
2471 | int ret; | 2471 | int ret; |
2472 | 2472 | ||
2473 | if (mem_cgroup_disabled()) | 2473 | if (mem_cgroup_disabled()) |
2474 | return 0; | 2474 | return 0; |
2475 | if (PageCompound(page)) | 2475 | if (PageCompound(page)) |
2476 | return 0; | 2476 | return 0; |
2477 | /* | 2477 | /* |
2478 | * Corner case handling. This is usually called from add_to_page_cache(). | 2478 | * Corner case handling. This is usually called from add_to_page_cache(). |
2479 | * But some filesystems (shmem) pre-charge the page before calling it | 2479 | * But some filesystems (shmem) pre-charge the page before calling it |
2480 | * and then call add_to_page_cache() with GFP_NOWAIT. | 2480 | * and then call add_to_page_cache() with GFP_NOWAIT. |
2481 | * | 2481 | * |
2482 | * In the GFP_NOWAIT case the page may already have been charged before | 2482 | * In the GFP_NOWAIT case the page may already have been charged before |
2483 | * add_to_page_cache(). (See shmem.c.) Check it here and avoid charging | 2483 | * add_to_page_cache(). (See shmem.c.) Check it here and avoid charging |
2484 | * twice. (It works but costs a bit more.) | 2484 | * twice. (It works but costs a bit more.) |
2485 | * And when the page is SwapCache, it should take swap information | 2485 | * And when the page is SwapCache, it should take swap information |
2486 | * into account. This is under lock_page() now. | 2486 | * into account. This is under lock_page() now. |
2487 | */ | 2487 | */ |
2488 | if (!(gfp_mask & __GFP_WAIT)) { | 2488 | if (!(gfp_mask & __GFP_WAIT)) { |
2489 | struct page_cgroup *pc; | 2489 | struct page_cgroup *pc; |
2490 | 2490 | ||
2491 | pc = lookup_page_cgroup(page); | 2491 | pc = lookup_page_cgroup(page); |
2492 | if (!pc) | 2492 | if (!pc) |
2493 | return 0; | 2493 | return 0; |
2494 | lock_page_cgroup(pc); | 2494 | lock_page_cgroup(pc); |
2495 | if (PageCgroupUsed(pc)) { | 2495 | if (PageCgroupUsed(pc)) { |
2496 | unlock_page_cgroup(pc); | 2496 | unlock_page_cgroup(pc); |
2497 | return 0; | 2497 | return 0; |
2498 | } | 2498 | } |
2499 | unlock_page_cgroup(pc); | 2499 | unlock_page_cgroup(pc); |
2500 | } | 2500 | } |
2501 | 2501 | ||
2502 | if (unlikely(!mm)) | 2502 | if (unlikely(!mm)) |
2503 | mm = &init_mm; | 2503 | mm = &init_mm; |
2504 | 2504 | ||
2505 | if (page_is_file_cache(page)) { | 2505 | if (page_is_file_cache(page)) { |
2506 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); | 2506 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); |
2507 | if (ret || !mem) | 2507 | if (ret || !mem) |
2508 | return ret; | 2508 | return ret; |
2509 | 2509 | ||
2510 | /* | 2510 | /* |
2511 | * FUSE reuses pages without going through the final | 2511 | * FUSE reuses pages without going through the final |
2512 | * put that would remove them from the LRU list; make | 2512 | * put that would remove them from the LRU list; make |
2513 | * sure that they get relinked properly. | 2513 | * sure that they get relinked properly. |
2514 | */ | 2514 | */ |
2515 | __mem_cgroup_commit_charge_lrucare(page, mem, | 2515 | __mem_cgroup_commit_charge_lrucare(page, mem, |
2516 | MEM_CGROUP_CHARGE_TYPE_CACHE); | 2516 | MEM_CGROUP_CHARGE_TYPE_CACHE); |
2517 | return ret; | 2517 | return ret; |
2518 | } | 2518 | } |
2519 | /* shmem */ | 2519 | /* shmem */ |
2520 | if (PageSwapCache(page)) { | 2520 | if (PageSwapCache(page)) { |
2521 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); | 2521 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); |
2522 | if (!ret) | 2522 | if (!ret) |
2523 | __mem_cgroup_commit_charge_swapin(page, mem, | 2523 | __mem_cgroup_commit_charge_swapin(page, mem, |
2524 | MEM_CGROUP_CHARGE_TYPE_SHMEM); | 2524 | MEM_CGROUP_CHARGE_TYPE_SHMEM); |
2525 | } else | 2525 | } else |
2526 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, | 2526 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, |
2527 | MEM_CGROUP_CHARGE_TYPE_SHMEM); | 2527 | MEM_CGROUP_CHARGE_TYPE_SHMEM); |
2528 | 2528 | ||
2529 | return ret; | 2529 | return ret; |
2530 | } | 2530 | } |
2531 | 2531 | ||
2532 | /* | 2532 | /* |
2533 | * During swap-in (try_charge -> commit or cancel) the page is locked. | 2533 | * During swap-in (try_charge -> commit or cancel) the page is locked. |
2534 | * When try_charge() returns successfully, one reference to the memcg is | 2534 | * When try_charge() returns successfully, one reference to the memcg is |
2535 | * held without a struct page_cgroup. This reference will be consumed by | 2535 | * held without a struct page_cgroup. This reference will be consumed by |
2536 | * "commit()" or dropped by "cancel()". | 2536 | * "commit()" or dropped by "cancel()". |
2537 | */ | 2537 | */ |
2538 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 2538 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
2539 | struct page *page, | 2539 | struct page *page, |
2540 | gfp_t mask, struct mem_cgroup **ptr) | 2540 | gfp_t mask, struct mem_cgroup **ptr) |
2541 | { | 2541 | { |
2542 | struct mem_cgroup *mem; | 2542 | struct mem_cgroup *mem; |
2543 | int ret; | 2543 | int ret; |
2544 | 2544 | ||
2545 | *ptr = NULL; | 2545 | *ptr = NULL; |
2546 | 2546 | ||
2547 | if (mem_cgroup_disabled()) | 2547 | if (mem_cgroup_disabled()) |
2548 | return 0; | 2548 | return 0; |
2549 | 2549 | ||
2550 | if (!do_swap_account) | 2550 | if (!do_swap_account) |
2551 | goto charge_cur_mm; | 2551 | goto charge_cur_mm; |
2552 | /* | 2552 | /* |
2553 | * A racing thread's fault, or swapoff, may have already updated | 2553 | * A racing thread's fault, or swapoff, may have already updated |
2554 | * the pte, and even removed page from swap cache: in those cases | 2554 | * the pte, and even removed page from swap cache: in those cases |
2555 | * do_swap_page()'s pte_same() test will fail; but there's also a | 2555 | * do_swap_page()'s pte_same() test will fail; but there's also a |
2556 | * KSM case which does need to charge the page. | 2556 | * KSM case which does need to charge the page. |
2557 | */ | 2557 | */ |
2558 | if (!PageSwapCache(page)) | 2558 | if (!PageSwapCache(page)) |
2559 | goto charge_cur_mm; | 2559 | goto charge_cur_mm; |
2560 | mem = try_get_mem_cgroup_from_page(page); | 2560 | mem = try_get_mem_cgroup_from_page(page); |
2561 | if (!mem) | 2561 | if (!mem) |
2562 | goto charge_cur_mm; | 2562 | goto charge_cur_mm; |
2563 | *ptr = mem; | 2563 | *ptr = mem; |
2564 | ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); | 2564 | ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); |
2565 | css_put(&mem->css); | 2565 | css_put(&mem->css); |
2566 | return ret; | 2566 | return ret; |
2567 | charge_cur_mm: | 2567 | charge_cur_mm: |
2568 | if (unlikely(!mm)) | 2568 | if (unlikely(!mm)) |
2569 | mm = &init_mm; | 2569 | mm = &init_mm; |
2570 | return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); | 2570 | return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); |
2571 | } | 2571 | } |
2572 | 2572 | ||
2573 | static void | 2573 | static void |
2574 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | 2574 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, |
2575 | enum charge_type ctype) | 2575 | enum charge_type ctype) |
2576 | { | 2576 | { |
2577 | if (mem_cgroup_disabled()) | 2577 | if (mem_cgroup_disabled()) |
2578 | return; | 2578 | return; |
2579 | if (!ptr) | 2579 | if (!ptr) |
2580 | return; | 2580 | return; |
2581 | cgroup_exclude_rmdir(&ptr->css); | 2581 | cgroup_exclude_rmdir(&ptr->css); |
2582 | 2582 | ||
2583 | __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); | 2583 | __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); |
2584 | /* | 2584 | /* |
2585 | * Now the swap is in memory. This means the page may be | 2585 | * Now the swap is in memory. This means the page may be |
2586 | * counted both as mem and as swap, i.e. double counted. | 2586 | * counted both as mem and as swap, i.e. double counted. |
2587 | * Fix it by uncharging from memsw. Basically, this SwapCache is stable | 2587 | * Fix it by uncharging from memsw. Basically, this SwapCache is stable |
2588 | * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() | 2588 | * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() |
2589 | * may call delete_from_swap_cache() before we reach here. | 2589 | * may call delete_from_swap_cache() before we reach here. |
2590 | */ | 2590 | */ |
2591 | if (do_swap_account && PageSwapCache(page)) { | 2591 | if (do_swap_account && PageSwapCache(page)) { |
2592 | swp_entry_t ent = {.val = page_private(page)}; | 2592 | swp_entry_t ent = {.val = page_private(page)}; |
2593 | unsigned short id; | 2593 | unsigned short id; |
2594 | struct mem_cgroup *memcg; | 2594 | struct mem_cgroup *memcg; |
2595 | 2595 | ||
2596 | id = swap_cgroup_record(ent, 0); | 2596 | id = swap_cgroup_record(ent, 0); |
2597 | rcu_read_lock(); | 2597 | rcu_read_lock(); |
2598 | memcg = mem_cgroup_lookup(id); | 2598 | memcg = mem_cgroup_lookup(id); |
2599 | if (memcg) { | 2599 | if (memcg) { |
2600 | /* | 2600 | /* |
2601 | * This recorded memcg can be an obsolete one. So, avoid | 2601 | * This recorded memcg can be an obsolete one. So, avoid |
2602 | * calling css_tryget. | 2602 | * calling css_tryget. |
2603 | */ | 2603 | */ |
2604 | if (!mem_cgroup_is_root(memcg)) | 2604 | if (!mem_cgroup_is_root(memcg)) |
2605 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 2605 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); |
2606 | mem_cgroup_swap_statistics(memcg, false); | 2606 | mem_cgroup_swap_statistics(memcg, false); |
2607 | mem_cgroup_put(memcg); | 2607 | mem_cgroup_put(memcg); |
2608 | } | 2608 | } |
2609 | rcu_read_unlock(); | 2609 | rcu_read_unlock(); |
2610 | } | 2610 | } |
2611 | /* | 2611 | /* |
2612 | * At swapin, we may charge against a cgroup which has no tasks. | 2612 | * At swapin, we may charge against a cgroup which has no tasks. |
2613 | * So, rmdir()->pre_destroy() can be called while we do this charge. | 2613 | * So, rmdir()->pre_destroy() can be called while we do this charge. |
2614 | * In that case, we need to call pre_destroy() again. Check it here. | 2614 | * In that case, we need to call pre_destroy() again. Check it here. |
2615 | */ | 2615 | */ |
2616 | cgroup_release_and_wakeup_rmdir(&ptr->css); | 2616 | cgroup_release_and_wakeup_rmdir(&ptr->css); |
2617 | } | 2617 | } |
2618 | 2618 | ||
2619 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | 2619 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) |
2620 | { | 2620 | { |
2621 | __mem_cgroup_commit_charge_swapin(page, ptr, | 2621 | __mem_cgroup_commit_charge_swapin(page, ptr, |
2622 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2622 | MEM_CGROUP_CHARGE_TYPE_MAPPED); |
2623 | } | 2623 | } |
2624 | 2624 | ||
2625 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | 2625 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) |
2626 | { | 2626 | { |
2627 | if (mem_cgroup_disabled()) | 2627 | if (mem_cgroup_disabled()) |
2628 | return; | 2628 | return; |
2629 | if (!mem) | 2629 | if (!mem) |
2630 | return; | 2630 | return; |
2631 | __mem_cgroup_cancel_charge(mem, 1); | 2631 | __mem_cgroup_cancel_charge(mem, 1); |
2632 | } | 2632 | } |
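/*
 * A minimal usage sketch of the swap-in charge protocol above, roughly as
 * the page-fault path uses it (assumed caller-side code for illustration,
 * not part of mm/memcontrol.c; variable names are made up):
 *
 *	struct mem_cgroup *ptr;
 *
 *	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr))
 *		return VM_FAULT_OOM;		(charge failed, nothing to commit)
 *	... map the page while it is still locked ...
 *	if (mapped_ok)
 *		mem_cgroup_commit_charge_swapin(page, ptr);	(consumes the ref)
 *	else
 *		mem_cgroup_cancel_charge_swapin(ptr);		(drops the ref)
 */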
2633 | 2633 | ||
2634 | static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, | 2634 | static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, |
2635 | unsigned int nr_pages, | 2635 | unsigned int nr_pages, |
2636 | const enum charge_type ctype) | 2636 | const enum charge_type ctype) |
2637 | { | 2637 | { |
2638 | struct memcg_batch_info *batch = NULL; | 2638 | struct memcg_batch_info *batch = NULL; |
2639 | bool uncharge_memsw = true; | 2639 | bool uncharge_memsw = true; |
2640 | 2640 | ||
2641 | /* If swapout, usage of swap doesn't decrease */ | 2641 | /* If swapout, usage of swap doesn't decrease */ |
2642 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2642 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
2643 | uncharge_memsw = false; | 2643 | uncharge_memsw = false; |
2644 | 2644 | ||
2645 | batch = ¤t->memcg_batch; | 2645 | batch = ¤t->memcg_batch; |
2646 | /* | 2646 | /* |
2647 | * Normally we do css_get() when we remember a memcg pointer. | 2647 | * Normally we do css_get() when we remember a memcg pointer. |
2648 | * But in this case, we keep res->usage until the end of a series of | 2648 | * But in this case, we keep res->usage until the end of a series of |
2649 | * uncharges. Then it's OK to ignore the memcg's refcnt. | 2649 | * uncharges. Then it's OK to ignore the memcg's refcnt. |
2650 | */ | 2650 | */ |
2651 | if (!batch->memcg) | 2651 | if (!batch->memcg) |
2652 | batch->memcg = mem; | 2652 | batch->memcg = mem; |
2653 | /* | 2653 | /* |
2654 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | 2654 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. |
2655 | * In those cases, all pages freed continuously can be expected to be in | 2655 | * In those cases, all pages freed continuously can be expected to be in |
2656 | * the same cgroup and we have a chance to coalesce uncharges. | 2656 | * the same cgroup and we have a chance to coalesce uncharges. |
2657 | * But we uncharge one by one if this task is being OOM-killed (TIF_MEMDIE) | 2657 | * But we uncharge one by one if this task is being OOM-killed (TIF_MEMDIE) |
2658 | * because we want to do uncharge as soon as possible. | 2658 | * because we want to do uncharge as soon as possible. |
2659 | */ | 2659 | */ |
2660 | 2660 | ||
2661 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) | 2661 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) |
2662 | goto direct_uncharge; | 2662 | goto direct_uncharge; |
2663 | 2663 | ||
2664 | if (nr_pages > 1) | 2664 | if (nr_pages > 1) |
2665 | goto direct_uncharge; | 2665 | goto direct_uncharge; |
2666 | 2666 | ||
2667 | /* | 2667 | /* |
2668 | * In typical case, batch->memcg == mem. This means we can | 2668 | * In typical case, batch->memcg == mem. This means we can |
2669 | * merge a series of uncharges to an uncharge of res_counter. | 2669 | * merge a series of uncharges to an uncharge of res_counter. |
2670 | * If not, we uncharge the res_counter one by one. | 2670 | * If not, we uncharge the res_counter one by one. |
2671 | */ | 2671 | */ |
2672 | if (batch->memcg != mem) | 2672 | if (batch->memcg != mem) |
2673 | goto direct_uncharge; | 2673 | goto direct_uncharge; |
2674 | /* remember freed charge and uncharge it later */ | 2674 | /* remember freed charge and uncharge it later */ |
2675 | batch->nr_pages++; | 2675 | batch->nr_pages++; |
2676 | if (uncharge_memsw) | 2676 | if (uncharge_memsw) |
2677 | batch->memsw_nr_pages++; | 2677 | batch->memsw_nr_pages++; |
2678 | return; | 2678 | return; |
2679 | direct_uncharge: | 2679 | direct_uncharge: |
2680 | res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); | 2680 | res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); |
2681 | if (uncharge_memsw) | 2681 | if (uncharge_memsw) |
2682 | res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); | 2682 | res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); |
2683 | if (unlikely(batch->memcg != mem)) | 2683 | if (unlikely(batch->memcg != mem)) |
2684 | memcg_oom_recover(mem); | 2684 | memcg_oom_recover(mem); |
2685 | return; | 2685 | return; |
2686 | } | 2686 | } |
2687 | 2687 | ||
2688 | /* | 2688 | /* |
2689 | * uncharge if !page_mapped(page) | 2689 | * uncharge if !page_mapped(page) |
2690 | */ | 2690 | */ |
2691 | static struct mem_cgroup * | 2691 | static struct mem_cgroup * |
2692 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2692 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
2693 | { | 2693 | { |
2694 | struct mem_cgroup *mem = NULL; | 2694 | struct mem_cgroup *mem = NULL; |
2695 | unsigned int nr_pages = 1; | 2695 | unsigned int nr_pages = 1; |
2696 | struct page_cgroup *pc; | 2696 | struct page_cgroup *pc; |
2697 | 2697 | ||
2698 | if (mem_cgroup_disabled()) | 2698 | if (mem_cgroup_disabled()) |
2699 | return NULL; | 2699 | return NULL; |
2700 | 2700 | ||
2701 | if (PageSwapCache(page)) | 2701 | if (PageSwapCache(page)) |
2702 | return NULL; | 2702 | return NULL; |
2703 | 2703 | ||
2704 | if (PageTransHuge(page)) { | 2704 | if (PageTransHuge(page)) { |
2705 | nr_pages <<= compound_order(page); | 2705 | nr_pages <<= compound_order(page); |
2706 | VM_BUG_ON(!PageTransHuge(page)); | 2706 | VM_BUG_ON(!PageTransHuge(page)); |
2707 | } | 2707 | } |
2708 | /* | 2708 | /* |
2709 | * Check if our page_cgroup is valid | 2709 | * Check if our page_cgroup is valid |
2710 | */ | 2710 | */ |
2711 | pc = lookup_page_cgroup(page); | 2711 | pc = lookup_page_cgroup(page); |
2712 | if (unlikely(!pc || !PageCgroupUsed(pc))) | 2712 | if (unlikely(!pc || !PageCgroupUsed(pc))) |
2713 | return NULL; | 2713 | return NULL; |
2714 | 2714 | ||
2715 | lock_page_cgroup(pc); | 2715 | lock_page_cgroup(pc); |
2716 | 2716 | ||
2717 | mem = pc->mem_cgroup; | 2717 | mem = pc->mem_cgroup; |
2718 | 2718 | ||
2719 | if (!PageCgroupUsed(pc)) | 2719 | if (!PageCgroupUsed(pc)) |
2720 | goto unlock_out; | 2720 | goto unlock_out; |
2721 | 2721 | ||
2722 | switch (ctype) { | 2722 | switch (ctype) { |
2723 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 2723 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: |
2724 | case MEM_CGROUP_CHARGE_TYPE_DROP: | 2724 | case MEM_CGROUP_CHARGE_TYPE_DROP: |
2725 | /* See mem_cgroup_prepare_migration() */ | 2725 | /* See mem_cgroup_prepare_migration() */ |
2726 | if (page_mapped(page) || PageCgroupMigration(pc)) | 2726 | if (page_mapped(page) || PageCgroupMigration(pc)) |
2727 | goto unlock_out; | 2727 | goto unlock_out; |
2728 | break; | 2728 | break; |
2729 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: | 2729 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: |
2730 | if (!PageAnon(page)) { /* Shared memory */ | 2730 | if (!PageAnon(page)) { /* Shared memory */ |
2731 | if (page->mapping && !page_is_file_cache(page)) | 2731 | if (page->mapping && !page_is_file_cache(page)) |
2732 | goto unlock_out; | 2732 | goto unlock_out; |
2733 | } else if (page_mapped(page)) /* Anon */ | 2733 | } else if (page_mapped(page)) /* Anon */ |
2734 | goto unlock_out; | 2734 | goto unlock_out; |
2735 | break; | 2735 | break; |
2736 | default: | 2736 | default: |
2737 | break; | 2737 | break; |
2738 | } | 2738 | } |
2739 | 2739 | ||
2740 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); | 2740 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); |
2741 | 2741 | ||
2742 | ClearPageCgroupUsed(pc); | 2742 | ClearPageCgroupUsed(pc); |
2743 | /* | 2743 | /* |
2744 | * pc->mem_cgroup is not cleared here. It will be accessed when it's | 2744 | * pc->mem_cgroup is not cleared here. It will be accessed when it's |
2745 | * freed from LRU. This is safe because uncharged page is expected not | 2745 | * freed from LRU. This is safe because uncharged page is expected not |
2746 | * to be reused (freed soon). Exception is SwapCache, it's handled by | 2746 | * to be reused (freed soon). Exception is SwapCache, it's handled by |
2747 | * special functions. | 2747 | * special functions. |
2748 | */ | 2748 | */ |
2749 | 2749 | ||
2750 | unlock_page_cgroup(pc); | 2750 | unlock_page_cgroup(pc); |
2751 | /* | 2751 | /* |
2752 | * even after unlock, we have mem->res.usage here and this memcg | 2752 | * even after unlock, we have mem->res.usage here and this memcg |
2753 | * will never be freed. | 2753 | * will never be freed. |
2754 | */ | 2754 | */ |
2755 | memcg_check_events(mem, page); | 2755 | memcg_check_events(mem, page); |
2756 | if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { | 2756 | if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { |
2757 | mem_cgroup_swap_statistics(mem, true); | 2757 | mem_cgroup_swap_statistics(mem, true); |
2758 | mem_cgroup_get(mem); | 2758 | mem_cgroup_get(mem); |
2759 | } | 2759 | } |
2760 | if (!mem_cgroup_is_root(mem)) | 2760 | if (!mem_cgroup_is_root(mem)) |
2761 | mem_cgroup_do_uncharge(mem, nr_pages, ctype); | 2761 | mem_cgroup_do_uncharge(mem, nr_pages, ctype); |
2762 | 2762 | ||
2763 | return mem; | 2763 | return mem; |
2764 | 2764 | ||
2765 | unlock_out: | 2765 | unlock_out: |
2766 | unlock_page_cgroup(pc); | 2766 | unlock_page_cgroup(pc); |
2767 | return NULL; | 2767 | return NULL; |
2768 | } | 2768 | } |
2769 | 2769 | ||
2770 | void mem_cgroup_uncharge_page(struct page *page) | 2770 | void mem_cgroup_uncharge_page(struct page *page) |
2771 | { | 2771 | { |
2772 | /* early check. */ | 2772 | /* early check. */ |
2773 | if (page_mapped(page)) | 2773 | if (page_mapped(page)) |
2774 | return; | 2774 | return; |
2775 | if (page->mapping && !PageAnon(page)) | 2775 | if (page->mapping && !PageAnon(page)) |
2776 | return; | 2776 | return; |
2777 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2777 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); |
2778 | } | 2778 | } |
2779 | 2779 | ||
2780 | void mem_cgroup_uncharge_cache_page(struct page *page) | 2780 | void mem_cgroup_uncharge_cache_page(struct page *page) |
2781 | { | 2781 | { |
2782 | VM_BUG_ON(page_mapped(page)); | 2782 | VM_BUG_ON(page_mapped(page)); |
2783 | VM_BUG_ON(page->mapping); | 2783 | VM_BUG_ON(page->mapping); |
2784 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 2784 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
2785 | } | 2785 | } |
2786 | 2786 | ||
2787 | /* | 2787 | /* |
2788 | * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate. | 2788 | * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate. |
2789 | * In those cases, pages are freed continuously and we can expect them | 2789 | * In those cases, pages are freed continuously and we can expect them |
2790 | * to be in the same memcg. Each of these callers itself limits the number | 2790 | * to be in the same memcg. Each of these callers itself limits the number |
2791 | * of pages freed at once, so uncharge_start/end() is called properly. | 2791 | * of pages freed at once, so uncharge_start/end() is called properly. |
2792 | * These may be called more than once (nested) in one context. | 2792 | * These may be called more than once (nested) in one context. |
2793 | */ | 2793 | */ |
2794 | 2794 | ||
2795 | void mem_cgroup_uncharge_start(void) | 2795 | void mem_cgroup_uncharge_start(void) |
2796 | { | 2796 | { |
2797 | current->memcg_batch.do_batch++; | 2797 | current->memcg_batch.do_batch++; |
2798 | /* We can do nest. */ | 2798 | /* We can do nest. */ |
2799 | if (current->memcg_batch.do_batch == 1) { | 2799 | if (current->memcg_batch.do_batch == 1) { |
2800 | current->memcg_batch.memcg = NULL; | 2800 | current->memcg_batch.memcg = NULL; |
2801 | current->memcg_batch.nr_pages = 0; | 2801 | current->memcg_batch.nr_pages = 0; |
2802 | current->memcg_batch.memsw_nr_pages = 0; | 2802 | current->memcg_batch.memsw_nr_pages = 0; |
2803 | } | 2803 | } |
2804 | } | 2804 | } |
2805 | 2805 | ||
2806 | void mem_cgroup_uncharge_end(void) | 2806 | void mem_cgroup_uncharge_end(void) |
2807 | { | 2807 | { |
2808 | struct memcg_batch_info *batch = ¤t->memcg_batch; | 2808 | struct memcg_batch_info *batch = ¤t->memcg_batch; |
2809 | 2809 | ||
2810 | if (!batch->do_batch) | 2810 | if (!batch->do_batch) |
2811 | return; | 2811 | return; |
2812 | 2812 | ||
2813 | batch->do_batch--; | 2813 | batch->do_batch--; |
2814 | if (batch->do_batch) /* If stacked, do nothing. */ | 2814 | if (batch->do_batch) /* If stacked, do nothing. */ |
2815 | return; | 2815 | return; |
2816 | 2816 | ||
2817 | if (!batch->memcg) | 2817 | if (!batch->memcg) |
2818 | return; | 2818 | return; |
2819 | /* | 2819 | /* |
2820 | * This "batch->memcg" is valid without any css_get/put etc... | 2820 | * This "batch->memcg" is valid without any css_get/put etc... |
2821 | * because we hide charges behind us. | 2821 | * because we hide charges behind us. |
2822 | */ | 2822 | */ |
2823 | if (batch->nr_pages) | 2823 | if (batch->nr_pages) |
2824 | res_counter_uncharge(&batch->memcg->res, | 2824 | res_counter_uncharge(&batch->memcg->res, |
2825 | batch->nr_pages * PAGE_SIZE); | 2825 | batch->nr_pages * PAGE_SIZE); |
2826 | if (batch->memsw_nr_pages) | 2826 | if (batch->memsw_nr_pages) |
2827 | res_counter_uncharge(&batch->memcg->memsw, | 2827 | res_counter_uncharge(&batch->memcg->memsw, |
2828 | batch->memsw_nr_pages * PAGE_SIZE); | 2828 | batch->memsw_nr_pages * PAGE_SIZE); |
2829 | memcg_oom_recover(batch->memcg); | 2829 | memcg_oom_recover(batch->memcg); |
2830 | /* forget this pointer (for sanity check) */ | 2830 | /* forget this pointer (for sanity check) */ |
2831 | batch->memcg = NULL; | 2831 | batch->memcg = NULL; |
2832 | } | 2832 | } |
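/*
 * Usage sketch for the batching interface above (assumed caller-side code,
 * not part of mm/memcontrol.c; the list name is made up): callers that free
 * many pages in a row bracket the loop so that, when all pages belong to the
 * same memcg, the res_counter is touched once per batch instead of per page:
 *
 *	mem_cgroup_uncharge_start();
 *	list_for_each_entry_safe(page, next, &pages_to_free, lru)
 *		mem_cgroup_uncharge_cache_page(page);
 *	mem_cgroup_uncharge_end();
 */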
2833 | 2833 | ||
2834 | #ifdef CONFIG_SWAP | 2834 | #ifdef CONFIG_SWAP |
2835 | /* | 2835 | /* |
2836 | * called after __delete_from_swap_cache() and drop "page" account. | 2836 | * called after __delete_from_swap_cache() and drop "page" account. |
2837 | * memcg information is recorded to swap_cgroup of "ent" | 2837 | * memcg information is recorded to swap_cgroup of "ent" |
2838 | */ | 2838 | */ |
2839 | void | 2839 | void |
2840 | mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | 2840 | mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) |
2841 | { | 2841 | { |
2842 | struct mem_cgroup *memcg; | 2842 | struct mem_cgroup *memcg; |
2843 | int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; | 2843 | int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; |
2844 | 2844 | ||
2845 | if (!swapout) /* this was a swap cache but the swap is unused ! */ | 2845 | if (!swapout) /* this was a swap cache but the swap is unused ! */ |
2846 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; | 2846 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; |
2847 | 2847 | ||
2848 | memcg = __mem_cgroup_uncharge_common(page, ctype); | 2848 | memcg = __mem_cgroup_uncharge_common(page, ctype); |
2849 | 2849 | ||
2850 | /* | 2850 | /* |
2851 | * record memcg information, if swapout && memcg != NULL, | 2851 | * record memcg information, if swapout && memcg != NULL, |
2852 | * mem_cgroup_get() was called in uncharge(). | 2852 | * mem_cgroup_get() was called in uncharge(). |
2853 | */ | 2853 | */ |
2854 | if (do_swap_account && swapout && memcg) | 2854 | if (do_swap_account && swapout && memcg) |
2855 | swap_cgroup_record(ent, css_id(&memcg->css)); | 2855 | swap_cgroup_record(ent, css_id(&memcg->css)); |
2856 | } | 2856 | } |
2857 | #endif | 2857 | #endif |
2858 | 2858 | ||
2859 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 2859 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
2860 | /* | 2860 | /* |
2861 | * called from swap_entry_free(). remove record in swap_cgroup and | 2861 | * called from swap_entry_free(). remove record in swap_cgroup and |
2862 | * uncharge "memsw" account. | 2862 | * uncharge "memsw" account. |
2863 | */ | 2863 | */ |
2864 | void mem_cgroup_uncharge_swap(swp_entry_t ent) | 2864 | void mem_cgroup_uncharge_swap(swp_entry_t ent) |
2865 | { | 2865 | { |
2866 | struct mem_cgroup *memcg; | 2866 | struct mem_cgroup *memcg; |
2867 | unsigned short id; | 2867 | unsigned short id; |
2868 | 2868 | ||
2869 | if (!do_swap_account) | 2869 | if (!do_swap_account) |
2870 | return; | 2870 | return; |
2871 | 2871 | ||
2872 | id = swap_cgroup_record(ent, 0); | 2872 | id = swap_cgroup_record(ent, 0); |
2873 | rcu_read_lock(); | 2873 | rcu_read_lock(); |
2874 | memcg = mem_cgroup_lookup(id); | 2874 | memcg = mem_cgroup_lookup(id); |
2875 | if (memcg) { | 2875 | if (memcg) { |
2876 | /* | 2876 | /* |
2877 | * We uncharge this because swap is freed. | 2877 | * We uncharge this because swap is freed. |
2878 | * This memcg can be an obsolete one. We avoid calling css_tryget | 2878 | * This memcg can be an obsolete one. We avoid calling css_tryget |
2879 | */ | 2879 | */ |
2880 | if (!mem_cgroup_is_root(memcg)) | 2880 | if (!mem_cgroup_is_root(memcg)) |
2881 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 2881 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); |
2882 | mem_cgroup_swap_statistics(memcg, false); | 2882 | mem_cgroup_swap_statistics(memcg, false); |
2883 | mem_cgroup_put(memcg); | 2883 | mem_cgroup_put(memcg); |
2884 | } | 2884 | } |
2885 | rcu_read_unlock(); | 2885 | rcu_read_unlock(); |
2886 | } | 2886 | } |
2887 | 2887 | ||
2888 | /** | 2888 | /** |
2889 | * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. | 2889 | * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. |
2890 | * @entry: swap entry to be moved | 2890 | * @entry: swap entry to be moved |
2891 | * @from: mem_cgroup which the entry is moved from | 2891 | * @from: mem_cgroup which the entry is moved from |
2892 | * @to: mem_cgroup which the entry is moved to | 2892 | * @to: mem_cgroup which the entry is moved to |
2893 | * @need_fixup: whether we should fixup res_counters and refcounts. | 2893 | * @need_fixup: whether we should fixup res_counters and refcounts. |
2894 | * | 2894 | * |
2895 | * It succeeds only when the swap_cgroup's record for this entry is the same | 2895 | * It succeeds only when the swap_cgroup's record for this entry is the same |
2896 | * as the mem_cgroup's id of @from. | 2896 | * as the mem_cgroup's id of @from. |
2897 | * | 2897 | * |
2898 | * Returns 0 on success, -EINVAL on failure. | 2898 | * Returns 0 on success, -EINVAL on failure. |
2899 | * | 2899 | * |
2900 | * The caller must have charged to @to, IOW, called res_counter_charge() about | 2900 | * The caller must have charged to @to, IOW, called res_counter_charge() about |
2901 | * both res and memsw, and called css_get(). | 2901 | * both res and memsw, and called css_get(). |
2902 | */ | 2902 | */ |
2903 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | 2903 | static int mem_cgroup_move_swap_account(swp_entry_t entry, |
2904 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | 2904 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) |
2905 | { | 2905 | { |
2906 | unsigned short old_id, new_id; | 2906 | unsigned short old_id, new_id; |
2907 | 2907 | ||
2908 | old_id = css_id(&from->css); | 2908 | old_id = css_id(&from->css); |
2909 | new_id = css_id(&to->css); | 2909 | new_id = css_id(&to->css); |
2910 | 2910 | ||
2911 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | 2911 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { |
2912 | mem_cgroup_swap_statistics(from, false); | 2912 | mem_cgroup_swap_statistics(from, false); |
2913 | mem_cgroup_swap_statistics(to, true); | 2913 | mem_cgroup_swap_statistics(to, true); |
2914 | /* | 2914 | /* |
2915 | * This function is only called from task migration context now. | 2915 | * This function is only called from task migration context now. |
2916 | * It postpones res_counter and refcount handling till the end | 2916 | * It postpones res_counter and refcount handling till the end |
2917 | * of task migration(mem_cgroup_clear_mc()) for performance | 2917 | * of task migration(mem_cgroup_clear_mc()) for performance |
2918 | * improvement. But we cannot postpone mem_cgroup_get(to) | 2918 | * improvement. But we cannot postpone mem_cgroup_get(to) |
2919 | * because if the process that has been moved to @to does | 2919 | * because if the process that has been moved to @to does |
2920 | * swap-in, the refcount of @to might be decreased to 0. | 2920 | * swap-in, the refcount of @to might be decreased to 0. |
2921 | */ | 2921 | */ |
2922 | mem_cgroup_get(to); | 2922 | mem_cgroup_get(to); |
2923 | if (need_fixup) { | 2923 | if (need_fixup) { |
2924 | if (!mem_cgroup_is_root(from)) | 2924 | if (!mem_cgroup_is_root(from)) |
2925 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | 2925 | res_counter_uncharge(&from->memsw, PAGE_SIZE); |
2926 | mem_cgroup_put(from); | 2926 | mem_cgroup_put(from); |
2927 | /* | 2927 | /* |
2928 | * we charged both to->res and to->memsw, so we should | 2928 | * we charged both to->res and to->memsw, so we should |
2929 | * uncharge to->res. | 2929 | * uncharge to->res. |
2930 | */ | 2930 | */ |
2931 | if (!mem_cgroup_is_root(to)) | 2931 | if (!mem_cgroup_is_root(to)) |
2932 | res_counter_uncharge(&to->res, PAGE_SIZE); | 2932 | res_counter_uncharge(&to->res, PAGE_SIZE); |
2933 | } | 2933 | } |
2934 | return 0; | 2934 | return 0; |
2935 | } | 2935 | } |
2936 | return -EINVAL; | 2936 | return -EINVAL; |
2937 | } | 2937 | } |
2938 | #else | 2938 | #else |
2939 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | 2939 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, |
2940 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | 2940 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) |
2941 | { | 2941 | { |
2942 | return -EINVAL; | 2942 | return -EINVAL; |
2943 | } | 2943 | } |
2944 | #endif | 2944 | #endif |
2945 | 2945 | ||
2946 | /* | 2946 | /* |
2947 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old | 2947 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old |
2948 | * page belongs to. | 2948 | * page belongs to. |
2949 | */ | 2949 | */ |
2950 | int mem_cgroup_prepare_migration(struct page *page, | 2950 | int mem_cgroup_prepare_migration(struct page *page, |
2951 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) | 2951 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) |
2952 | { | 2952 | { |
2953 | struct mem_cgroup *mem = NULL; | 2953 | struct mem_cgroup *mem = NULL; |
2954 | struct page_cgroup *pc; | 2954 | struct page_cgroup *pc; |
2955 | enum charge_type ctype; | 2955 | enum charge_type ctype; |
2956 | int ret = 0; | 2956 | int ret = 0; |
2957 | 2957 | ||
2958 | *ptr = NULL; | 2958 | *ptr = NULL; |
2959 | 2959 | ||
2960 | VM_BUG_ON(PageTransHuge(page)); | 2960 | VM_BUG_ON(PageTransHuge(page)); |
2961 | if (mem_cgroup_disabled()) | 2961 | if (mem_cgroup_disabled()) |
2962 | return 0; | 2962 | return 0; |
2963 | 2963 | ||
2964 | pc = lookup_page_cgroup(page); | 2964 | pc = lookup_page_cgroup(page); |
2965 | lock_page_cgroup(pc); | 2965 | lock_page_cgroup(pc); |
2966 | if (PageCgroupUsed(pc)) { | 2966 | if (PageCgroupUsed(pc)) { |
2967 | mem = pc->mem_cgroup; | 2967 | mem = pc->mem_cgroup; |
2968 | css_get(&mem->css); | 2968 | css_get(&mem->css); |
2969 | /* | 2969 | /* |
2970 | * When migrating an anonymous page, its mapcount goes down | 2970 | * When migrating an anonymous page, its mapcount goes down |
2971 | * to 0 and uncharge() will be called. But, even if it's fully | 2971 | * to 0 and uncharge() will be called. But, even if it's fully |
2972 | * unmapped, migration may fail and this page has to be | 2972 | * unmapped, migration may fail and this page has to be |
2973 | * charged again. We set MIGRATION flag here and delay uncharge | 2973 | * charged again. We set MIGRATION flag here and delay uncharge |
2974 | * until end_migration() is called | 2974 | * until end_migration() is called |
2975 | * | 2975 | * |
2976 | * Corner Case Thinking | 2976 | * Corner Case Thinking |
2977 | * A) | 2977 | * A) |
2978 | * When the old page was mapped as Anon and it's unmap-and-freed | 2978 | * When the old page was mapped as Anon and it's unmap-and-freed |
2979 | * while migration was ongoing. | 2979 | * while migration was ongoing. |
2980 | * If unmap finds the old page, uncharge() of it will be delayed | 2980 | * If unmap finds the old page, uncharge() of it will be delayed |
2981 | * until end_migration(). If unmap finds a new page, it's | 2981 | * until end_migration(). If unmap finds a new page, it's |
2982 | * uncharged when its mapcount goes from 1 to 0. If unmap code | 2982 | * uncharged when its mapcount goes from 1 to 0. If unmap code |
2983 | * finds swap_migration_entry, the new page will not be mapped | 2983 | * finds swap_migration_entry, the new page will not be mapped |
2984 | * and end_migration() will find it(mapcount==0). | 2984 | * and end_migration() will find it(mapcount==0). |
2985 | * | 2985 | * |
2986 | * B) | 2986 | * B) |
2987 | * When the old page was mapped but migration fails, the kernel | 2987 | * When the old page was mapped but migration fails, the kernel |
2988 | * remaps it. A charge for it is kept by MIGRATION flag even | 2988 | * remaps it. A charge for it is kept by MIGRATION flag even |
2989 | * if mapcount goes down to 0. We can do remap successfully | 2989 | * if mapcount goes down to 0. We can do remap successfully |
2990 | * without charging it again. | 2990 | * without charging it again. |
2991 | * | 2991 | * |
2992 | * C) | 2992 | * C) |
2993 | * The "old" page is under lock_page() until the end of | 2993 | * The "old" page is under lock_page() until the end of |
2994 | * migration, so, the old page itself will not be swapped-out. | 2994 | * migration, so, the old page itself will not be swapped-out. |
2995 | * If the new page is swapped out before end_migration, our | 2995 | * If the new page is swapped out before end_migration, our |
2996 | * hook to usual swap-out path will catch the event. | 2996 | * hook to usual swap-out path will catch the event. |
2997 | */ | 2997 | */ |
2998 | if (PageAnon(page)) | 2998 | if (PageAnon(page)) |
2999 | SetPageCgroupMigration(pc); | 2999 | SetPageCgroupMigration(pc); |
3000 | } | 3000 | } |
3001 | unlock_page_cgroup(pc); | 3001 | unlock_page_cgroup(pc); |
3002 | /* | 3002 | /* |
3003 | * If the page is not charged at this point, | 3003 | * If the page is not charged at this point, |
3004 | * we return here. | 3004 | * we return here. |
3005 | */ | 3005 | */ |
3006 | if (!mem) | 3006 | if (!mem) |
3007 | return 0; | 3007 | return 0; |
3008 | 3008 | ||
3009 | *ptr = mem; | 3009 | *ptr = mem; |
3010 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); | 3010 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); |
3011 | css_put(&mem->css);/* drop extra refcnt */ | 3011 | css_put(&mem->css);/* drop extra refcnt */ |
3012 | if (ret || *ptr == NULL) { | 3012 | if (ret || *ptr == NULL) { |
3013 | if (PageAnon(page)) { | 3013 | if (PageAnon(page)) { |
3014 | lock_page_cgroup(pc); | 3014 | lock_page_cgroup(pc); |
3015 | ClearPageCgroupMigration(pc); | 3015 | ClearPageCgroupMigration(pc); |
3016 | unlock_page_cgroup(pc); | 3016 | unlock_page_cgroup(pc); |
3017 | /* | 3017 | /* |
3018 | * The old page may be fully unmapped while we kept it. | 3018 | * The old page may be fully unmapped while we kept it. |
3019 | */ | 3019 | */ |
3020 | mem_cgroup_uncharge_page(page); | 3020 | mem_cgroup_uncharge_page(page); |
3021 | } | 3021 | } |
3022 | return -ENOMEM; | 3022 | return -ENOMEM; |
3023 | } | 3023 | } |
3024 | /* | 3024 | /* |
3025 | * We charge the new page before it is used/mapped. So, even if unlock_page() | 3025 | * We charge the new page before it is used/mapped. So, even if unlock_page() |
3026 | * is called before end_migration, we can catch all events on this new | 3026 | * is called before end_migration, we can catch all events on this new |
3027 | * page. If the new page is migrated but not remapped, its | 3027 | * page. If the new page is migrated but not remapped, its |
3028 | * mapcount will finally be 0 and we call uncharge in end_migration(). | 3028 | * mapcount will finally be 0 and we call uncharge in end_migration(). |
3029 | */ | 3029 | */ |
3030 | pc = lookup_page_cgroup(newpage); | 3030 | pc = lookup_page_cgroup(newpage); |
3031 | if (PageAnon(page)) | 3031 | if (PageAnon(page)) |
3032 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | 3032 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; |
3033 | else if (page_is_file_cache(page)) | 3033 | else if (page_is_file_cache(page)) |
3034 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 3034 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3035 | else | 3035 | else |
3036 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3036 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
3037 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); | 3037 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); |
3038 | return ret; | 3038 | return ret; |
3039 | } | 3039 | } |
3040 | 3040 | ||
3041 | /* remove redundant charge if migration failed*/ | 3041 | /* remove redundant charge if migration failed*/ |
3042 | void mem_cgroup_end_migration(struct mem_cgroup *mem, | 3042 | void mem_cgroup_end_migration(struct mem_cgroup *mem, |
3043 | struct page *oldpage, struct page *newpage, bool migration_ok) | 3043 | struct page *oldpage, struct page *newpage, bool migration_ok) |
3044 | { | 3044 | { |
3045 | struct page *used, *unused; | 3045 | struct page *used, *unused; |
3046 | struct page_cgroup *pc; | 3046 | struct page_cgroup *pc; |
3047 | 3047 | ||
3048 | if (!mem) | 3048 | if (!mem) |
3049 | return; | 3049 | return; |
3050 | /* blocks rmdir() */ | 3050 | /* blocks rmdir() */ |
3051 | cgroup_exclude_rmdir(&mem->css); | 3051 | cgroup_exclude_rmdir(&mem->css); |
3052 | if (!migration_ok) { | 3052 | if (!migration_ok) { |
3053 | used = oldpage; | 3053 | used = oldpage; |
3054 | unused = newpage; | 3054 | unused = newpage; |
3055 | } else { | 3055 | } else { |
3056 | used = newpage; | 3056 | used = newpage; |
3057 | unused = oldpage; | 3057 | unused = oldpage; |
3058 | } | 3058 | } |
3059 | /* | 3059 | /* |
3060 | * We disallowed uncharge of pages under migration because the mapcount | 3060 | * We disallowed uncharge of pages under migration because the mapcount |
3061 | * of the page goes down to zero, temporarily. | 3061 | * of the page goes down to zero, temporarily. |
3062 | * Clear the flag and check whether the page should still be charged. | 3062 | * Clear the flag and check whether the page should still be charged. |
3063 | */ | 3063 | */ |
3064 | pc = lookup_page_cgroup(oldpage); | 3064 | pc = lookup_page_cgroup(oldpage); |
3065 | lock_page_cgroup(pc); | 3065 | lock_page_cgroup(pc); |
3066 | ClearPageCgroupMigration(pc); | 3066 | ClearPageCgroupMigration(pc); |
3067 | unlock_page_cgroup(pc); | 3067 | unlock_page_cgroup(pc); |
3068 | 3068 | ||
3069 | __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); | 3069 | __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); |
3070 | 3070 | ||
3071 | /* | 3071 | /* |
3072 | * If the page is file cache, the radix-tree replacement is atomic | 3072 | * If the page is file cache, the radix-tree replacement is atomic |
3073 | * and we can skip this check. When it was an Anon page, its mapcount | 3073 | * and we can skip this check. When it was an Anon page, its mapcount |
3074 | * goes down to 0. But because we added the MIGRATION flag, it's not | 3074 | * goes down to 0. But because we added the MIGRATION flag, it's not |
3075 | * uncharged yet. There are several cases, but the page->mapcount check | 3075 | * uncharged yet. There are several cases, but the page->mapcount check |
3076 | * and the USED bit check in mem_cgroup_uncharge_page() will do enough | 3076 | * and the USED bit check in mem_cgroup_uncharge_page() will do enough |
3077 | * checking. (see prepare_charge() also) | 3077 | * checking. (see prepare_charge() also) |
3078 | */ | 3078 | */ |
3079 | if (PageAnon(used)) | 3079 | if (PageAnon(used)) |
3080 | mem_cgroup_uncharge_page(used); | 3080 | mem_cgroup_uncharge_page(used); |
3081 | /* | 3081 | /* |
3082 | * At migration, we may charge against a cgroup which has no | 3082 | * At migration, we may charge against a cgroup which has no |
3083 | * tasks. | 3083 | * tasks. |
3084 | * So, rmdir()->pre_destroy() can be called while we do this charge. | 3084 | * So, rmdir()->pre_destroy() can be called while we do this charge. |
3085 | * In that case, we need to call pre_destroy() again. Check it here. | 3085 | * In that case, we need to call pre_destroy() again. Check it here. |
3086 | */ | 3086 | */ |
3087 | cgroup_release_and_wakeup_rmdir(&mem->css); | 3087 | cgroup_release_and_wakeup_rmdir(&mem->css); |
3088 | } | 3088 | } |
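/*
 * Sketch of how the migration path is expected to pair the two functions
 * above (assumed caller-side code for illustration, not part of
 * mm/memcontrol.c; move_page_contents() is a made-up placeholder):
 *
 *	struct mem_cgroup *mem;
 *	int rc;
 *
 *	rc = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
 *	if (rc)
 *		return rc;			(old page stays charged)
 *	rc = move_page_contents(page, newpage);
 *	mem_cgroup_end_migration(mem, page, newpage, rc == 0);
 */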
3089 | 3089 | ||
3090 | /* | 3090 | /* |
3091 | * A call to try to shrink memory usage on charge failure at shmem's swapin. | 3091 | * A call to try to shrink memory usage on charge failure at shmem's swapin. |
3092 | * Calling hierarchical_reclaim is not enough because we should update | 3092 | * Calling hierarchical_reclaim is not enough because we should update |
3093 | * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. | 3093 | * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. |
3094 | * Moreover considering hierarchy, we should reclaim from the mem_over_limit, | 3094 | * Moreover considering hierarchy, we should reclaim from the mem_over_limit, |
3095 | * not from the memcg which this page would be charged to. | 3095 | * not from the memcg which this page would be charged to. |
3096 | * try_charge_swapin does all of this work properly. | 3096 | * try_charge_swapin does all of this work properly. |
3097 | */ | 3097 | */ |
3098 | int mem_cgroup_shmem_charge_fallback(struct page *page, | 3098 | int mem_cgroup_shmem_charge_fallback(struct page *page, |
3099 | struct mm_struct *mm, | 3099 | struct mm_struct *mm, |
3100 | gfp_t gfp_mask) | 3100 | gfp_t gfp_mask) |
3101 | { | 3101 | { |
3102 | struct mem_cgroup *mem; | 3102 | struct mem_cgroup *mem; |
3103 | int ret; | 3103 | int ret; |
3104 | 3104 | ||
3105 | if (mem_cgroup_disabled()) | 3105 | if (mem_cgroup_disabled()) |
3106 | return 0; | 3106 | return 0; |
3107 | 3107 | ||
3108 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); | 3108 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); |
3109 | if (!ret) | 3109 | if (!ret) |
3110 | mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ | 3110 | mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ |
3111 | 3111 | ||
3112 | return ret; | 3112 | return ret; |
3113 | } | 3113 | } |
3114 | 3114 | ||
3115 | #ifdef CONFIG_DEBUG_VM | 3115 | #ifdef CONFIG_DEBUG_VM |
3116 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) | 3116 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) |
3117 | { | 3117 | { |
3118 | struct page_cgroup *pc; | 3118 | struct page_cgroup *pc; |
3119 | 3119 | ||
3120 | pc = lookup_page_cgroup(page); | 3120 | pc = lookup_page_cgroup(page); |
3121 | if (likely(pc) && PageCgroupUsed(pc)) | 3121 | if (likely(pc) && PageCgroupUsed(pc)) |
3122 | return pc; | 3122 | return pc; |
3123 | return NULL; | 3123 | return NULL; |
3124 | } | 3124 | } |
3125 | 3125 | ||
3126 | bool mem_cgroup_bad_page_check(struct page *page) | 3126 | bool mem_cgroup_bad_page_check(struct page *page) |
3127 | { | 3127 | { |
3128 | if (mem_cgroup_disabled()) | 3128 | if (mem_cgroup_disabled()) |
3129 | return false; | 3129 | return false; |
3130 | 3130 | ||
3131 | return lookup_page_cgroup_used(page) != NULL; | 3131 | return lookup_page_cgroup_used(page) != NULL; |
3132 | } | 3132 | } |
3133 | 3133 | ||
3134 | void mem_cgroup_print_bad_page(struct page *page) | 3134 | void mem_cgroup_print_bad_page(struct page *page) |
3135 | { | 3135 | { |
3136 | struct page_cgroup *pc; | 3136 | struct page_cgroup *pc; |
3137 | 3137 | ||
3138 | pc = lookup_page_cgroup_used(page); | 3138 | pc = lookup_page_cgroup_used(page); |
3139 | if (pc) { | 3139 | if (pc) { |
3140 | int ret = -1; | 3140 | int ret = -1; |
3141 | char *path; | 3141 | char *path; |
3142 | 3142 | ||
3143 | printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", | 3143 | printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", |
3144 | pc, pc->flags, pc->mem_cgroup); | 3144 | pc, pc->flags, pc->mem_cgroup); |
3145 | 3145 | ||
3146 | path = kmalloc(PATH_MAX, GFP_KERNEL); | 3146 | path = kmalloc(PATH_MAX, GFP_KERNEL); |
3147 | if (path) { | 3147 | if (path) { |
3148 | rcu_read_lock(); | 3148 | rcu_read_lock(); |
3149 | ret = cgroup_path(pc->mem_cgroup->css.cgroup, | 3149 | ret = cgroup_path(pc->mem_cgroup->css.cgroup, |
3150 | path, PATH_MAX); | 3150 | path, PATH_MAX); |
3151 | rcu_read_unlock(); | 3151 | rcu_read_unlock(); |
3152 | } | 3152 | } |
3153 | 3153 | ||
3154 | printk(KERN_CONT "(%s)\n", | 3154 | printk(KERN_CONT "(%s)\n", |
3155 | (ret < 0) ? "cannot get the path" : path); | 3155 | (ret < 0) ? "cannot get the path" : path); |
3156 | kfree(path); | 3156 | kfree(path); |
3157 | } | 3157 | } |
3158 | } | 3158 | } |
3159 | #endif | 3159 | #endif |
3160 | 3160 | ||
3161 | static DEFINE_MUTEX(set_limit_mutex); | 3161 | static DEFINE_MUTEX(set_limit_mutex); |
3162 | 3162 | ||
3163 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | 3163 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
3164 | unsigned long long val) | 3164 | unsigned long long val) |
3165 | { | 3165 | { |
3166 | int retry_count; | 3166 | int retry_count; |
3167 | u64 memswlimit, memlimit; | 3167 | u64 memswlimit, memlimit; |
3168 | int ret = 0; | 3168 | int ret = 0; |
3169 | int children = mem_cgroup_count_children(memcg); | 3169 | int children = mem_cgroup_count_children(memcg); |
3170 | u64 curusage, oldusage; | 3170 | u64 curusage, oldusage; |
3171 | int enlarge; | 3171 | int enlarge; |
3172 | 3172 | ||
3173 | /* | 3173 | /* |
3174 | * To keep hierarchical_reclaim simple, how long we should retry | 3174 | * To keep hierarchical_reclaim simple, how long we should retry |
3175 | * depends on the caller. We set our retry-count to be a function | 3175 | * depends on the caller. We set our retry-count to be a function |
3176 | * of the number of children which we should visit in this loop. | 3176 | * of the number of children which we should visit in this loop. |
3177 | */ | 3177 | */ |
3178 | retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; | 3178 | retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; |
3179 | 3179 | ||
3180 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 3180 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
3181 | 3181 | ||
3182 | enlarge = 0; | 3182 | enlarge = 0; |
3183 | while (retry_count) { | 3183 | while (retry_count) { |
3184 | if (signal_pending(current)) { | 3184 | if (signal_pending(current)) { |
3185 | ret = -EINTR; | 3185 | ret = -EINTR; |
3186 | break; | 3186 | break; |
3187 | } | 3187 | } |
3188 | /* | 3188 | /* |
3189 | * Rather than hide all this in some function, I do it here in an | 3189 | * Rather than hide all this in some function, I do it here in an |
3190 | * open-coded manner so you can see what it really does. | 3190 | * open-coded manner so you can see what it really does. |
3191 | * We have to guarantee mem->res.limit < mem->memsw.limit. | 3191 | * We have to guarantee mem->res.limit < mem->memsw.limit. |
3192 | */ | 3192 | */ |
3193 | mutex_lock(&set_limit_mutex); | 3193 | mutex_lock(&set_limit_mutex); |
3194 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3194 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
3195 | if (memswlimit < val) { | 3195 | if (memswlimit < val) { |
3196 | ret = -EINVAL; | 3196 | ret = -EINVAL; |
3197 | mutex_unlock(&set_limit_mutex); | 3197 | mutex_unlock(&set_limit_mutex); |
3198 | break; | 3198 | break; |
3199 | } | 3199 | } |
3200 | 3200 | ||
3201 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3201 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
3202 | if (memlimit < val) | 3202 | if (memlimit < val) |
3203 | enlarge = 1; | 3203 | enlarge = 1; |
3204 | 3204 | ||
3205 | ret = res_counter_set_limit(&memcg->res, val); | 3205 | ret = res_counter_set_limit(&memcg->res, val); |
3206 | if (!ret) { | 3206 | if (!ret) { |
3207 | if (memswlimit == val) | 3207 | if (memswlimit == val) |
3208 | memcg->memsw_is_minimum = true; | 3208 | memcg->memsw_is_minimum = true; |
3209 | else | 3209 | else |
3210 | memcg->memsw_is_minimum = false; | 3210 | memcg->memsw_is_minimum = false; |
3211 | } | 3211 | } |
3212 | mutex_unlock(&set_limit_mutex); | 3212 | mutex_unlock(&set_limit_mutex); |
3213 | 3213 | ||
3214 | if (!ret) | 3214 | if (!ret) |
3215 | break; | 3215 | break; |
3216 | 3216 | ||
3217 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, | 3217 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
3218 | MEM_CGROUP_RECLAIM_SHRINK, | 3218 | MEM_CGROUP_RECLAIM_SHRINK, |
3219 | NULL); | 3219 | NULL); |
3220 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 3220 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
3221 | /* Usage is reduced ? */ | 3221 | /* Usage is reduced ? */ |
3222 | if (curusage >= oldusage) | 3222 | if (curusage >= oldusage) |
3223 | retry_count--; | 3223 | retry_count--; |
3224 | else | 3224 | else |
3225 | oldusage = curusage; | 3225 | oldusage = curusage; |
3226 | } | 3226 | } |
3227 | if (!ret && enlarge) | 3227 | if (!ret && enlarge) |
3228 | memcg_oom_recover(memcg); | 3228 | memcg_oom_recover(memcg); |
3229 | 3229 | ||
3230 | return ret; | 3230 | return ret; |
3231 | } | 3231 | } |
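/*
 * Note on the two resize helpers (annotation, assuming the usual cgroupfs
 * entry points): mem_cgroup_resize_limit() above and
 * mem_cgroup_resize_memsw_limit() below are reached when userspace writes
 * memory.limit_in_bytes or memory.memsw.limit_in_bytes. Both loop the same
 * way: take set_limit_mutex, refuse values that would break
 * res.limit <= memsw.limit, try to set the limit, and if it is still below
 * current usage run hierarchical reclaim and retry until usage stops
 * shrinking or the retry budget (children * MEM_CGROUP_RECLAIM_RETRIES)
 * runs out.
 */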
3232 | 3232 | ||
3233 | static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | 3233 | static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, |
3234 | unsigned long long val) | 3234 | unsigned long long val) |
3235 | { | 3235 | { |
3236 | int retry_count; | 3236 | int retry_count; |
3237 | u64 memlimit, memswlimit, oldusage, curusage; | 3237 | u64 memlimit, memswlimit, oldusage, curusage; |
3238 | int children = mem_cgroup_count_children(memcg); | 3238 | int children = mem_cgroup_count_children(memcg); |
3239 | int ret = -EBUSY; | 3239 | int ret = -EBUSY; |
3240 | int enlarge = 0; | 3240 | int enlarge = 0; |
3241 | 3241 | ||
3242 | /* see mem_cgroup_resize_res_limit */ | 3242 | /* see mem_cgroup_resize_res_limit */ |
3243 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | 3243 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; |
3244 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3244 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
3245 | while (retry_count) { | 3245 | while (retry_count) { |
3246 | if (signal_pending(current)) { | 3246 | if (signal_pending(current)) { |
3247 | ret = -EINTR; | 3247 | ret = -EINTR; |
3248 | break; | 3248 | break; |
3249 | } | 3249 | } |
3250 | /* | 3250 | /* |
3251 | * Rather than hide all this in some function, I do it here in an | 3251 | * Rather than hide all this in some function, I do it here in an |
3252 | * open-coded manner so you can see what it really does. | 3252 | * open-coded manner so you can see what it really does. |
3253 | * We have to guarantee mem->res.limit < mem->memsw.limit. | 3253 | * We have to guarantee mem->res.limit < mem->memsw.limit. |
3254 | */ | 3254 | */ |
3255 | mutex_lock(&set_limit_mutex); | 3255 | mutex_lock(&set_limit_mutex); |
3256 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3256 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
3257 | if (memlimit > val) { | 3257 | if (memlimit > val) { |
3258 | ret = -EINVAL; | 3258 | ret = -EINVAL; |
3259 | mutex_unlock(&set_limit_mutex); | 3259 | mutex_unlock(&set_limit_mutex); |
3260 | break; | 3260 | break; |
3261 | } | 3261 | } |
3262 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3262 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
3263 | if (memswlimit < val) | 3263 | if (memswlimit < val) |
3264 | enlarge = 1; | 3264 | enlarge = 1; |
3265 | ret = res_counter_set_limit(&memcg->memsw, val); | 3265 | ret = res_counter_set_limit(&memcg->memsw, val); |
3266 | if (!ret) { | 3266 | if (!ret) { |
3267 | if (memlimit == val) | 3267 | if (memlimit == val) |
3268 | memcg->memsw_is_minimum = true; | 3268 | memcg->memsw_is_minimum = true; |
3269 | else | 3269 | else |
3270 | memcg->memsw_is_minimum = false; | 3270 | memcg->memsw_is_minimum = false; |
3271 | } | 3271 | } |
3272 | mutex_unlock(&set_limit_mutex); | 3272 | mutex_unlock(&set_limit_mutex); |
3273 | 3273 | ||
3274 | if (!ret) | 3274 | if (!ret) |
3275 | break; | 3275 | break; |
3276 | 3276 | ||
3277 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, | 3277 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
3278 | MEM_CGROUP_RECLAIM_NOSWAP | | 3278 | MEM_CGROUP_RECLAIM_NOSWAP | |
3279 | MEM_CGROUP_RECLAIM_SHRINK, | 3279 | MEM_CGROUP_RECLAIM_SHRINK, |
3280 | NULL); | 3280 | NULL); |
3281 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3281 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
3282 | /* Usage is reduced ? */ | 3282 | /* Usage is reduced ? */ |
3283 | if (curusage >= oldusage) | 3283 | if (curusage >= oldusage) |
3284 | retry_count--; | 3284 | retry_count--; |
3285 | else | 3285 | else |
3286 | oldusage = curusage; | 3286 | oldusage = curusage; |
3287 | } | 3287 | } |
3288 | if (!ret && enlarge) | 3288 | if (!ret && enlarge) |
3289 | memcg_oom_recover(memcg); | 3289 | memcg_oom_recover(memcg); |
3290 | return ret; | 3290 | return ret; |
3291 | } | 3291 | } |
3292 | 3292 | ||
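The function above is the memsw counterpart of mem_cgroup_resize_limit(): apply the new limit and, as long as the counter refuses it, reclaim and retry a bounded number of times, giving up with -EBUSY once the retries are used up without progress. A minimal standalone sketch of that bounded-retry pattern, with read_usage() and reclaim_some() as hypothetical stand-ins for the res_counter read and the hierarchical reclaim, might look like this:

#include <stdio.h>

#define RECLAIM_RETRIES 5

/* Hypothetical stand-ins for res_counter_read_u64() and the reclaim path. */
static unsigned long long usage = 900;          /* current charge */
static unsigned long long read_usage(void) { return usage; }
static void reclaim_some(void) { if (usage > 50) usage -= 50; }

/* Shrink *limit toward val with a bounded number of reclaim retries. */
static int shrink_limit(unsigned long long *limit, unsigned long long val)
{
        int retries = RECLAIM_RETRIES;
        unsigned long long old = read_usage(), cur;

        while (retries) {
                if (read_usage() <= val) {      /* fits: commit the new limit */
                        *limit = val;
                        return 0;
                }
                reclaim_some();                 /* try to push usage down */
                cur = read_usage();
                if (cur >= old)                 /* no progress: burn a retry */
                        retries--;
                else
                        old = cur;
        }
        return -1;                              /* -EBUSY in the kernel code */
}

int main(void)
{
        unsigned long long limit = 1000;

        printf("shrink to 800 -> %d, usage now %llu\n",
               shrink_limit(&limit, 800), read_usage());
        return 0;
}

As in the kernel code, a retry is consumed only when reclaim makes no progress, so a slow but steady shrink is not cut off early.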
3293 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | 3293 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, |
3294 | gfp_t gfp_mask, | 3294 | gfp_t gfp_mask, |
3295 | unsigned long *total_scanned) | 3295 | unsigned long *total_scanned) |
3296 | { | 3296 | { |
3297 | unsigned long nr_reclaimed = 0; | 3297 | unsigned long nr_reclaimed = 0; |
3298 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | 3298 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; |
3299 | unsigned long reclaimed; | 3299 | unsigned long reclaimed; |
3300 | int loop = 0; | 3300 | int loop = 0; |
3301 | struct mem_cgroup_tree_per_zone *mctz; | 3301 | struct mem_cgroup_tree_per_zone *mctz; |
3302 | unsigned long long excess; | 3302 | unsigned long long excess; |
3303 | unsigned long nr_scanned; | 3303 | unsigned long nr_scanned; |
3304 | 3304 | ||
3305 | if (order > 0) | 3305 | if (order > 0) |
3306 | return 0; | 3306 | return 0; |
3307 | 3307 | ||
3308 | mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); | 3308 | mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); |
3309 | /* | 3309 | /* |
3310 | * This loop can run for a while, especially if mem_cgroups continuously | 3310 | * This loop can run for a while, especially if mem_cgroups continuously |
3311 | * keep exceeding their soft limit and putting the system under | 3311 | * keep exceeding their soft limit and putting the system under |
3312 | * pressure. | 3312 | * pressure. |
3313 | */ | 3313 | */ |
3314 | do { | 3314 | do { |
3315 | if (next_mz) | 3315 | if (next_mz) |
3316 | mz = next_mz; | 3316 | mz = next_mz; |
3317 | else | 3317 | else |
3318 | mz = mem_cgroup_largest_soft_limit_node(mctz); | 3318 | mz = mem_cgroup_largest_soft_limit_node(mctz); |
3319 | if (!mz) | 3319 | if (!mz) |
3320 | break; | 3320 | break; |
3321 | 3321 | ||
3322 | nr_scanned = 0; | 3322 | nr_scanned = 0; |
3323 | reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, | 3323 | reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, |
3324 | gfp_mask, | 3324 | gfp_mask, |
3325 | MEM_CGROUP_RECLAIM_SOFT, | 3325 | MEM_CGROUP_RECLAIM_SOFT, |
3326 | &nr_scanned); | 3326 | &nr_scanned); |
3327 | nr_reclaimed += reclaimed; | 3327 | nr_reclaimed += reclaimed; |
3328 | *total_scanned += nr_scanned; | 3328 | *total_scanned += nr_scanned; |
3329 | spin_lock(&mctz->lock); | 3329 | spin_lock(&mctz->lock); |
3330 | 3330 | ||
3331 | /* | 3331 | /* |
3332 | * If we failed to reclaim anything from this memory cgroup | 3332 | * If we failed to reclaim anything from this memory cgroup |
3333 | * it is time to move on to the next cgroup | 3333 | * it is time to move on to the next cgroup |
3334 | */ | 3334 | */ |
3335 | next_mz = NULL; | 3335 | next_mz = NULL; |
3336 | if (!reclaimed) { | 3336 | if (!reclaimed) { |
3337 | do { | 3337 | do { |
3338 | /* | 3338 | /* |
3339 | * Loop until we find yet another one. | 3339 | * Loop until we find yet another one. |
3340 | * | 3340 | * |
3341 | * By the time we get the soft_limit lock | 3341 | * By the time we get the soft_limit lock |
3342 | * again, someone might have added the | 3342 | * again, someone might have added the |
3343 | * group back on the RB tree. Iterate to | 3343 | * group back on the RB tree. Iterate to |
3344 | * make sure we get a different mem. | 3344 | * make sure we get a different mem. |
3345 | * mem_cgroup_largest_soft_limit_node returns | 3345 | * mem_cgroup_largest_soft_limit_node returns |
3346 | * NULL if no other cgroup is present on | 3346 | * NULL if no other cgroup is present on |
3347 | * the tree | 3347 | * the tree |
3348 | */ | 3348 | */ |
3349 | next_mz = | 3349 | next_mz = |
3350 | __mem_cgroup_largest_soft_limit_node(mctz); | 3350 | __mem_cgroup_largest_soft_limit_node(mctz); |
3351 | if (next_mz == mz) { | 3351 | if (next_mz == mz) |
3352 | css_put(&next_mz->mem->css); | 3352 | css_put(&next_mz->mem->css); |
3353 | next_mz = NULL; | 3353 | else /* next_mz == NULL or other memcg */ |
3354 | } else /* next_mz == NULL or other memcg */ | ||
3355 | break; | 3354 | break; |
3356 | } while (1); | 3355 | } while (1); |
3357 | } | 3356 | } |
3358 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | 3357 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); |
3359 | excess = res_counter_soft_limit_excess(&mz->mem->res); | 3358 | excess = res_counter_soft_limit_excess(&mz->mem->res); |
3360 | /* | 3359 | /* |
3361 | * One school of thought says that we should not add | 3360 | * One school of thought says that we should not add |
3362 | * back the node to the tree if reclaim returns 0. | 3361 | * back the node to the tree if reclaim returns 0. |
3363 | * But our reclaim could return 0, simply because due | 3362 | * But our reclaim could return 0, simply because due |
3364 | * to priority we are exposing a smaller subset of | 3363 | * to priority we are exposing a smaller subset of |
3365 | * memory to reclaim from. Consider this as a longer | 3364 | * memory to reclaim from. Consider this as a longer |
3366 | * term TODO. | 3365 | * term TODO. |
3367 | */ | 3366 | */ |
3368 | /* If excess == 0, no tree ops */ | 3367 | /* If excess == 0, no tree ops */ |
3369 | __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); | 3368 | __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); |
3370 | spin_unlock(&mctz->lock); | 3369 | spin_unlock(&mctz->lock); |
3371 | css_put(&mz->mem->css); | 3370 | css_put(&mz->mem->css); |
3372 | loop++; | 3371 | loop++; |
3373 | /* | 3372 | /* |
3374 | * Could not reclaim anything and there are no more | 3373 | * Could not reclaim anything and there are no more |
3375 | * mem cgroups to try or we seem to be looping without | 3374 | * mem cgroups to try or we seem to be looping without |
3376 | * reclaiming anything. | 3375 | * reclaiming anything. |
3377 | */ | 3376 | */ |
3378 | if (!nr_reclaimed && | 3377 | if (!nr_reclaimed && |
3379 | (next_mz == NULL || | 3378 | (next_mz == NULL || |
3380 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | 3379 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) |
3381 | break; | 3380 | break; |
3382 | } while (!nr_reclaimed); | 3381 | } while (!nr_reclaimed); |
3383 | if (next_mz) | 3382 | if (next_mz) |
3384 | css_put(&next_mz->mem->css); | 3383 | css_put(&next_mz->mem->css); |
3385 | return nr_reclaimed; | 3384 | return nr_reclaimed; |
3386 | } | 3385 | } |
3387 | 3386 | ||
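mem_cgroup_soft_limit_reclaim() above always picks the group whose usage exceeds its soft limit by the largest amount, reclaims from it, and re-inserts it with its updated excess. A toy standalone sketch of that largest-excess-first loop over a plain array (the kernel keeps the groups on a per-zone red-black tree and takes css references instead; the struct and helper names here are made up for illustration):

#include <stdio.h>

struct group {
        const char *name;
        unsigned long usage;
        unsigned long soft_limit;
};

static unsigned long excess(const struct group *g)
{
        return g->usage > g->soft_limit ? g->usage - g->soft_limit : 0;
}

/* Pick the group with the largest soft-limit excess, NULL if none. */
static struct group *largest_excess(struct group *g, int n)
{
        struct group *best = NULL;
        int i;

        for (i = 0; i < n; i++)
                if (excess(&g[i]) && (!best || excess(&g[i]) > excess(best)))
                        best = &g[i];
        return best;
}

int main(void)
{
        struct group groups[] = {
                { "a", 300, 100 }, { "b", 250, 200 }, { "c", 90, 100 },
        };
        struct group *victim;
        int loops = 0;

        /* Reclaim 60 "pages" at a time from the worst offender. */
        while ((victim = largest_excess(groups, 3)) && loops++ < 16) {
                victim->usage -= (victim->usage >= 60) ? 60 : victim->usage;
                printf("reclaimed from %s, usage now %lu\n",
                       victim->name, victim->usage);
        }
        return 0;
}

Ordering the selection by excess means a single badly behaving group absorbs most of the soft-limit reclaim pressure before better-behaved groups are touched.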
3388 | /* | 3387 | /* |
3389 | * This routine traverse page_cgroup in given list and drop them all. | 3388 | * This routine traverse page_cgroup in given list and drop them all. |
3390 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 3389 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
3391 | */ | 3390 | */ |
3392 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | 3391 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, |
3393 | int node, int zid, enum lru_list lru) | 3392 | int node, int zid, enum lru_list lru) |
3394 | { | 3393 | { |
3395 | struct zone *zone; | 3394 | struct zone *zone; |
3396 | struct mem_cgroup_per_zone *mz; | 3395 | struct mem_cgroup_per_zone *mz; |
3397 | struct page_cgroup *pc, *busy; | 3396 | struct page_cgroup *pc, *busy; |
3398 | unsigned long flags, loop; | 3397 | unsigned long flags, loop; |
3399 | struct list_head *list; | 3398 | struct list_head *list; |
3400 | int ret = 0; | 3399 | int ret = 0; |
3401 | 3400 | ||
3402 | zone = &NODE_DATA(node)->node_zones[zid]; | 3401 | zone = &NODE_DATA(node)->node_zones[zid]; |
3403 | mz = mem_cgroup_zoneinfo(mem, node, zid); | 3402 | mz = mem_cgroup_zoneinfo(mem, node, zid); |
3404 | list = &mz->lists[lru]; | 3403 | list = &mz->lists[lru]; |
3405 | 3404 | ||
3406 | loop = MEM_CGROUP_ZSTAT(mz, lru); | 3405 | loop = MEM_CGROUP_ZSTAT(mz, lru); |
3407 | /* give some margin against EBUSY etc...*/ | 3406 | /* give some margin against EBUSY etc...*/ |
3408 | loop += 256; | 3407 | loop += 256; |
3409 | busy = NULL; | 3408 | busy = NULL; |
3410 | while (loop--) { | 3409 | while (loop--) { |
3411 | struct page *page; | 3410 | struct page *page; |
3412 | 3411 | ||
3413 | ret = 0; | 3412 | ret = 0; |
3414 | spin_lock_irqsave(&zone->lru_lock, flags); | 3413 | spin_lock_irqsave(&zone->lru_lock, flags); |
3415 | if (list_empty(list)) { | 3414 | if (list_empty(list)) { |
3416 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3415 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3417 | break; | 3416 | break; |
3418 | } | 3417 | } |
3419 | pc = list_entry(list->prev, struct page_cgroup, lru); | 3418 | pc = list_entry(list->prev, struct page_cgroup, lru); |
3420 | if (busy == pc) { | 3419 | if (busy == pc) { |
3421 | list_move(&pc->lru, list); | 3420 | list_move(&pc->lru, list); |
3422 | busy = NULL; | 3421 | busy = NULL; |
3423 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3422 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3424 | continue; | 3423 | continue; |
3425 | } | 3424 | } |
3426 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3425 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3427 | 3426 | ||
3428 | page = lookup_cgroup_page(pc); | 3427 | page = lookup_cgroup_page(pc); |
3429 | 3428 | ||
3430 | ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); | 3429 | ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); |
3431 | if (ret == -ENOMEM) | 3430 | if (ret == -ENOMEM) |
3432 | break; | 3431 | break; |
3433 | 3432 | ||
3434 | if (ret == -EBUSY || ret == -EINVAL) { | 3433 | if (ret == -EBUSY || ret == -EINVAL) { |
3435 | /* found lock contention or "pc" is obsolete. */ | 3434 | /* found lock contention or "pc" is obsolete. */ |
3436 | busy = pc; | 3435 | busy = pc; |
3437 | cond_resched(); | 3436 | cond_resched(); |
3438 | } else | 3437 | } else |
3439 | busy = NULL; | 3438 | busy = NULL; |
3440 | } | 3439 | } |
3441 | 3440 | ||
3442 | if (!ret && !list_empty(list)) | 3441 | if (!ret && !list_empty(list)) |
3443 | return -EBUSY; | 3442 | return -EBUSY; |
3444 | return ret; | 3443 | return ret; |
3445 | } | 3444 | } |
3446 | 3445 | ||
3447 | /* | 3446 | /* |
3448 | * Make the mem_cgroup's charge 0 if there is no task in it. | 3447 | * Make the mem_cgroup's charge 0 if there is no task in it. |
3449 | * This enables deleting this mem_cgroup. | 3448 | * This enables deleting this mem_cgroup. |
3450 | */ | 3449 | */ |
3451 | static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) | 3450 | static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) |
3452 | { | 3451 | { |
3453 | int ret; | 3452 | int ret; |
3454 | int node, zid, shrink; | 3453 | int node, zid, shrink; |
3455 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 3454 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
3456 | struct cgroup *cgrp = mem->css.cgroup; | 3455 | struct cgroup *cgrp = mem->css.cgroup; |
3457 | 3456 | ||
3458 | css_get(&mem->css); | 3457 | css_get(&mem->css); |
3459 | 3458 | ||
3460 | shrink = 0; | 3459 | shrink = 0; |
3461 | /* should free all ? */ | 3460 | /* should free all ? */ |
3462 | if (free_all) | 3461 | if (free_all) |
3463 | goto try_to_free; | 3462 | goto try_to_free; |
3464 | move_account: | 3463 | move_account: |
3465 | do { | 3464 | do { |
3466 | ret = -EBUSY; | 3465 | ret = -EBUSY; |
3467 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | 3466 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
3468 | goto out; | 3467 | goto out; |
3469 | ret = -EINTR; | 3468 | ret = -EINTR; |
3470 | if (signal_pending(current)) | 3469 | if (signal_pending(current)) |
3471 | goto out; | 3470 | goto out; |
3472 | /* This is for making all *used* pages be on the LRU. */ | 3471 | /* This is for making all *used* pages be on the LRU. */ |
3473 | lru_add_drain_all(); | 3472 | lru_add_drain_all(); |
3474 | drain_all_stock_sync(); | 3473 | drain_all_stock_sync(); |
3475 | ret = 0; | 3474 | ret = 0; |
3476 | mem_cgroup_start_move(mem); | 3475 | mem_cgroup_start_move(mem); |
3477 | for_each_node_state(node, N_HIGH_MEMORY) { | 3476 | for_each_node_state(node, N_HIGH_MEMORY) { |
3478 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 3477 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
3479 | enum lru_list l; | 3478 | enum lru_list l; |
3480 | for_each_lru(l) { | 3479 | for_each_lru(l) { |
3481 | ret = mem_cgroup_force_empty_list(mem, | 3480 | ret = mem_cgroup_force_empty_list(mem, |
3482 | node, zid, l); | 3481 | node, zid, l); |
3483 | if (ret) | 3482 | if (ret) |
3484 | break; | 3483 | break; |
3485 | } | 3484 | } |
3486 | } | 3485 | } |
3487 | if (ret) | 3486 | if (ret) |
3488 | break; | 3487 | break; |
3489 | } | 3488 | } |
3490 | mem_cgroup_end_move(mem); | 3489 | mem_cgroup_end_move(mem); |
3491 | memcg_oom_recover(mem); | 3490 | memcg_oom_recover(mem); |
3492 | /* it seems parent cgroup doesn't have enough mem */ | 3491 | /* it seems parent cgroup doesn't have enough mem */ |
3493 | if (ret == -ENOMEM) | 3492 | if (ret == -ENOMEM) |
3494 | goto try_to_free; | 3493 | goto try_to_free; |
3495 | cond_resched(); | 3494 | cond_resched(); |
3496 | /* "ret" should also be checked to ensure all lists are empty. */ | 3495 | /* "ret" should also be checked to ensure all lists are empty. */ |
3497 | } while (mem->res.usage > 0 || ret); | 3496 | } while (mem->res.usage > 0 || ret); |
3498 | out: | 3497 | out: |
3499 | css_put(&mem->css); | 3498 | css_put(&mem->css); |
3500 | return ret; | 3499 | return ret; |
3501 | 3500 | ||
3502 | try_to_free: | 3501 | try_to_free: |
3503 | /* returns EBUSY if there is a task or if we come here twice. */ | 3502 | /* returns EBUSY if there is a task or if we come here twice. */ |
3504 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { | 3503 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { |
3505 | ret = -EBUSY; | 3504 | ret = -EBUSY; |
3506 | goto out; | 3505 | goto out; |
3507 | } | 3506 | } |
3508 | /* we call try-to-free pages to make this cgroup empty */ | 3507 | /* we call try-to-free pages to make this cgroup empty */ |
3509 | lru_add_drain_all(); | 3508 | lru_add_drain_all(); |
3510 | /* try to free all pages in this cgroup */ | 3509 | /* try to free all pages in this cgroup */ |
3511 | shrink = 1; | 3510 | shrink = 1; |
3512 | while (nr_retries && mem->res.usage > 0) { | 3511 | while (nr_retries && mem->res.usage > 0) { |
3513 | int progress; | 3512 | int progress; |
3514 | 3513 | ||
3515 | if (signal_pending(current)) { | 3514 | if (signal_pending(current)) { |
3516 | ret = -EINTR; | 3515 | ret = -EINTR; |
3517 | goto out; | 3516 | goto out; |
3518 | } | 3517 | } |
3519 | progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, | 3518 | progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, |
3520 | false, get_swappiness(mem)); | 3519 | false, get_swappiness(mem)); |
3521 | if (!progress) { | 3520 | if (!progress) { |
3522 | nr_retries--; | 3521 | nr_retries--; |
3523 | /* maybe some writeback is necessary */ | 3522 | /* maybe some writeback is necessary */ |
3524 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 3523 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
3525 | } | 3524 | } |
3526 | 3525 | ||
3527 | } | 3526 | } |
3528 | lru_add_drain(); | 3527 | lru_add_drain(); |
3529 | /* try move_account...there may be some *locked* pages. */ | 3528 | /* try move_account...there may be some *locked* pages. */ |
3530 | goto move_account; | 3529 | goto move_account; |
3531 | } | 3530 | } |
3532 | 3531 | ||
3533 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) | 3532 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) |
3534 | { | 3533 | { |
3535 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); | 3534 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); |
3536 | } | 3535 | } |
3537 | 3536 | ||
3538 | 3537 | ||
3539 | static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) | 3538 | static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) |
3540 | { | 3539 | { |
3541 | return mem_cgroup_from_cont(cont)->use_hierarchy; | 3540 | return mem_cgroup_from_cont(cont)->use_hierarchy; |
3542 | } | 3541 | } |
3543 | 3542 | ||
3544 | static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | 3543 | static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, |
3545 | u64 val) | 3544 | u64 val) |
3546 | { | 3545 | { |
3547 | int retval = 0; | 3546 | int retval = 0; |
3548 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 3547 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
3549 | struct cgroup *parent = cont->parent; | 3548 | struct cgroup *parent = cont->parent; |
3550 | struct mem_cgroup *parent_mem = NULL; | 3549 | struct mem_cgroup *parent_mem = NULL; |
3551 | 3550 | ||
3552 | if (parent) | 3551 | if (parent) |
3553 | parent_mem = mem_cgroup_from_cont(parent); | 3552 | parent_mem = mem_cgroup_from_cont(parent); |
3554 | 3553 | ||
3555 | cgroup_lock(); | 3554 | cgroup_lock(); |
3556 | /* | 3555 | /* |
3557 | * If parent's use_hierarchy is set, we can't make any modifications | 3556 | * If parent's use_hierarchy is set, we can't make any modifications |
3558 | * in the child subtrees. If it is unset, then the change can | 3557 | * in the child subtrees. If it is unset, then the change can |
3559 | * occur, provided the current cgroup has no children. | 3558 | * occur, provided the current cgroup has no children. |
3560 | * | 3559 | * |
3561 | * For the root cgroup, parent_mem is NULL, we allow value to be | 3560 | * For the root cgroup, parent_mem is NULL, we allow value to be |
3562 | * set if there are no children. | 3561 | * set if there are no children. |
3563 | */ | 3562 | */ |
3564 | if ((!parent_mem || !parent_mem->use_hierarchy) && | 3563 | if ((!parent_mem || !parent_mem->use_hierarchy) && |
3565 | (val == 1 || val == 0)) { | 3564 | (val == 1 || val == 0)) { |
3566 | if (list_empty(&cont->children)) | 3565 | if (list_empty(&cont->children)) |
3567 | mem->use_hierarchy = val; | 3566 | mem->use_hierarchy = val; |
3568 | else | 3567 | else |
3569 | retval = -EBUSY; | 3568 | retval = -EBUSY; |
3570 | } else | 3569 | } else |
3571 | retval = -EINVAL; | 3570 | retval = -EINVAL; |
3572 | cgroup_unlock(); | 3571 | cgroup_unlock(); |
3573 | 3572 | ||
3574 | return retval; | 3573 | return retval; |
3575 | } | 3574 | } |
3576 | 3575 | ||
3577 | 3576 | ||
3578 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, | 3577 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, |
3579 | enum mem_cgroup_stat_index idx) | 3578 | enum mem_cgroup_stat_index idx) |
3580 | { | 3579 | { |
3581 | struct mem_cgroup *iter; | 3580 | struct mem_cgroup *iter; |
3582 | long val = 0; | 3581 | long val = 0; |
3583 | 3582 | ||
3584 | /* Per-cpu values can be negative, use a signed accumulator */ | 3583 | /* Per-cpu values can be negative, use a signed accumulator */ |
3585 | for_each_mem_cgroup_tree(iter, mem) | 3584 | for_each_mem_cgroup_tree(iter, mem) |
3586 | val += mem_cgroup_read_stat(iter, idx); | 3585 | val += mem_cgroup_read_stat(iter, idx); |
3587 | 3586 | ||
3588 | if (val < 0) /* race ? */ | 3587 | if (val < 0) /* race ? */ |
3589 | val = 0; | 3588 | val = 0; |
3590 | return val; | 3589 | return val; |
3591 | } | 3590 | } |
3592 | 3591 | ||
3593 | static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | 3592 | static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) |
3594 | { | 3593 | { |
3595 | u64 val; | 3594 | u64 val; |
3596 | 3595 | ||
3597 | if (!mem_cgroup_is_root(mem)) { | 3596 | if (!mem_cgroup_is_root(mem)) { |
3598 | if (!swap) | 3597 | if (!swap) |
3599 | return res_counter_read_u64(&mem->res, RES_USAGE); | 3598 | return res_counter_read_u64(&mem->res, RES_USAGE); |
3600 | else | 3599 | else |
3601 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | 3600 | return res_counter_read_u64(&mem->memsw, RES_USAGE); |
3602 | } | 3601 | } |
3603 | 3602 | ||
3604 | val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); | 3603 | val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); |
3605 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); | 3604 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); |
3606 | 3605 | ||
3607 | if (swap) | 3606 | if (swap) |
3608 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); | 3607 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
3609 | 3608 | ||
3610 | return val << PAGE_SHIFT; | 3609 | return val << PAGE_SHIFT; |
3611 | } | 3610 | } |
3612 | 3611 | ||
3613 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 3612 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
3614 | { | 3613 | { |
3615 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 3614 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
3616 | u64 val; | 3615 | u64 val; |
3617 | int type, name; | 3616 | int type, name; |
3618 | 3617 | ||
3619 | type = MEMFILE_TYPE(cft->private); | 3618 | type = MEMFILE_TYPE(cft->private); |
3620 | name = MEMFILE_ATTR(cft->private); | 3619 | name = MEMFILE_ATTR(cft->private); |
3621 | switch (type) { | 3620 | switch (type) { |
3622 | case _MEM: | 3621 | case _MEM: |
3623 | if (name == RES_USAGE) | 3622 | if (name == RES_USAGE) |
3624 | val = mem_cgroup_usage(mem, false); | 3623 | val = mem_cgroup_usage(mem, false); |
3625 | else | 3624 | else |
3626 | val = res_counter_read_u64(&mem->res, name); | 3625 | val = res_counter_read_u64(&mem->res, name); |
3627 | break; | 3626 | break; |
3628 | case _MEMSWAP: | 3627 | case _MEMSWAP: |
3629 | if (name == RES_USAGE) | 3628 | if (name == RES_USAGE) |
3630 | val = mem_cgroup_usage(mem, true); | 3629 | val = mem_cgroup_usage(mem, true); |
3631 | else | 3630 | else |
3632 | val = res_counter_read_u64(&mem->memsw, name); | 3631 | val = res_counter_read_u64(&mem->memsw, name); |
3633 | break; | 3632 | break; |
3634 | default: | 3633 | default: |
3635 | BUG(); | 3634 | BUG(); |
3636 | break; | 3635 | break; |
3637 | } | 3636 | } |
3638 | return val; | 3637 | return val; |
3639 | } | 3638 | } |
3640 | /* | 3639 | /* |
3641 | * The user of this function is... | 3640 | * The user of this function is... |
3642 | * RES_LIMIT. | 3641 | * RES_LIMIT. |
3643 | */ | 3642 | */ |
3644 | static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | 3643 | static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, |
3645 | const char *buffer) | 3644 | const char *buffer) |
3646 | { | 3645 | { |
3647 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 3646 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3648 | int type, name; | 3647 | int type, name; |
3649 | unsigned long long val; | 3648 | unsigned long long val; |
3650 | int ret; | 3649 | int ret; |
3651 | 3650 | ||
3652 | type = MEMFILE_TYPE(cft->private); | 3651 | type = MEMFILE_TYPE(cft->private); |
3653 | name = MEMFILE_ATTR(cft->private); | 3652 | name = MEMFILE_ATTR(cft->private); |
3654 | switch (name) { | 3653 | switch (name) { |
3655 | case RES_LIMIT: | 3654 | case RES_LIMIT: |
3656 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | 3655 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ |
3657 | ret = -EINVAL; | 3656 | ret = -EINVAL; |
3658 | break; | 3657 | break; |
3659 | } | 3658 | } |
3660 | /* This function does all the necessary parsing...reuse it */ | 3659 | /* This function does all the necessary parsing...reuse it */ |
3661 | ret = res_counter_memparse_write_strategy(buffer, &val); | 3660 | ret = res_counter_memparse_write_strategy(buffer, &val); |
3662 | if (ret) | 3661 | if (ret) |
3663 | break; | 3662 | break; |
3664 | if (type == _MEM) | 3663 | if (type == _MEM) |
3665 | ret = mem_cgroup_resize_limit(memcg, val); | 3664 | ret = mem_cgroup_resize_limit(memcg, val); |
3666 | else | 3665 | else |
3667 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | 3666 | ret = mem_cgroup_resize_memsw_limit(memcg, val); |
3668 | break; | 3667 | break; |
3669 | case RES_SOFT_LIMIT: | 3668 | case RES_SOFT_LIMIT: |
3670 | ret = res_counter_memparse_write_strategy(buffer, &val); | 3669 | ret = res_counter_memparse_write_strategy(buffer, &val); |
3671 | if (ret) | 3670 | if (ret) |
3672 | break; | 3671 | break; |
3673 | /* | 3672 | /* |
3674 | * For memsw, soft limits are hard to implement in terms | 3673 | * For memsw, soft limits are hard to implement in terms |
3675 | * of semantics; for now, we only support soft limits for | 3674 | * of semantics; for now, we only support soft limits for |
3676 | * memory control without swap. | 3675 | * memory control without swap. |
3677 | */ | 3676 | */ |
3678 | if (type == _MEM) | 3677 | if (type == _MEM) |
3679 | ret = res_counter_set_soft_limit(&memcg->res, val); | 3678 | ret = res_counter_set_soft_limit(&memcg->res, val); |
3680 | else | 3679 | else |
3681 | ret = -EINVAL; | 3680 | ret = -EINVAL; |
3682 | break; | 3681 | break; |
3683 | default: | 3682 | default: |
3684 | ret = -EINVAL; /* should be BUG() ? */ | 3683 | ret = -EINVAL; /* should be BUG() ? */ |
3685 | break; | 3684 | break; |
3686 | } | 3685 | } |
3687 | return ret; | 3686 | return ret; |
3688 | } | 3687 | } |
3689 | 3688 | ||
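mem_cgroup_write() above is reached through the cgroup v1 filesystem, for example by writing to memory.limit_in_bytes or memory.soft_limit_in_bytes; res_counter_memparse_write_strategy() accepts memparse() suffixes such as K, M and G. A small userspace sketch of driving it follows (the /sys/fs/cgroup/memory mount point and the "demo" child group are assumptions about the local setup):

#include <stdio.h>
#include <string.h>

/* Write a value string into a memcg control file; returns 0 on success. */
static int memcg_write(const char *group, const char *file, const char *val)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/fs/cgroup/memory/%s/%s", group, file);
        f = fopen(path, "w");
        if (!f)
                return -1;
        /* The kernel side parses suffixes like K/M/G via memparse(). */
        if (fputs(val, f) == EOF) {
                fclose(f);
                return -1;
        }
        return fclose(f) == 0 ? 0 : -1;
}

int main(void)
{
        /* "demo" is a hypothetical, already-created child group. */
        if (memcg_write("demo", "memory.limit_in_bytes", "256M") ||
            memcg_write("demo", "memory.soft_limit_in_bytes", "128M"))
                perror("memcg_write");
        return 0;
}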
3690 | static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, | 3689 | static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, |
3691 | unsigned long long *mem_limit, unsigned long long *memsw_limit) | 3690 | unsigned long long *mem_limit, unsigned long long *memsw_limit) |
3692 | { | 3691 | { |
3693 | struct cgroup *cgroup; | 3692 | struct cgroup *cgroup; |
3694 | unsigned long long min_limit, min_memsw_limit, tmp; | 3693 | unsigned long long min_limit, min_memsw_limit, tmp; |
3695 | 3694 | ||
3696 | min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3695 | min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
3697 | min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3696 | min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
3698 | cgroup = memcg->css.cgroup; | 3697 | cgroup = memcg->css.cgroup; |
3699 | if (!memcg->use_hierarchy) | 3698 | if (!memcg->use_hierarchy) |
3700 | goto out; | 3699 | goto out; |
3701 | 3700 | ||
3702 | while (cgroup->parent) { | 3701 | while (cgroup->parent) { |
3703 | cgroup = cgroup->parent; | 3702 | cgroup = cgroup->parent; |
3704 | memcg = mem_cgroup_from_cont(cgroup); | 3703 | memcg = mem_cgroup_from_cont(cgroup); |
3705 | if (!memcg->use_hierarchy) | 3704 | if (!memcg->use_hierarchy) |
3706 | break; | 3705 | break; |
3707 | tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3706 | tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); |
3708 | min_limit = min(min_limit, tmp); | 3707 | min_limit = min(min_limit, tmp); |
3709 | tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3708 | tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
3710 | min_memsw_limit = min(min_memsw_limit, tmp); | 3709 | min_memsw_limit = min(min_memsw_limit, tmp); |
3711 | } | 3710 | } |
3712 | out: | 3711 | out: |
3713 | *mem_limit = min_limit; | 3712 | *mem_limit = min_limit; |
3714 | *memsw_limit = min_memsw_limit; | 3713 | *memsw_limit = min_memsw_limit; |
3715 | return; | 3714 | return; |
3716 | } | 3715 | } |
3717 | 3716 | ||
3718 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 3717 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
3719 | { | 3718 | { |
3720 | struct mem_cgroup *mem; | 3719 | struct mem_cgroup *mem; |
3721 | int type, name; | 3720 | int type, name; |
3722 | 3721 | ||
3723 | mem = mem_cgroup_from_cont(cont); | 3722 | mem = mem_cgroup_from_cont(cont); |
3724 | type = MEMFILE_TYPE(event); | 3723 | type = MEMFILE_TYPE(event); |
3725 | name = MEMFILE_ATTR(event); | 3724 | name = MEMFILE_ATTR(event); |
3726 | switch (name) { | 3725 | switch (name) { |
3727 | case RES_MAX_USAGE: | 3726 | case RES_MAX_USAGE: |
3728 | if (type == _MEM) | 3727 | if (type == _MEM) |
3729 | res_counter_reset_max(&mem->res); | 3728 | res_counter_reset_max(&mem->res); |
3730 | else | 3729 | else |
3731 | res_counter_reset_max(&mem->memsw); | 3730 | res_counter_reset_max(&mem->memsw); |
3732 | break; | 3731 | break; |
3733 | case RES_FAILCNT: | 3732 | case RES_FAILCNT: |
3734 | if (type == _MEM) | 3733 | if (type == _MEM) |
3735 | res_counter_reset_failcnt(&mem->res); | 3734 | res_counter_reset_failcnt(&mem->res); |
3736 | else | 3735 | else |
3737 | res_counter_reset_failcnt(&mem->memsw); | 3736 | res_counter_reset_failcnt(&mem->memsw); |
3738 | break; | 3737 | break; |
3739 | } | 3738 | } |
3740 | 3739 | ||
3741 | return 0; | 3740 | return 0; |
3742 | } | 3741 | } |
3743 | 3742 | ||
3744 | static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, | 3743 | static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, |
3745 | struct cftype *cft) | 3744 | struct cftype *cft) |
3746 | { | 3745 | { |
3747 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; | 3746 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; |
3748 | } | 3747 | } |
3749 | 3748 | ||
3750 | #ifdef CONFIG_MMU | 3749 | #ifdef CONFIG_MMU |
3751 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | 3750 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, |
3752 | struct cftype *cft, u64 val) | 3751 | struct cftype *cft, u64 val) |
3753 | { | 3752 | { |
3754 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | 3753 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); |
3755 | 3754 | ||
3756 | if (val >= (1 << NR_MOVE_TYPE)) | 3755 | if (val >= (1 << NR_MOVE_TYPE)) |
3757 | return -EINVAL; | 3756 | return -EINVAL; |
3758 | /* | 3757 | /* |
3759 | * We check this value several times both in can_attach() and | 3758 | * We check this value several times both in can_attach() and |
3760 | * attach(), so we need cgroup lock to prevent this value from being | 3759 | * attach(), so we need cgroup lock to prevent this value from being |
3761 | * inconsistent. | 3760 | * inconsistent. |
3762 | */ | 3761 | */ |
3763 | cgroup_lock(); | 3762 | cgroup_lock(); |
3764 | mem->move_charge_at_immigrate = val; | 3763 | mem->move_charge_at_immigrate = val; |
3765 | cgroup_unlock(); | 3764 | cgroup_unlock(); |
3766 | 3765 | ||
3767 | return 0; | 3766 | return 0; |
3768 | } | 3767 | } |
3769 | #else | 3768 | #else |
3770 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | 3769 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, |
3771 | struct cftype *cft, u64 val) | 3770 | struct cftype *cft, u64 val) |
3772 | { | 3771 | { |
3773 | return -ENOSYS; | 3772 | return -ENOSYS; |
3774 | } | 3773 | } |
3775 | #endif | 3774 | #endif |
3776 | 3775 | ||
3777 | 3776 | ||
3778 | /* For read statistics */ | 3777 | /* For read statistics */ |
3779 | enum { | 3778 | enum { |
3780 | MCS_CACHE, | 3779 | MCS_CACHE, |
3781 | MCS_RSS, | 3780 | MCS_RSS, |
3782 | MCS_FILE_MAPPED, | 3781 | MCS_FILE_MAPPED, |
3783 | MCS_PGPGIN, | 3782 | MCS_PGPGIN, |
3784 | MCS_PGPGOUT, | 3783 | MCS_PGPGOUT, |
3785 | MCS_SWAP, | 3784 | MCS_SWAP, |
3786 | MCS_INACTIVE_ANON, | 3785 | MCS_INACTIVE_ANON, |
3787 | MCS_ACTIVE_ANON, | 3786 | MCS_ACTIVE_ANON, |
3788 | MCS_INACTIVE_FILE, | 3787 | MCS_INACTIVE_FILE, |
3789 | MCS_ACTIVE_FILE, | 3788 | MCS_ACTIVE_FILE, |
3790 | MCS_UNEVICTABLE, | 3789 | MCS_UNEVICTABLE, |
3791 | NR_MCS_STAT, | 3790 | NR_MCS_STAT, |
3792 | }; | 3791 | }; |
3793 | 3792 | ||
3794 | struct mcs_total_stat { | 3793 | struct mcs_total_stat { |
3795 | s64 stat[NR_MCS_STAT]; | 3794 | s64 stat[NR_MCS_STAT]; |
3796 | }; | 3795 | }; |
3797 | 3796 | ||
3798 | struct { | 3797 | struct { |
3799 | char *local_name; | 3798 | char *local_name; |
3800 | char *total_name; | 3799 | char *total_name; |
3801 | } memcg_stat_strings[NR_MCS_STAT] = { | 3800 | } memcg_stat_strings[NR_MCS_STAT] = { |
3802 | {"cache", "total_cache"}, | 3801 | {"cache", "total_cache"}, |
3803 | {"rss", "total_rss"}, | 3802 | {"rss", "total_rss"}, |
3804 | {"mapped_file", "total_mapped_file"}, | 3803 | {"mapped_file", "total_mapped_file"}, |
3805 | {"pgpgin", "total_pgpgin"}, | 3804 | {"pgpgin", "total_pgpgin"}, |
3806 | {"pgpgout", "total_pgpgout"}, | 3805 | {"pgpgout", "total_pgpgout"}, |
3807 | {"swap", "total_swap"}, | 3806 | {"swap", "total_swap"}, |
3808 | {"inactive_anon", "total_inactive_anon"}, | 3807 | {"inactive_anon", "total_inactive_anon"}, |
3809 | {"active_anon", "total_active_anon"}, | 3808 | {"active_anon", "total_active_anon"}, |
3810 | {"inactive_file", "total_inactive_file"}, | 3809 | {"inactive_file", "total_inactive_file"}, |
3811 | {"active_file", "total_active_file"}, | 3810 | {"active_file", "total_active_file"}, |
3812 | {"unevictable", "total_unevictable"} | 3811 | {"unevictable", "total_unevictable"} |
3813 | }; | 3812 | }; |
3814 | 3813 | ||
3815 | 3814 | ||
3816 | static void | 3815 | static void |
3817 | mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | 3816 | mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) |
3818 | { | 3817 | { |
3819 | s64 val; | 3818 | s64 val; |
3820 | 3819 | ||
3821 | /* per cpu stat */ | 3820 | /* per cpu stat */ |
3822 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | 3821 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); |
3823 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 3822 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
3824 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | 3823 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); |
3825 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 3824 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
3826 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); | 3825 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); |
3827 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; | 3826 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
3828 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); | 3827 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); |
3829 | s->stat[MCS_PGPGIN] += val; | 3828 | s->stat[MCS_PGPGIN] += val; |
3830 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); | 3829 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); |
3831 | s->stat[MCS_PGPGOUT] += val; | 3830 | s->stat[MCS_PGPGOUT] += val; |
3832 | if (do_swap_account) { | 3831 | if (do_swap_account) { |
3833 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); | 3832 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
3834 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | 3833 | s->stat[MCS_SWAP] += val * PAGE_SIZE; |
3835 | } | 3834 | } |
3836 | 3835 | ||
3837 | /* per zone stat */ | 3836 | /* per zone stat */ |
3838 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); | 3837 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); |
3839 | s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; | 3838 | s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; |
3840 | val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); | 3839 | val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); |
3841 | s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; | 3840 | s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; |
3842 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); | 3841 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); |
3843 | s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; | 3842 | s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; |
3844 | val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); | 3843 | val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); |
3845 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; | 3844 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; |
3846 | val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); | 3845 | val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); |
3847 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; | 3846 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; |
3848 | } | 3847 | } |
3849 | 3848 | ||
3850 | static void | 3849 | static void |
3851 | mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | 3850 | mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) |
3852 | { | 3851 | { |
3853 | struct mem_cgroup *iter; | 3852 | struct mem_cgroup *iter; |
3854 | 3853 | ||
3855 | for_each_mem_cgroup_tree(iter, mem) | 3854 | for_each_mem_cgroup_tree(iter, mem) |
3856 | mem_cgroup_get_local_stat(iter, s); | 3855 | mem_cgroup_get_local_stat(iter, s); |
3857 | } | 3856 | } |
3858 | 3857 | ||
3859 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 3858 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, |
3860 | struct cgroup_map_cb *cb) | 3859 | struct cgroup_map_cb *cb) |
3861 | { | 3860 | { |
3862 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 3861 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); |
3863 | struct mcs_total_stat mystat; | 3862 | struct mcs_total_stat mystat; |
3864 | int i; | 3863 | int i; |
3865 | 3864 | ||
3866 | memset(&mystat, 0, sizeof(mystat)); | 3865 | memset(&mystat, 0, sizeof(mystat)); |
3867 | mem_cgroup_get_local_stat(mem_cont, &mystat); | 3866 | mem_cgroup_get_local_stat(mem_cont, &mystat); |
3868 | 3867 | ||
3869 | for (i = 0; i < NR_MCS_STAT; i++) { | 3868 | for (i = 0; i < NR_MCS_STAT; i++) { |
3870 | if (i == MCS_SWAP && !do_swap_account) | 3869 | if (i == MCS_SWAP && !do_swap_account) |
3871 | continue; | 3870 | continue; |
3872 | cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); | 3871 | cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); |
3873 | } | 3872 | } |
3874 | 3873 | ||
3875 | /* Hierarchical information */ | 3874 | /* Hierarchical information */ |
3876 | { | 3875 | { |
3877 | unsigned long long limit, memsw_limit; | 3876 | unsigned long long limit, memsw_limit; |
3878 | memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); | 3877 | memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); |
3879 | cb->fill(cb, "hierarchical_memory_limit", limit); | 3878 | cb->fill(cb, "hierarchical_memory_limit", limit); |
3880 | if (do_swap_account) | 3879 | if (do_swap_account) |
3881 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); | 3880 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); |
3882 | } | 3881 | } |
3883 | 3882 | ||
3884 | memset(&mystat, 0, sizeof(mystat)); | 3883 | memset(&mystat, 0, sizeof(mystat)); |
3885 | mem_cgroup_get_total_stat(mem_cont, &mystat); | 3884 | mem_cgroup_get_total_stat(mem_cont, &mystat); |
3886 | for (i = 0; i < NR_MCS_STAT; i++) { | 3885 | for (i = 0; i < NR_MCS_STAT; i++) { |
3887 | if (i == MCS_SWAP && !do_swap_account) | 3886 | if (i == MCS_SWAP && !do_swap_account) |
3888 | continue; | 3887 | continue; |
3889 | cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); | 3888 | cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); |
3890 | } | 3889 | } |
3891 | 3890 | ||
3892 | #ifdef CONFIG_DEBUG_VM | 3891 | #ifdef CONFIG_DEBUG_VM |
3893 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); | 3892 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); |
3894 | 3893 | ||
3895 | { | 3894 | { |
3896 | int nid, zid; | 3895 | int nid, zid; |
3897 | struct mem_cgroup_per_zone *mz; | 3896 | struct mem_cgroup_per_zone *mz; |
3898 | unsigned long recent_rotated[2] = {0, 0}; | 3897 | unsigned long recent_rotated[2] = {0, 0}; |
3899 | unsigned long recent_scanned[2] = {0, 0}; | 3898 | unsigned long recent_scanned[2] = {0, 0}; |
3900 | 3899 | ||
3901 | for_each_online_node(nid) | 3900 | for_each_online_node(nid) |
3902 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 3901 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
3903 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 3902 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); |
3904 | 3903 | ||
3905 | recent_rotated[0] += | 3904 | recent_rotated[0] += |
3906 | mz->reclaim_stat.recent_rotated[0]; | 3905 | mz->reclaim_stat.recent_rotated[0]; |
3907 | recent_rotated[1] += | 3906 | recent_rotated[1] += |
3908 | mz->reclaim_stat.recent_rotated[1]; | 3907 | mz->reclaim_stat.recent_rotated[1]; |
3909 | recent_scanned[0] += | 3908 | recent_scanned[0] += |
3910 | mz->reclaim_stat.recent_scanned[0]; | 3909 | mz->reclaim_stat.recent_scanned[0]; |
3911 | recent_scanned[1] += | 3910 | recent_scanned[1] += |
3912 | mz->reclaim_stat.recent_scanned[1]; | 3911 | mz->reclaim_stat.recent_scanned[1]; |
3913 | } | 3912 | } |
3914 | cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); | 3913 | cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); |
3915 | cb->fill(cb, "recent_rotated_file", recent_rotated[1]); | 3914 | cb->fill(cb, "recent_rotated_file", recent_rotated[1]); |
3916 | cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); | 3915 | cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); |
3917 | cb->fill(cb, "recent_scanned_file", recent_scanned[1]); | 3916 | cb->fill(cb, "recent_scanned_file", recent_scanned[1]); |
3918 | } | 3917 | } |
3919 | #endif | 3918 | #endif |
3920 | 3919 | ||
3921 | return 0; | 3920 | return 0; |
3922 | } | 3921 | } |
3923 | 3922 | ||
3924 | static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) | 3923 | static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) |
3925 | { | 3924 | { |
3926 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3925 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
3927 | 3926 | ||
3928 | return get_swappiness(memcg); | 3927 | return get_swappiness(memcg); |
3929 | } | 3928 | } |
3930 | 3929 | ||
3931 | static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | 3930 | static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, |
3932 | u64 val) | 3931 | u64 val) |
3933 | { | 3932 | { |
3934 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3933 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
3935 | struct mem_cgroup *parent; | 3934 | struct mem_cgroup *parent; |
3936 | 3935 | ||
3937 | if (val > 100) | 3936 | if (val > 100) |
3938 | return -EINVAL; | 3937 | return -EINVAL; |
3939 | 3938 | ||
3940 | if (cgrp->parent == NULL) | 3939 | if (cgrp->parent == NULL) |
3941 | return -EINVAL; | 3940 | return -EINVAL; |
3942 | 3941 | ||
3943 | parent = mem_cgroup_from_cont(cgrp->parent); | 3942 | parent = mem_cgroup_from_cont(cgrp->parent); |
3944 | 3943 | ||
3945 | cgroup_lock(); | 3944 | cgroup_lock(); |
3946 | 3945 | ||
3947 | /* If under hierarchy, only empty-root can set this value */ | 3946 | /* If under hierarchy, only empty-root can set this value */ |
3948 | if ((parent->use_hierarchy) || | 3947 | if ((parent->use_hierarchy) || |
3949 | (memcg->use_hierarchy && !list_empty(&cgrp->children))) { | 3948 | (memcg->use_hierarchy && !list_empty(&cgrp->children))) { |
3950 | cgroup_unlock(); | 3949 | cgroup_unlock(); |
3951 | return -EINVAL; | 3950 | return -EINVAL; |
3952 | } | 3951 | } |
3953 | 3952 | ||
3954 | memcg->swappiness = val; | 3953 | memcg->swappiness = val; |
3955 | 3954 | ||
3956 | cgroup_unlock(); | 3955 | cgroup_unlock(); |
3957 | 3956 | ||
3958 | return 0; | 3957 | return 0; |
3959 | } | 3958 | } |
3960 | 3959 | ||
3961 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | 3960 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) |
3962 | { | 3961 | { |
3963 | struct mem_cgroup_threshold_ary *t; | 3962 | struct mem_cgroup_threshold_ary *t; |
3964 | u64 usage; | 3963 | u64 usage; |
3965 | int i; | 3964 | int i; |
3966 | 3965 | ||
3967 | rcu_read_lock(); | 3966 | rcu_read_lock(); |
3968 | if (!swap) | 3967 | if (!swap) |
3969 | t = rcu_dereference(memcg->thresholds.primary); | 3968 | t = rcu_dereference(memcg->thresholds.primary); |
3970 | else | 3969 | else |
3971 | t = rcu_dereference(memcg->memsw_thresholds.primary); | 3970 | t = rcu_dereference(memcg->memsw_thresholds.primary); |
3972 | 3971 | ||
3973 | if (!t) | 3972 | if (!t) |
3974 | goto unlock; | 3973 | goto unlock; |
3975 | 3974 | ||
3976 | usage = mem_cgroup_usage(memcg, swap); | 3975 | usage = mem_cgroup_usage(memcg, swap); |
3977 | 3976 | ||
3978 | /* | 3977 | /* |
3979 | * current_threshold points to threshold just below usage. | 3978 | * current_threshold points to threshold just below usage. |
3980 | * If that is not the case, a threshold was crossed after the last | 3979 | * If that is not the case, a threshold was crossed after the last |
3981 | * call of __mem_cgroup_threshold(). | 3980 | * call of __mem_cgroup_threshold(). |
3982 | */ | 3981 | */ |
3983 | i = t->current_threshold; | 3982 | i = t->current_threshold; |
3984 | 3983 | ||
3985 | /* | 3984 | /* |
3986 | * Iterate backward over array of thresholds starting from | 3985 | * Iterate backward over array of thresholds starting from |
3987 | * current_threshold and check if a threshold is crossed. | 3986 | * current_threshold and check if a threshold is crossed. |
3988 | * If none of thresholds below usage is crossed, we read | 3987 | * If none of thresholds below usage is crossed, we read |
3989 | * only one element of the array here. | 3988 | * only one element of the array here. |
3990 | */ | 3989 | */ |
3991 | for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) | 3990 | for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) |
3992 | eventfd_signal(t->entries[i].eventfd, 1); | 3991 | eventfd_signal(t->entries[i].eventfd, 1); |
3993 | 3992 | ||
3994 | /* i = current_threshold + 1 */ | 3993 | /* i = current_threshold + 1 */ |
3995 | i++; | 3994 | i++; |
3996 | 3995 | ||
3997 | /* | 3996 | /* |
3998 | * Iterate forward over array of thresholds starting from | 3997 | * Iterate forward over array of thresholds starting from |
3999 | * current_threshold+1 and check if a threshold is crossed. | 3998 | * current_threshold+1 and check if a threshold is crossed. |
4000 | * If none of thresholds above usage is crossed, we read | 3999 | * If none of thresholds above usage is crossed, we read |
4001 | * only one element of the array here. | 4000 | * only one element of the array here. |
4002 | */ | 4001 | */ |
4003 | for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) | 4002 | for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) |
4004 | eventfd_signal(t->entries[i].eventfd, 1); | 4003 | eventfd_signal(t->entries[i].eventfd, 1); |
4005 | 4004 | ||
4006 | /* Update current_threshold */ | 4005 | /* Update current_threshold */ |
4007 | t->current_threshold = i - 1; | 4006 | t->current_threshold = i - 1; |
4008 | unlock: | 4007 | unlock: |
4009 | rcu_read_unlock(); | 4008 | rcu_read_unlock(); |
4010 | } | 4009 | } |
4011 | 4010 | ||
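__mem_cgroup_threshold() above relies on the entries being sorted and on current_threshold pointing at the last entry at or below the previously seen usage, so only the thresholds actually crossed since the last call are visited and signalled. A standalone sketch of that backward/forward scan over a sorted array, with notify() as a stand-in for eventfd_signal():

#include <stdio.h>

struct threshold { unsigned long long value; const char *tag; };

static void notify(const struct threshold *t)
{
        printf("crossed threshold %llu (%s)\n", t->value, t->tag);
}

/*
 * Signal every threshold between the old position and the new usage,
 * then leave *cur pointing at the last entry <= usage (or -1).
 */
static void check_thresholds(const struct threshold *t, int size,
                             int *cur, unsigned long long usage)
{
        int i = *cur;

        /* usage dropped: walk backward over entries now above usage */
        for (; i >= 0 && t[i].value > usage; i--)
                notify(&t[i]);
        i++;
        /* usage grew: walk forward over entries now at or below usage */
        for (; i < size && t[i].value <= usage; i++)
                notify(&t[i]);
        *cur = i - 1;
}

int main(void)
{
        struct threshold t[] = { {100, "low"}, {200, "mid"}, {300, "high"} };
        int cur = -1;

        check_thresholds(t, 3, &cur, 250);      /* crosses "low" and "mid" */
        check_thresholds(t, 3, &cur, 150);      /* crosses "mid" downward  */
        return 0;
}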
4012 | static void mem_cgroup_threshold(struct mem_cgroup *memcg) | 4011 | static void mem_cgroup_threshold(struct mem_cgroup *memcg) |
4013 | { | 4012 | { |
4014 | while (memcg) { | 4013 | while (memcg) { |
4015 | __mem_cgroup_threshold(memcg, false); | 4014 | __mem_cgroup_threshold(memcg, false); |
4016 | if (do_swap_account) | 4015 | if (do_swap_account) |
4017 | __mem_cgroup_threshold(memcg, true); | 4016 | __mem_cgroup_threshold(memcg, true); |
4018 | 4017 | ||
4019 | memcg = parent_mem_cgroup(memcg); | 4018 | memcg = parent_mem_cgroup(memcg); |
4020 | } | 4019 | } |
4021 | } | 4020 | } |
4022 | 4021 | ||
4023 | static int compare_thresholds(const void *a, const void *b) | 4022 | static int compare_thresholds(const void *a, const void *b) |
4024 | { | 4023 | { |
4025 | const struct mem_cgroup_threshold *_a = a; | 4024 | const struct mem_cgroup_threshold *_a = a; |
4026 | const struct mem_cgroup_threshold *_b = b; | 4025 | const struct mem_cgroup_threshold *_b = b; |
4027 | 4026 | ||
4028 | return _a->threshold - _b->threshold; | 4027 | return _a->threshold - _b->threshold; |
4029 | } | 4028 | } |
4030 | 4029 | ||
4031 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) | 4030 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) |
4032 | { | 4031 | { |
4033 | struct mem_cgroup_eventfd_list *ev; | 4032 | struct mem_cgroup_eventfd_list *ev; |
4034 | 4033 | ||
4035 | list_for_each_entry(ev, &mem->oom_notify, list) | 4034 | list_for_each_entry(ev, &mem->oom_notify, list) |
4036 | eventfd_signal(ev->eventfd, 1); | 4035 | eventfd_signal(ev->eventfd, 1); |
4037 | return 0; | 4036 | return 0; |
4038 | } | 4037 | } |
4039 | 4038 | ||
4040 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem) | 4039 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem) |
4041 | { | 4040 | { |
4042 | struct mem_cgroup *iter; | 4041 | struct mem_cgroup *iter; |
4043 | 4042 | ||
4044 | for_each_mem_cgroup_tree(iter, mem) | 4043 | for_each_mem_cgroup_tree(iter, mem) |
4045 | mem_cgroup_oom_notify_cb(iter); | 4044 | mem_cgroup_oom_notify_cb(iter); |
4046 | } | 4045 | } |
4047 | 4046 | ||
4048 | static int mem_cgroup_usage_register_event(struct cgroup *cgrp, | 4047 | static int mem_cgroup_usage_register_event(struct cgroup *cgrp, |
4049 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | 4048 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) |
4050 | { | 4049 | { |
4051 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 4050 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4052 | struct mem_cgroup_thresholds *thresholds; | 4051 | struct mem_cgroup_thresholds *thresholds; |
4053 | struct mem_cgroup_threshold_ary *new; | 4052 | struct mem_cgroup_threshold_ary *new; |
4054 | int type = MEMFILE_TYPE(cft->private); | 4053 | int type = MEMFILE_TYPE(cft->private); |
4055 | u64 threshold, usage; | 4054 | u64 threshold, usage; |
4056 | int i, size, ret; | 4055 | int i, size, ret; |
4057 | 4056 | ||
4058 | ret = res_counter_memparse_write_strategy(args, &threshold); | 4057 | ret = res_counter_memparse_write_strategy(args, &threshold); |
4059 | if (ret) | 4058 | if (ret) |
4060 | return ret; | 4059 | return ret; |
4061 | 4060 | ||
4062 | mutex_lock(&memcg->thresholds_lock); | 4061 | mutex_lock(&memcg->thresholds_lock); |
4063 | 4062 | ||
4064 | if (type == _MEM) | 4063 | if (type == _MEM) |
4065 | thresholds = &memcg->thresholds; | 4064 | thresholds = &memcg->thresholds; |
4066 | else if (type == _MEMSWAP) | 4065 | else if (type == _MEMSWAP) |
4067 | thresholds = &memcg->memsw_thresholds; | 4066 | thresholds = &memcg->memsw_thresholds; |
4068 | else | 4067 | else |
4069 | BUG(); | 4068 | BUG(); |
4070 | 4069 | ||
4071 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | 4070 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); |
4072 | 4071 | ||
4073 | /* Check if a threshold crossed before adding a new one */ | 4072 | /* Check if a threshold crossed before adding a new one */ |
4074 | if (thresholds->primary) | 4073 | if (thresholds->primary) |
4075 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 4074 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
4076 | 4075 | ||
4077 | size = thresholds->primary ? thresholds->primary->size + 1 : 1; | 4076 | size = thresholds->primary ? thresholds->primary->size + 1 : 1; |
4078 | 4077 | ||
4079 | /* Allocate memory for new array of thresholds */ | 4078 | /* Allocate memory for new array of thresholds */ |
4080 | new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), | 4079 | new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), |
4081 | GFP_KERNEL); | 4080 | GFP_KERNEL); |
4082 | if (!new) { | 4081 | if (!new) { |
4083 | ret = -ENOMEM; | 4082 | ret = -ENOMEM; |
4084 | goto unlock; | 4083 | goto unlock; |
4085 | } | 4084 | } |
4086 | new->size = size; | 4085 | new->size = size; |
4087 | 4086 | ||
4088 | /* Copy thresholds (if any) to new array */ | 4087 | /* Copy thresholds (if any) to new array */ |
4089 | if (thresholds->primary) { | 4088 | if (thresholds->primary) { |
4090 | memcpy(new->entries, thresholds->primary->entries, (size - 1) * | 4089 | memcpy(new->entries, thresholds->primary->entries, (size - 1) * |
4091 | sizeof(struct mem_cgroup_threshold)); | 4090 | sizeof(struct mem_cgroup_threshold)); |
4092 | } | 4091 | } |
4093 | 4092 | ||
4094 | /* Add new threshold */ | 4093 | /* Add new threshold */ |
4095 | new->entries[size - 1].eventfd = eventfd; | 4094 | new->entries[size - 1].eventfd = eventfd; |
4096 | new->entries[size - 1].threshold = threshold; | 4095 | new->entries[size - 1].threshold = threshold; |
4097 | 4096 | ||
4098 | /* Sort thresholds. Registering of new threshold isn't time-critical */ | 4097 | /* Sort thresholds. Registering of new threshold isn't time-critical */ |
4099 | sort(new->entries, size, sizeof(struct mem_cgroup_threshold), | 4098 | sort(new->entries, size, sizeof(struct mem_cgroup_threshold), |
4100 | compare_thresholds, NULL); | 4099 | compare_thresholds, NULL); |
4101 | 4100 | ||
4102 | /* Find current threshold */ | 4101 | /* Find current threshold */ |
4103 | new->current_threshold = -1; | 4102 | new->current_threshold = -1; |
4104 | for (i = 0; i < size; i++) { | 4103 | for (i = 0; i < size; i++) { |
4105 | if (new->entries[i].threshold < usage) { | 4104 | if (new->entries[i].threshold < usage) { |
4106 | /* | 4105 | /* |
4107 | * new->current_threshold will not be used until | 4106 | * new->current_threshold will not be used until |
4108 | * rcu_assign_pointer(), so it's safe to increment | 4107 | * rcu_assign_pointer(), so it's safe to increment |
4109 | * it here. | 4108 | * it here. |
4110 | */ | 4109 | */ |
4111 | ++new->current_threshold; | 4110 | ++new->current_threshold; |
4112 | } | 4111 | } |
4113 | } | 4112 | } |
4114 | 4113 | ||
4115 | /* Free old spare buffer and save old primary buffer as spare */ | 4114 | /* Free old spare buffer and save old primary buffer as spare */ |
4116 | kfree(thresholds->spare); | 4115 | kfree(thresholds->spare); |
4117 | thresholds->spare = thresholds->primary; | 4116 | thresholds->spare = thresholds->primary; |
4118 | 4117 | ||
4119 | rcu_assign_pointer(thresholds->primary, new); | 4118 | rcu_assign_pointer(thresholds->primary, new); |
4120 | 4119 | ||
4121 | /* To be sure that nobody uses thresholds */ | 4120 | /* To be sure that nobody uses thresholds */ |
4122 | synchronize_rcu(); | 4121 | synchronize_rcu(); |
4123 | 4122 | ||
4124 | unlock: | 4123 | unlock: |
4125 | mutex_unlock(&memcg->thresholds_lock); | 4124 | mutex_unlock(&memcg->thresholds_lock); |
4126 | 4125 | ||
4127 | return ret; | 4126 | return ret; |
4128 | } | 4127 | } |
4129 | 4128 | ||
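mem_cgroup_usage_register_event() above is what runs when an application registers a memory threshold: the application creates an eventfd, opens memory.usage_in_bytes, and writes "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to cgroup.event_control; a read on the eventfd then blocks until the threshold is crossed in either direction. A sketch of the userspace side (again assuming a /sys/fs/cgroup/memory mount and an existing "demo" group):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

#define MEMCG "/sys/fs/cgroup/memory/demo"      /* assumed group path */

int main(void)
{
        char buf[128];
        uint64_t ticks;
        int efd, ufd, cfd;

        efd = eventfd(0, 0);                            /* notification fd */
        ufd = open(MEMCG "/memory.usage_in_bytes", O_RDONLY);
        cfd = open(MEMCG "/cgroup.event_control", O_WRONLY);
        if (efd < 0 || ufd < 0 || cfd < 0) {
                perror("open");
                return 1;
        }

        /* "<event_fd> <usage_fd> <threshold in bytes>" registers the event. */
        snprintf(buf, sizeof(buf), "%d %d %llu", efd, ufd, 64ULL << 20);
        if (write(cfd, buf, strlen(buf)) < 0) {
                perror("register threshold");
                return 1;
        }

        /* Blocks until usage crosses the 64M threshold in either direction. */
        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
                printf("threshold crossed %llu time(s)\n",
                       (unsigned long long)ticks);
        return 0;
}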
4130 | static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, | 4129 | static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, |
4131 | struct cftype *cft, struct eventfd_ctx *eventfd) | 4130 | struct cftype *cft, struct eventfd_ctx *eventfd) |
4132 | { | 4131 | { |
4133 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 4132 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4134 | struct mem_cgroup_thresholds *thresholds; | 4133 | struct mem_cgroup_thresholds *thresholds; |
4135 | struct mem_cgroup_threshold_ary *new; | 4134 | struct mem_cgroup_threshold_ary *new; |
4136 | int type = MEMFILE_TYPE(cft->private); | 4135 | int type = MEMFILE_TYPE(cft->private); |
4137 | u64 usage; | 4136 | u64 usage; |
4138 | int i, j, size; | 4137 | int i, j, size; |
4139 | 4138 | ||
4140 | mutex_lock(&memcg->thresholds_lock); | 4139 | mutex_lock(&memcg->thresholds_lock); |
4141 | if (type == _MEM) | 4140 | if (type == _MEM) |
4142 | thresholds = &memcg->thresholds; | 4141 | thresholds = &memcg->thresholds; |
4143 | else if (type == _MEMSWAP) | 4142 | else if (type == _MEMSWAP) |
4144 | thresholds = &memcg->memsw_thresholds; | 4143 | thresholds = &memcg->memsw_thresholds; |
4145 | else | 4144 | else |
4146 | BUG(); | 4145 | BUG(); |
4147 | 4146 | ||
4148 | /* | 4147 | /* |
4149 | * Something went wrong if we are trying to unregister a threshold | 4148 | * Something went wrong if we are trying to unregister a threshold |
4150 | * when we don't have any thresholds. | 4149 | * when we don't have any thresholds. |
4151 | */ | 4150 | */ |
4152 | BUG_ON(!thresholds); | 4151 | BUG_ON(!thresholds); |
4153 | 4152 | ||
4154 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | 4153 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); |
4155 | 4154 | ||
4156 | /* Check if a threshold crossed before removing */ | 4155 | /* Check if a threshold crossed before removing */ |
4157 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 4156 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
4158 | 4157 | ||
4159 | /* Calculate the new number of thresholds */ | 4158 | /* Calculate the new number of thresholds */ |
4160 | size = 0; | 4159 | size = 0; |
4161 | for (i = 0; i < thresholds->primary->size; i++) { | 4160 | for (i = 0; i < thresholds->primary->size; i++) { |
4162 | if (thresholds->primary->entries[i].eventfd != eventfd) | 4161 | if (thresholds->primary->entries[i].eventfd != eventfd) |
4163 | size++; | 4162 | size++; |
4164 | } | 4163 | } |
4165 | 4164 | ||
4166 | new = thresholds->spare; | 4165 | new = thresholds->spare; |
4167 | 4166 | ||
4168 | /* Set thresholds array to NULL if we don't have thresholds */ | 4167 | /* Set thresholds array to NULL if we don't have thresholds */ |
4169 | if (!size) { | 4168 | if (!size) { |
4170 | kfree(new); | 4169 | kfree(new); |
4171 | new = NULL; | 4170 | new = NULL; |
4172 | goto swap_buffers; | 4171 | goto swap_buffers; |
4173 | } | 4172 | } |
4174 | 4173 | ||
4175 | new->size = size; | 4174 | new->size = size; |
4176 | 4175 | ||
4177 | /* Copy thresholds and find current threshold */ | 4176 | /* Copy thresholds and find current threshold */ |
4178 | new->current_threshold = -1; | 4177 | new->current_threshold = -1; |
4179 | for (i = 0, j = 0; i < thresholds->primary->size; i++) { | 4178 | for (i = 0, j = 0; i < thresholds->primary->size; i++) { |
4180 | if (thresholds->primary->entries[i].eventfd == eventfd) | 4179 | if (thresholds->primary->entries[i].eventfd == eventfd) |
4181 | continue; | 4180 | continue; |
4182 | 4181 | ||
4183 | new->entries[j] = thresholds->primary->entries[i]; | 4182 | new->entries[j] = thresholds->primary->entries[i]; |
4184 | if (new->entries[j].threshold < usage) { | 4183 | if (new->entries[j].threshold < usage) { |
4185 | /* | 4184 | /* |
4186 | * new->current_threshold will not be used | 4185 | * new->current_threshold will not be used |
4187 | * until rcu_assign_pointer(), so it's safe to increment | 4186 | * until rcu_assign_pointer(), so it's safe to increment |
4188 | * it here. | 4187 | * it here. |
4189 | */ | 4188 | */ |
4190 | ++new->current_threshold; | 4189 | ++new->current_threshold; |
4191 | } | 4190 | } |
4192 | j++; | 4191 | j++; |
4193 | } | 4192 | } |
4194 | 4193 | ||
4195 | swap_buffers: | 4194 | swap_buffers: |
4196 | /* Swap primary and spare array */ | 4195 | /* Swap primary and spare array */ |
4197 | thresholds->spare = thresholds->primary; | 4196 | thresholds->spare = thresholds->primary; |
4198 | rcu_assign_pointer(thresholds->primary, new); | 4197 | rcu_assign_pointer(thresholds->primary, new); |
4199 | 4198 | ||
4200 | /* To be sure that nobody uses thresholds */ | 4199 | /* To be sure that nobody uses thresholds */ |
4201 | synchronize_rcu(); | 4200 | synchronize_rcu(); |
4202 | 4201 | ||
4203 | mutex_unlock(&memcg->thresholds_lock); | 4202 | mutex_unlock(&memcg->thresholds_lock); |
4204 | } | 4203 | } |
4205 | 4204 | ||
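The unregister handler above uses the same lockless-reader scheme as the register path: the thresholds structure keeps a primary and a spare array, the writer rebuilds the array in the spare slot under thresholds_lock, publishes it with rcu_assign_pointer(), and only recycles the old primary after synchronize_rcu(). A reader therefore needs only rcu_read_lock() plus a NULL check. A rough, illustrative sketch of such a reader (not the exact memcg reader) could look like:

	static void example_signal_crossed_thresholds(struct mem_cgroup_thresholds *t,
						      u64 usage)
	{
		struct mem_cgroup_threshold_ary *ary;
		int i;

		rcu_read_lock();
		/* primary may be NULL once the last threshold was unregistered */
		ary = rcu_dereference(t->primary);
		if (ary) {
			for (i = 0; i < ary->size; i++)
				if (ary->entries[i].threshold <= usage)
					eventfd_signal(ary->entries[i].eventfd, 1);
		}
		rcu_read_unlock();
		/* the writer frees the old array only after synchronize_rcu() */
	}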
4206 | static int mem_cgroup_oom_register_event(struct cgroup *cgrp, | 4205 | static int mem_cgroup_oom_register_event(struct cgroup *cgrp, |
4207 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | 4206 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) |
4208 | { | 4207 | { |
4209 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 4208 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4210 | struct mem_cgroup_eventfd_list *event; | 4209 | struct mem_cgroup_eventfd_list *event; |
4211 | int type = MEMFILE_TYPE(cft->private); | 4210 | int type = MEMFILE_TYPE(cft->private); |
4212 | 4211 | ||
4213 | BUG_ON(type != _OOM_TYPE); | 4212 | BUG_ON(type != _OOM_TYPE); |
4214 | event = kmalloc(sizeof(*event), GFP_KERNEL); | 4213 | event = kmalloc(sizeof(*event), GFP_KERNEL); |
4215 | if (!event) | 4214 | if (!event) |
4216 | return -ENOMEM; | 4215 | return -ENOMEM; |
4217 | 4216 | ||
4218 | mutex_lock(&memcg_oom_mutex); | 4217 | mutex_lock(&memcg_oom_mutex); |
4219 | 4218 | ||
4220 | event->eventfd = eventfd; | 4219 | event->eventfd = eventfd; |
4221 | list_add(&event->list, &memcg->oom_notify); | 4220 | list_add(&event->list, &memcg->oom_notify); |
4222 | 4221 | ||
4223 | /* already in OOM ? */ | 4222 | /* already in OOM ? */ |
4224 | if (atomic_read(&memcg->oom_lock)) | 4223 | if (atomic_read(&memcg->oom_lock)) |
4225 | eventfd_signal(eventfd, 1); | 4224 | eventfd_signal(eventfd, 1); |
4226 | mutex_unlock(&memcg_oom_mutex); | 4225 | mutex_unlock(&memcg_oom_mutex); |
4227 | 4226 | ||
4228 | return 0; | 4227 | return 0; |
4229 | } | 4228 | } |
4230 | 4229 | ||
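From userspace, the handler above is reached through cgroup.event_control in a v1 memory cgroup: the process creates an eventfd, writes "<eventfd> <fd of memory.oom_control>" to cgroup.event_control (which should end up in mem_cgroup_oom_register_event()), and then blocks in read() until the kernel signals the eventfd on an OOM in that group. A minimal sketch, where the cgroup mount point and group name are assumptions:

	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		const char *cg = "/sys/fs/cgroup/memory/mygroup"; /* assumed path */
		char path[256], buf[64];
		uint64_t cnt;
		int efd, oomfd, ctlfd;

		efd = eventfd(0, 0);
		snprintf(path, sizeof(path), "%s/memory.oom_control", cg);
		oomfd = open(path, O_RDONLY);
		snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
		ctlfd = open(path, O_WRONLY);
		if (efd < 0 || oomfd < 0 || ctlfd < 0) {
			perror("setup");
			return 1;
		}

		/* register the eventfd for OOM notifications on this group */
		snprintf(buf, sizeof(buf), "%d %d", efd, oomfd);
		if (write(ctlfd, buf, strlen(buf)) < 0) {
			perror("event_control");
			return 1;
		}

		/* blocks until eventfd_signal() fires for an OOM in this group */
		if (read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))
			printf("memcg OOM notifications: %llu\n",
			       (unsigned long long)cnt);
		return 0;
	}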
4231 | static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | 4230 | static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, |
4232 | struct cftype *cft, struct eventfd_ctx *eventfd) | 4231 | struct cftype *cft, struct eventfd_ctx *eventfd) |
4233 | { | 4232 | { |
4234 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | 4233 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); |
4235 | struct mem_cgroup_eventfd_list *ev, *tmp; | 4234 | struct mem_cgroup_eventfd_list *ev, *tmp; |
4236 | int type = MEMFILE_TYPE(cft->private); | 4235 | int type = MEMFILE_TYPE(cft->private); |
4237 | 4236 | ||
4238 | BUG_ON(type != _OOM_TYPE); | 4237 | BUG_ON(type != _OOM_TYPE); |
4239 | 4238 | ||
4240 | mutex_lock(&memcg_oom_mutex); | 4239 | mutex_lock(&memcg_oom_mutex); |
4241 | 4240 | ||
4242 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { | 4241 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { |
4243 | if (ev->eventfd == eventfd) { | 4242 | if (ev->eventfd == eventfd) { |
4244 | list_del(&ev->list); | 4243 | list_del(&ev->list); |
4245 | kfree(ev); | 4244 | kfree(ev); |
4246 | } | 4245 | } |
4247 | } | 4246 | } |
4248 | 4247 | ||
4249 | mutex_unlock(&memcg_oom_mutex); | 4248 | mutex_unlock(&memcg_oom_mutex); |
4250 | } | 4249 | } |
4251 | 4250 | ||
4252 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | 4251 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, |
4253 | struct cftype *cft, struct cgroup_map_cb *cb) | 4252 | struct cftype *cft, struct cgroup_map_cb *cb) |
4254 | { | 4253 | { |
4255 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | 4254 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); |
4256 | 4255 | ||
4257 | cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); | 4256 | cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); |
4258 | 4257 | ||
4259 | if (atomic_read(&mem->oom_lock)) | 4258 | if (atomic_read(&mem->oom_lock)) |
4260 | cb->fill(cb, "under_oom", 1); | 4259 | cb->fill(cb, "under_oom", 1); |
4261 | else | 4260 | else |
4262 | cb->fill(cb, "under_oom", 0); | 4261 | cb->fill(cb, "under_oom", 0); |
4263 | return 0; | 4262 | return 0; |
4264 | } | 4263 | } |
4265 | 4264 | ||
4266 | static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | 4265 | static int mem_cgroup_oom_control_write(struct cgroup *cgrp, |
4267 | struct cftype *cft, u64 val) | 4266 | struct cftype *cft, u64 val) |
4268 | { | 4267 | { |
4269 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | 4268 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); |
4270 | struct mem_cgroup *parent; | 4269 | struct mem_cgroup *parent; |
4271 | 4270 | ||
4272 | /* cannot set to root cgroup and only 0 and 1 are allowed */ | 4271 | /* cannot set to root cgroup and only 0 and 1 are allowed */ |
4273 | if (!cgrp->parent || !((val == 0) || (val == 1))) | 4272 | if (!cgrp->parent || !((val == 0) || (val == 1))) |
4274 | return -EINVAL; | 4273 | return -EINVAL; |
4275 | 4274 | ||
4276 | parent = mem_cgroup_from_cont(cgrp->parent); | 4275 | parent = mem_cgroup_from_cont(cgrp->parent); |
4277 | 4276 | ||
4278 | cgroup_lock(); | 4277 | cgroup_lock(); |
4279 | /* oom-kill-disable is a flag for subhierarchy. */ | 4278 | /* oom-kill-disable is a flag for subhierarchy. */ |
4280 | if ((parent->use_hierarchy) || | 4279 | if ((parent->use_hierarchy) || |
4281 | (mem->use_hierarchy && !list_empty(&cgrp->children))) { | 4280 | (mem->use_hierarchy && !list_empty(&cgrp->children))) { |
4282 | cgroup_unlock(); | 4281 | cgroup_unlock(); |
4283 | return -EINVAL; | 4282 | return -EINVAL; |
4284 | } | 4283 | } |
4285 | mem->oom_kill_disable = val; | 4284 | mem->oom_kill_disable = val; |
4286 | if (!val) | 4285 | if (!val) |
4287 | memcg_oom_recover(mem); | 4286 | memcg_oom_recover(mem); |
4288 | cgroup_unlock(); | 4287 | cgroup_unlock(); |
4289 | return 0; | 4288 | return 0; |
4290 | } | 4289 | } |
4291 | 4290 | ||
4292 | static struct cftype mem_cgroup_files[] = { | 4291 | static struct cftype mem_cgroup_files[] = { |
4293 | { | 4292 | { |
4294 | .name = "usage_in_bytes", | 4293 | .name = "usage_in_bytes", |
4295 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 4294 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
4296 | .read_u64 = mem_cgroup_read, | 4295 | .read_u64 = mem_cgroup_read, |
4297 | .register_event = mem_cgroup_usage_register_event, | 4296 | .register_event = mem_cgroup_usage_register_event, |
4298 | .unregister_event = mem_cgroup_usage_unregister_event, | 4297 | .unregister_event = mem_cgroup_usage_unregister_event, |
4299 | }, | 4298 | }, |
4300 | { | 4299 | { |
4301 | .name = "max_usage_in_bytes", | 4300 | .name = "max_usage_in_bytes", |
4302 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), | 4301 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), |
4303 | .trigger = mem_cgroup_reset, | 4302 | .trigger = mem_cgroup_reset, |
4304 | .read_u64 = mem_cgroup_read, | 4303 | .read_u64 = mem_cgroup_read, |
4305 | }, | 4304 | }, |
4306 | { | 4305 | { |
4307 | .name = "limit_in_bytes", | 4306 | .name = "limit_in_bytes", |
4308 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), | 4307 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), |
4309 | .write_string = mem_cgroup_write, | 4308 | .write_string = mem_cgroup_write, |
4310 | .read_u64 = mem_cgroup_read, | 4309 | .read_u64 = mem_cgroup_read, |
4311 | }, | 4310 | }, |
4312 | { | 4311 | { |
4313 | .name = "soft_limit_in_bytes", | 4312 | .name = "soft_limit_in_bytes", |
4314 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), | 4313 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), |
4315 | .write_string = mem_cgroup_write, | 4314 | .write_string = mem_cgroup_write, |
4316 | .read_u64 = mem_cgroup_read, | 4315 | .read_u64 = mem_cgroup_read, |
4317 | }, | 4316 | }, |
4318 | { | 4317 | { |
4319 | .name = "failcnt", | 4318 | .name = "failcnt", |
4320 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), | 4319 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), |
4321 | .trigger = mem_cgroup_reset, | 4320 | .trigger = mem_cgroup_reset, |
4322 | .read_u64 = mem_cgroup_read, | 4321 | .read_u64 = mem_cgroup_read, |
4323 | }, | 4322 | }, |
4324 | { | 4323 | { |
4325 | .name = "stat", | 4324 | .name = "stat", |
4326 | .read_map = mem_control_stat_show, | 4325 | .read_map = mem_control_stat_show, |
4327 | }, | 4326 | }, |
4328 | { | 4327 | { |
4329 | .name = "force_empty", | 4328 | .name = "force_empty", |
4330 | .trigger = mem_cgroup_force_empty_write, | 4329 | .trigger = mem_cgroup_force_empty_write, |
4331 | }, | 4330 | }, |
4332 | { | 4331 | { |
4333 | .name = "use_hierarchy", | 4332 | .name = "use_hierarchy", |
4334 | .write_u64 = mem_cgroup_hierarchy_write, | 4333 | .write_u64 = mem_cgroup_hierarchy_write, |
4335 | .read_u64 = mem_cgroup_hierarchy_read, | 4334 | .read_u64 = mem_cgroup_hierarchy_read, |
4336 | }, | 4335 | }, |
4337 | { | 4336 | { |
4338 | .name = "swappiness", | 4337 | .name = "swappiness", |
4339 | .read_u64 = mem_cgroup_swappiness_read, | 4338 | .read_u64 = mem_cgroup_swappiness_read, |
4340 | .write_u64 = mem_cgroup_swappiness_write, | 4339 | .write_u64 = mem_cgroup_swappiness_write, |
4341 | }, | 4340 | }, |
4342 | { | 4341 | { |
4343 | .name = "move_charge_at_immigrate", | 4342 | .name = "move_charge_at_immigrate", |
4344 | .read_u64 = mem_cgroup_move_charge_read, | 4343 | .read_u64 = mem_cgroup_move_charge_read, |
4345 | .write_u64 = mem_cgroup_move_charge_write, | 4344 | .write_u64 = mem_cgroup_move_charge_write, |
4346 | }, | 4345 | }, |
4347 | { | 4346 | { |
4348 | .name = "oom_control", | 4347 | .name = "oom_control", |
4349 | .read_map = mem_cgroup_oom_control_read, | 4348 | .read_map = mem_cgroup_oom_control_read, |
4350 | .write_u64 = mem_cgroup_oom_control_write, | 4349 | .write_u64 = mem_cgroup_oom_control_write, |
4351 | .register_event = mem_cgroup_oom_register_event, | 4350 | .register_event = mem_cgroup_oom_register_event, |
4352 | .unregister_event = mem_cgroup_oom_unregister_event, | 4351 | .unregister_event = mem_cgroup_oom_unregister_event, |
4353 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | 4352 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), |
4354 | }, | 4353 | }, |
4355 | }; | 4354 | }; |
4356 | 4355 | ||
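Each entry in mem_cgroup_files becomes a control file in the group's directory, so the handlers named above back ordinary file I/O from userspace: writing memory.limit_in_bytes goes through mem_cgroup_write(), reading memory.usage_in_bytes goes through mem_cgroup_read(), and so on. A small sketch under the usual v1 mount point (the path is an assumption):

	#include <stdio.h>

	int main(void)
	{
		const char *cg = "/sys/fs/cgroup/memory/mygroup"; /* assumed path */
		char path[256];
		unsigned long long usage = 0;
		FILE *f;

		/* set a 256 MiB hard limit; handled by mem_cgroup_write() */
		snprintf(path, sizeof(path), "%s/memory.limit_in_bytes", cg);
		f = fopen(path, "w");
		if (f) {
			fprintf(f, "%llu", 256ULL << 20);
			fclose(f);
		}

		/* read current usage; handled by mem_cgroup_read() */
		snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cg);
		f = fopen(path, "r");
		if (f) {
			if (fscanf(f, "%llu", &usage) == 1)
				printf("usage: %llu bytes\n", usage);
			fclose(f);
		}
		return 0;
	}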
4357 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4356 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
4358 | static struct cftype memsw_cgroup_files[] = { | 4357 | static struct cftype memsw_cgroup_files[] = { |
4359 | { | 4358 | { |
4360 | .name = "memsw.usage_in_bytes", | 4359 | .name = "memsw.usage_in_bytes", |
4361 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 4360 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
4362 | .read_u64 = mem_cgroup_read, | 4361 | .read_u64 = mem_cgroup_read, |
4363 | .register_event = mem_cgroup_usage_register_event, | 4362 | .register_event = mem_cgroup_usage_register_event, |
4364 | .unregister_event = mem_cgroup_usage_unregister_event, | 4363 | .unregister_event = mem_cgroup_usage_unregister_event, |
4365 | }, | 4364 | }, |
4366 | { | 4365 | { |
4367 | .name = "memsw.max_usage_in_bytes", | 4366 | .name = "memsw.max_usage_in_bytes", |
4368 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | 4367 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), |
4369 | .trigger = mem_cgroup_reset, | 4368 | .trigger = mem_cgroup_reset, |
4370 | .read_u64 = mem_cgroup_read, | 4369 | .read_u64 = mem_cgroup_read, |
4371 | }, | 4370 | }, |
4372 | { | 4371 | { |
4373 | .name = "memsw.limit_in_bytes", | 4372 | .name = "memsw.limit_in_bytes", |
4374 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | 4373 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), |
4375 | .write_string = mem_cgroup_write, | 4374 | .write_string = mem_cgroup_write, |
4376 | .read_u64 = mem_cgroup_read, | 4375 | .read_u64 = mem_cgroup_read, |
4377 | }, | 4376 | }, |
4378 | { | 4377 | { |
4379 | .name = "memsw.failcnt", | 4378 | .name = "memsw.failcnt", |
4380 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | 4379 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), |
4381 | .trigger = mem_cgroup_reset, | 4380 | .trigger = mem_cgroup_reset, |
4382 | .read_u64 = mem_cgroup_read, | 4381 | .read_u64 = mem_cgroup_read, |
4383 | }, | 4382 | }, |
4384 | }; | 4383 | }; |
4385 | 4384 | ||
4386 | static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) | 4385 | static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) |
4387 | { | 4386 | { |
4388 | if (!do_swap_account) | 4387 | if (!do_swap_account) |
4389 | return 0; | 4388 | return 0; |
4390 | return cgroup_add_files(cont, ss, memsw_cgroup_files, | 4389 | return cgroup_add_files(cont, ss, memsw_cgroup_files, |
4391 | ARRAY_SIZE(memsw_cgroup_files)); | 4390 | ARRAY_SIZE(memsw_cgroup_files)); |
4392 | }; | 4391 | }; |
4393 | #else | 4392 | #else |
4394 | static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) | 4393 | static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) |
4395 | { | 4394 | { |
4396 | return 0; | 4395 | return 0; |
4397 | } | 4396 | } |
4398 | #endif | 4397 | #endif |
4399 | 4398 | ||
4400 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 4399 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) |
4401 | { | 4400 | { |
4402 | struct mem_cgroup_per_node *pn; | 4401 | struct mem_cgroup_per_node *pn; |
4403 | struct mem_cgroup_per_zone *mz; | 4402 | struct mem_cgroup_per_zone *mz; |
4404 | enum lru_list l; | 4403 | enum lru_list l; |
4405 | int zone, tmp = node; | 4404 | int zone, tmp = node; |
4406 | /* | 4405 | /* |
4407 | * This routine is called against possible nodes. | 4406 | * This routine is called against possible nodes. |
4408 | * But it's a BUG to call kmalloc() against an offline node. | 4407 | * But it's a BUG to call kmalloc() against an offline node. |
4409 | * | 4408 | * |
4410 | * TODO: this routine can waste much memory for nodes which will | 4409 | * TODO: this routine can waste much memory for nodes which will |
4411 | * never be onlined. It's better to use memory hotplug callback | 4410 | * never be onlined. It's better to use memory hotplug callback |
4412 | * function. | 4411 | * function. |
4413 | */ | 4412 | */ |
4414 | if (!node_state(node, N_NORMAL_MEMORY)) | 4413 | if (!node_state(node, N_NORMAL_MEMORY)) |
4415 | tmp = -1; | 4414 | tmp = -1; |
4416 | pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); | 4415 | pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); |
4417 | if (!pn) | 4416 | if (!pn) |
4418 | return 1; | 4417 | return 1; |
4419 | 4418 | ||
4420 | mem->info.nodeinfo[node] = pn; | 4419 | mem->info.nodeinfo[node] = pn; |
4421 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4420 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
4422 | mz = &pn->zoneinfo[zone]; | 4421 | mz = &pn->zoneinfo[zone]; |
4423 | for_each_lru(l) | 4422 | for_each_lru(l) |
4424 | INIT_LIST_HEAD(&mz->lists[l]); | 4423 | INIT_LIST_HEAD(&mz->lists[l]); |
4425 | mz->usage_in_excess = 0; | 4424 | mz->usage_in_excess = 0; |
4426 | mz->on_tree = false; | 4425 | mz->on_tree = false; |
4427 | mz->mem = mem; | 4426 | mz->mem = mem; |
4428 | } | 4427 | } |
4429 | return 0; | 4428 | return 0; |
4430 | } | 4429 | } |
4431 | 4430 | ||
4432 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 4431 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) |
4433 | { | 4432 | { |
4434 | kfree(mem->info.nodeinfo[node]); | 4433 | kfree(mem->info.nodeinfo[node]); |
4435 | } | 4434 | } |
4436 | 4435 | ||
4437 | static struct mem_cgroup *mem_cgroup_alloc(void) | 4436 | static struct mem_cgroup *mem_cgroup_alloc(void) |
4438 | { | 4437 | { |
4439 | struct mem_cgroup *mem; | 4438 | struct mem_cgroup *mem; |
4440 | int size = sizeof(struct mem_cgroup); | 4439 | int size = sizeof(struct mem_cgroup); |
4441 | 4440 | ||
4442 | /* Can be very big if MAX_NUMNODES is very big */ | 4441 | /* Can be very big if MAX_NUMNODES is very big */ |
4443 | if (size < PAGE_SIZE) | 4442 | if (size < PAGE_SIZE) |
4444 | mem = kzalloc(size, GFP_KERNEL); | 4443 | mem = kzalloc(size, GFP_KERNEL); |
4445 | else | 4444 | else |
4446 | mem = vzalloc(size); | 4445 | mem = vzalloc(size); |
4447 | 4446 | ||
4448 | if (!mem) | 4447 | if (!mem) |
4449 | return NULL; | 4448 | return NULL; |
4450 | 4449 | ||
4451 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | 4450 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); |
4452 | if (!mem->stat) | 4451 | if (!mem->stat) |
4453 | goto out_free; | 4452 | goto out_free; |
4454 | spin_lock_init(&mem->pcp_counter_lock); | 4453 | spin_lock_init(&mem->pcp_counter_lock); |
4455 | return mem; | 4454 | return mem; |
4456 | 4455 | ||
4457 | out_free: | 4456 | out_free: |
4458 | if (size < PAGE_SIZE) | 4457 | if (size < PAGE_SIZE) |
4459 | kfree(mem); | 4458 | kfree(mem); |
4460 | else | 4459 | else |
4461 | vfree(mem); | 4460 | vfree(mem); |
4462 | return NULL; | 4461 | return NULL; |
4463 | } | 4462 | } |
4464 | 4463 | ||
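mem_cgroup_alloc() and __mem_cgroup_free() below pair their allocators by size: kzalloc()/kfree() when struct mem_cgroup fits in a page, vzalloc()/vfree() otherwise. Boiled down to the bare idiom (illustrative helper names; later kernels wrap exactly this pattern in kvzalloc()/kvfree()):

	static void *sized_zalloc(size_t size)		/* illustrative */
	{
		if (size < PAGE_SIZE)
			return kzalloc(size, GFP_KERNEL);
		return vzalloc(size);
	}

	static void sized_free(void *p, size_t size)	/* illustrative */
	{
		if (size < PAGE_SIZE)
			kfree(p);
		else
			vfree(p);
	}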
4465 | /* | 4464 | /* |
4466 | * At destroying mem_cgroup, references from swap_cgroup can remain. | 4465 | * At destroying mem_cgroup, references from swap_cgroup can remain. |
4467 | * (scanning all at force_empty is too costly...) | 4466 | * (scanning all at force_empty is too costly...) |
4468 | * | 4467 | * |
4469 | * Instead of clearing all references at force_empty, we remember | 4468 | * Instead of clearing all references at force_empty, we remember |
4470 | * the number of reference from swap_cgroup and free mem_cgroup when | 4469 | * the number of reference from swap_cgroup and free mem_cgroup when |
4471 | * it goes down to 0. | 4470 | * it goes down to 0. |
4472 | * | 4471 | * |
4473 | * Removal of cgroup itself succeeds regardless of refs from swap. | 4472 | * Removal of cgroup itself succeeds regardless of refs from swap. |
4474 | */ | 4473 | */ |
4475 | 4474 | ||
4476 | static void __mem_cgroup_free(struct mem_cgroup *mem) | 4475 | static void __mem_cgroup_free(struct mem_cgroup *mem) |
4477 | { | 4476 | { |
4478 | int node; | 4477 | int node; |
4479 | 4478 | ||
4480 | mem_cgroup_remove_from_trees(mem); | 4479 | mem_cgroup_remove_from_trees(mem); |
4481 | free_css_id(&mem_cgroup_subsys, &mem->css); | 4480 | free_css_id(&mem_cgroup_subsys, &mem->css); |
4482 | 4481 | ||
4483 | for_each_node_state(node, N_POSSIBLE) | 4482 | for_each_node_state(node, N_POSSIBLE) |
4484 | free_mem_cgroup_per_zone_info(mem, node); | 4483 | free_mem_cgroup_per_zone_info(mem, node); |
4485 | 4484 | ||
4486 | free_percpu(mem->stat); | 4485 | free_percpu(mem->stat); |
4487 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) | 4486 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) |
4488 | kfree(mem); | 4487 | kfree(mem); |
4489 | else | 4488 | else |
4490 | vfree(mem); | 4489 | vfree(mem); |
4491 | } | 4490 | } |
4492 | 4491 | ||
4493 | static void mem_cgroup_get(struct mem_cgroup *mem) | 4492 | static void mem_cgroup_get(struct mem_cgroup *mem) |
4494 | { | 4493 | { |
4495 | atomic_inc(&mem->refcnt); | 4494 | atomic_inc(&mem->refcnt); |
4496 | } | 4495 | } |
4497 | 4496 | ||
4498 | static void __mem_cgroup_put(struct mem_cgroup *mem, int count) | 4497 | static void __mem_cgroup_put(struct mem_cgroup *mem, int count) |
4499 | { | 4498 | { |
4500 | if (atomic_sub_and_test(count, &mem->refcnt)) { | 4499 | if (atomic_sub_and_test(count, &mem->refcnt)) { |
4501 | struct mem_cgroup *parent = parent_mem_cgroup(mem); | 4500 | struct mem_cgroup *parent = parent_mem_cgroup(mem); |
4502 | __mem_cgroup_free(mem); | 4501 | __mem_cgroup_free(mem); |
4503 | if (parent) | 4502 | if (parent) |
4504 | mem_cgroup_put(parent); | 4503 | mem_cgroup_put(parent); |
4505 | } | 4504 | } |
4506 | } | 4505 | } |
4507 | 4506 | ||
4508 | static void mem_cgroup_put(struct mem_cgroup *mem) | 4507 | static void mem_cgroup_put(struct mem_cgroup *mem) |
4509 | { | 4508 | { |
4510 | __mem_cgroup_put(mem, 1); | 4509 | __mem_cgroup_put(mem, 1); |
4511 | } | 4510 | } |
4512 | 4511 | ||
4513 | /* | 4512 | /* |
4514 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. | 4513 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. |
4515 | */ | 4514 | */ |
4516 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) | 4515 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) |
4517 | { | 4516 | { |
4518 | if (!mem->res.parent) | 4517 | if (!mem->res.parent) |
4519 | return NULL; | 4518 | return NULL; |
4520 | return mem_cgroup_from_res_counter(mem->res.parent, res); | 4519 | return mem_cgroup_from_res_counter(mem->res.parent, res); |
4521 | } | 4520 | } |
4522 | 4521 | ||
4523 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4522 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
4524 | static void __init enable_swap_cgroup(void) | 4523 | static void __init enable_swap_cgroup(void) |
4525 | { | 4524 | { |
4526 | if (!mem_cgroup_disabled() && really_do_swap_account) | 4525 | if (!mem_cgroup_disabled() && really_do_swap_account) |
4527 | do_swap_account = 1; | 4526 | do_swap_account = 1; |
4528 | } | 4527 | } |
4529 | #else | 4528 | #else |
4530 | static void __init enable_swap_cgroup(void) | 4529 | static void __init enable_swap_cgroup(void) |
4531 | { | 4530 | { |
4532 | } | 4531 | } |
4533 | #endif | 4532 | #endif |
4534 | 4533 | ||
4535 | static int mem_cgroup_soft_limit_tree_init(void) | 4534 | static int mem_cgroup_soft_limit_tree_init(void) |
4536 | { | 4535 | { |
4537 | struct mem_cgroup_tree_per_node *rtpn; | 4536 | struct mem_cgroup_tree_per_node *rtpn; |
4538 | struct mem_cgroup_tree_per_zone *rtpz; | 4537 | struct mem_cgroup_tree_per_zone *rtpz; |
4539 | int tmp, node, zone; | 4538 | int tmp, node, zone; |
4540 | 4539 | ||
4541 | for_each_node_state(node, N_POSSIBLE) { | 4540 | for_each_node_state(node, N_POSSIBLE) { |
4542 | tmp = node; | 4541 | tmp = node; |
4543 | if (!node_state(node, N_NORMAL_MEMORY)) | 4542 | if (!node_state(node, N_NORMAL_MEMORY)) |
4544 | tmp = -1; | 4543 | tmp = -1; |
4545 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | 4544 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); |
4546 | if (!rtpn) | 4545 | if (!rtpn) |
4547 | return 1; | 4546 | return 1; |
4548 | 4547 | ||
4549 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | 4548 | soft_limit_tree.rb_tree_per_node[node] = rtpn; |
4550 | 4549 | ||
4551 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4550 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
4552 | rtpz = &rtpn->rb_tree_per_zone[zone]; | 4551 | rtpz = &rtpn->rb_tree_per_zone[zone]; |
4553 | rtpz->rb_root = RB_ROOT; | 4552 | rtpz->rb_root = RB_ROOT; |
4554 | spin_lock_init(&rtpz->lock); | 4553 | spin_lock_init(&rtpz->lock); |
4555 | } | 4554 | } |
4556 | } | 4555 | } |
4557 | return 0; | 4556 | return 0; |
4558 | } | 4557 | } |
4559 | 4558 | ||
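These per-node, per-zone rb-trees are what mem_cgroup_soft_limit_reclaim(), the function touched by this commit, walks when picking a victim: each mem_cgroup_per_zone is keyed by usage_in_excess, so the rightmost node is the group currently exceeding its soft limit by the most. Roughly as below; the tree_node member name is an assumption, since it is not shown in this hunk:

	static struct mem_cgroup_per_zone *
	example_largest_soft_limit_excess(struct mem_cgroup_tree_per_zone *mctz)
	{
		struct rb_node *rightmost = rb_last(&mctz->rb_root);

		if (!rightmost)
			return NULL;	/* tree empty: nothing exceeds its soft limit */
		return rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	}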
4560 | static struct cgroup_subsys_state * __ref | 4559 | static struct cgroup_subsys_state * __ref |
4561 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 4560 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
4562 | { | 4561 | { |
4563 | struct mem_cgroup *mem, *parent; | 4562 | struct mem_cgroup *mem, *parent; |
4564 | long error = -ENOMEM; | 4563 | long error = -ENOMEM; |
4565 | int node; | 4564 | int node; |
4566 | 4565 | ||
4567 | mem = mem_cgroup_alloc(); | 4566 | mem = mem_cgroup_alloc(); |
4568 | if (!mem) | 4567 | if (!mem) |
4569 | return ERR_PTR(error); | 4568 | return ERR_PTR(error); |
4570 | 4569 | ||
4571 | for_each_node_state(node, N_POSSIBLE) | 4570 | for_each_node_state(node, N_POSSIBLE) |
4572 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | 4571 | if (alloc_mem_cgroup_per_zone_info(mem, node)) |
4573 | goto free_out; | 4572 | goto free_out; |
4574 | 4573 | ||
4575 | /* root ? */ | 4574 | /* root ? */ |
4576 | if (cont->parent == NULL) { | 4575 | if (cont->parent == NULL) { |
4577 | int cpu; | 4576 | int cpu; |
4578 | enable_swap_cgroup(); | 4577 | enable_swap_cgroup(); |
4579 | parent = NULL; | 4578 | parent = NULL; |
4580 | root_mem_cgroup = mem; | 4579 | root_mem_cgroup = mem; |
4581 | if (mem_cgroup_soft_limit_tree_init()) | 4580 | if (mem_cgroup_soft_limit_tree_init()) |
4582 | goto free_out; | 4581 | goto free_out; |
4583 | for_each_possible_cpu(cpu) { | 4582 | for_each_possible_cpu(cpu) { |
4584 | struct memcg_stock_pcp *stock = | 4583 | struct memcg_stock_pcp *stock = |
4585 | &per_cpu(memcg_stock, cpu); | 4584 | &per_cpu(memcg_stock, cpu); |
4586 | INIT_WORK(&stock->work, drain_local_stock); | 4585 | INIT_WORK(&stock->work, drain_local_stock); |
4587 | } | 4586 | } |
4588 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 4587 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
4589 | } else { | 4588 | } else { |
4590 | parent = mem_cgroup_from_cont(cont->parent); | 4589 | parent = mem_cgroup_from_cont(cont->parent); |
4591 | mem->use_hierarchy = parent->use_hierarchy; | 4590 | mem->use_hierarchy = parent->use_hierarchy; |
4592 | mem->oom_kill_disable = parent->oom_kill_disable; | 4591 | mem->oom_kill_disable = parent->oom_kill_disable; |
4593 | } | 4592 | } |
4594 | 4593 | ||
4595 | if (parent && parent->use_hierarchy) { | 4594 | if (parent && parent->use_hierarchy) { |
4596 | res_counter_init(&mem->res, &parent->res); | 4595 | res_counter_init(&mem->res, &parent->res); |
4597 | res_counter_init(&mem->memsw, &parent->memsw); | 4596 | res_counter_init(&mem->memsw, &parent->memsw); |
4598 | /* | 4597 | /* |
4599 | * We increment refcnt of the parent to ensure that we can | 4598 | * We increment refcnt of the parent to ensure that we can |
4600 | * safely access it on res_counter_charge/uncharge. | 4599 | * safely access it on res_counter_charge/uncharge. |
4601 | * This refcnt will be decremented when freeing this | 4600 | * This refcnt will be decremented when freeing this |
4602 | * mem_cgroup(see mem_cgroup_put). | 4601 | * mem_cgroup(see mem_cgroup_put). |
4603 | */ | 4602 | */ |
4604 | mem_cgroup_get(parent); | 4603 | mem_cgroup_get(parent); |
4605 | } else { | 4604 | } else { |
4606 | res_counter_init(&mem->res, NULL); | 4605 | res_counter_init(&mem->res, NULL); |
4607 | res_counter_init(&mem->memsw, NULL); | 4606 | res_counter_init(&mem->memsw, NULL); |
4608 | } | 4607 | } |
4609 | mem->last_scanned_child = 0; | 4608 | mem->last_scanned_child = 0; |
4610 | INIT_LIST_HEAD(&mem->oom_notify); | 4609 | INIT_LIST_HEAD(&mem->oom_notify); |
4611 | 4610 | ||
4612 | if (parent) | 4611 | if (parent) |
4613 | mem->swappiness = get_swappiness(parent); | 4612 | mem->swappiness = get_swappiness(parent); |
4614 | atomic_set(&mem->refcnt, 1); | 4613 | atomic_set(&mem->refcnt, 1); |
4615 | mem->move_charge_at_immigrate = 0; | 4614 | mem->move_charge_at_immigrate = 0; |
4616 | mutex_init(&mem->thresholds_lock); | 4615 | mutex_init(&mem->thresholds_lock); |
4617 | return &mem->css; | 4616 | return &mem->css; |
4618 | free_out: | 4617 | free_out: |
4619 | __mem_cgroup_free(mem); | 4618 | __mem_cgroup_free(mem); |
4620 | root_mem_cgroup = NULL; | 4619 | root_mem_cgroup = NULL; |
4621 | return ERR_PTR(error); | 4620 | return ERR_PTR(error); |
4622 | } | 4621 | } |
4623 | 4622 | ||
4624 | static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | 4623 | static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, |
4625 | struct cgroup *cont) | 4624 | struct cgroup *cont) |
4626 | { | 4625 | { |
4627 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 4626 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
4628 | 4627 | ||
4629 | return mem_cgroup_force_empty(mem, false); | 4628 | return mem_cgroup_force_empty(mem, false); |
4630 | } | 4629 | } |
4631 | 4630 | ||
4632 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, | 4631 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, |
4633 | struct cgroup *cont) | 4632 | struct cgroup *cont) |
4634 | { | 4633 | { |
4635 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 4634 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
4636 | 4635 | ||
4637 | mem_cgroup_put(mem); | 4636 | mem_cgroup_put(mem); |
4638 | } | 4637 | } |
4639 | 4638 | ||
4640 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | 4639 | static int mem_cgroup_populate(struct cgroup_subsys *ss, |
4641 | struct cgroup *cont) | 4640 | struct cgroup *cont) |
4642 | { | 4641 | { |
4643 | int ret; | 4642 | int ret; |
4644 | 4643 | ||
4645 | ret = cgroup_add_files(cont, ss, mem_cgroup_files, | 4644 | ret = cgroup_add_files(cont, ss, mem_cgroup_files, |
4646 | ARRAY_SIZE(mem_cgroup_files)); | 4645 | ARRAY_SIZE(mem_cgroup_files)); |
4647 | 4646 | ||
4648 | if (!ret) | 4647 | if (!ret) |
4649 | ret = register_memsw_files(cont, ss); | 4648 | ret = register_memsw_files(cont, ss); |
4650 | return ret; | 4649 | return ret; |
4651 | } | 4650 | } |
4652 | 4651 | ||
4653 | #ifdef CONFIG_MMU | 4652 | #ifdef CONFIG_MMU |
4654 | /* Handlers for move charge at task migration. */ | 4653 | /* Handlers for move charge at task migration. */ |
4655 | #define PRECHARGE_COUNT_AT_ONCE 256 | 4654 | #define PRECHARGE_COUNT_AT_ONCE 256 |
4656 | static int mem_cgroup_do_precharge(unsigned long count) | 4655 | static int mem_cgroup_do_precharge(unsigned long count) |
4657 | { | 4656 | { |
4658 | int ret = 0; | 4657 | int ret = 0; |
4659 | int batch_count = PRECHARGE_COUNT_AT_ONCE; | 4658 | int batch_count = PRECHARGE_COUNT_AT_ONCE; |
4660 | struct mem_cgroup *mem = mc.to; | 4659 | struct mem_cgroup *mem = mc.to; |
4661 | 4660 | ||
4662 | if (mem_cgroup_is_root(mem)) { | 4661 | if (mem_cgroup_is_root(mem)) { |
4663 | mc.precharge += count; | 4662 | mc.precharge += count; |
4664 | /* we don't need css_get for root */ | 4663 | /* we don't need css_get for root */ |
4665 | return ret; | 4664 | return ret; |
4666 | } | 4665 | } |
4667 | /* try to charge at once */ | 4666 | /* try to charge at once */ |
4668 | if (count > 1) { | 4667 | if (count > 1) { |
4669 | struct res_counter *dummy; | 4668 | struct res_counter *dummy; |
4670 | /* | 4669 | /* |
4671 | * "mem" cannot be under rmdir() because we've already checked | 4670 | * "mem" cannot be under rmdir() because we've already checked |
4672 | * by cgroup_lock_live_cgroup() that it is not removed and we | 4671 | * by cgroup_lock_live_cgroup() that it is not removed and we |
4673 | * are still under the same cgroup_mutex. So we can postpone | 4672 | * are still under the same cgroup_mutex. So we can postpone |
4674 | * css_get(). | 4673 | * css_get(). |
4675 | */ | 4674 | */ |
4676 | if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) | 4675 | if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) |
4677 | goto one_by_one; | 4676 | goto one_by_one; |
4678 | if (do_swap_account && res_counter_charge(&mem->memsw, | 4677 | if (do_swap_account && res_counter_charge(&mem->memsw, |
4679 | PAGE_SIZE * count, &dummy)) { | 4678 | PAGE_SIZE * count, &dummy)) { |
4680 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | 4679 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); |
4681 | goto one_by_one; | 4680 | goto one_by_one; |
4682 | } | 4681 | } |
4683 | mc.precharge += count; | 4682 | mc.precharge += count; |
4684 | return ret; | 4683 | return ret; |
4685 | } | 4684 | } |
4686 | one_by_one: | 4685 | one_by_one: |
4687 | /* fall back to one by one charge */ | 4686 | /* fall back to one by one charge */ |
4688 | while (count--) { | 4687 | while (count--) { |
4689 | if (signal_pending(current)) { | 4688 | if (signal_pending(current)) { |
4690 | ret = -EINTR; | 4689 | ret = -EINTR; |
4691 | break; | 4690 | break; |
4692 | } | 4691 | } |
4693 | if (!batch_count--) { | 4692 | if (!batch_count--) { |
4694 | batch_count = PRECHARGE_COUNT_AT_ONCE; | 4693 | batch_count = PRECHARGE_COUNT_AT_ONCE; |
4695 | cond_resched(); | 4694 | cond_resched(); |
4696 | } | 4695 | } |
4697 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); | 4696 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); |
4698 | if (ret || !mem) | 4697 | if (ret || !mem) |
4699 | /* mem_cgroup_clear_mc() will do uncharge later */ | 4698 | /* mem_cgroup_clear_mc() will do uncharge later */ |
4700 | return -ENOMEM; | 4699 | return -ENOMEM; |
4701 | mc.precharge++; | 4700 | mc.precharge++; |
4702 | } | 4701 | } |
4703 | return ret; | 4702 | return ret; |
4704 | } | 4703 | } |
4705 | 4704 | ||
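The batch path above charges both res and memsw in one go and must roll back the first counter when the second one fails, otherwise the memory counter would leak; the one_by_one path is only a fallback and reschedules every PRECHARGE_COUNT_AT_ONCE pages. The rollback idiom in isolation (illustrative function name):

	static int example_charge_mem_and_memsw(struct mem_cgroup *mem,
						unsigned long bytes)
	{
		struct res_counter *dummy;

		if (res_counter_charge(&mem->res, bytes, &dummy))
			return -ENOMEM;
		if (do_swap_account &&
		    res_counter_charge(&mem->memsw, bytes, &dummy)) {
			/* undo the first charge so the counters stay balanced */
			res_counter_uncharge(&mem->res, bytes);
			return -ENOMEM;
		}
		return 0;
	}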
4706 | /** | 4705 | /** |
4707 | * is_target_pte_for_mc - check a pte whether it is valid for move charge | 4706 | * is_target_pte_for_mc - check a pte whether it is valid for move charge |
4708 | * @vma: the vma the pte to be checked belongs | 4707 | * @vma: the vma the pte to be checked belongs |
4709 | * @addr: the address corresponding to the pte to be checked | 4708 | * @addr: the address corresponding to the pte to be checked |
4710 | * @ptent: the pte to be checked | 4709 | * @ptent: the pte to be checked |
4711 | * @target: the pointer where the target page or swap ent will be stored (can be NULL) | 4710 | * @target: the pointer where the target page or swap ent will be stored (can be NULL) |
4712 | * | 4711 | * |
4713 | * Returns | 4712 | * Returns |
4714 | * 0(MC_TARGET_NONE): if the pte is not a target for move charge. | 4713 | * 0(MC_TARGET_NONE): if the pte is not a target for move charge. |
4715 | * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for | 4714 | * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for |
4716 | * move charge. if @target is not NULL, the page is stored in target->page | 4715 | * move charge. if @target is not NULL, the page is stored in target->page |
4717 | * with extra refcnt got(Callers should handle it). | 4716 | * with extra refcnt got(Callers should handle it). |
4718 | * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a | 4717 | * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a |
4719 | * target for charge migration. if @target is not NULL, the entry is stored | 4718 | * target for charge migration. if @target is not NULL, the entry is stored |
4720 | * in target->ent. | 4719 | * in target->ent. |
4721 | * | 4720 | * |
4722 | * Called with pte lock held. | 4721 | * Called with pte lock held. |
4723 | */ | 4722 | */ |
4724 | union mc_target { | 4723 | union mc_target { |
4725 | struct page *page; | 4724 | struct page *page; |
4726 | swp_entry_t ent; | 4725 | swp_entry_t ent; |
4727 | }; | 4726 | }; |
4728 | 4727 | ||
4729 | enum mc_target_type { | 4728 | enum mc_target_type { |
4730 | MC_TARGET_NONE, /* not used */ | 4729 | MC_TARGET_NONE, /* not used */ |
4731 | MC_TARGET_PAGE, | 4730 | MC_TARGET_PAGE, |
4732 | MC_TARGET_SWAP, | 4731 | MC_TARGET_SWAP, |
4733 | }; | 4732 | }; |
4734 | 4733 | ||
4735 | static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | 4734 | static struct page *mc_handle_present_pte(struct vm_area_struct *vma, |
4736 | unsigned long addr, pte_t ptent) | 4735 | unsigned long addr, pte_t ptent) |
4737 | { | 4736 | { |
4738 | struct page *page = vm_normal_page(vma, addr, ptent); | 4737 | struct page *page = vm_normal_page(vma, addr, ptent); |
4739 | 4738 | ||
4740 | if (!page || !page_mapped(page)) | 4739 | if (!page || !page_mapped(page)) |
4741 | return NULL; | 4740 | return NULL; |
4742 | if (PageAnon(page)) { | 4741 | if (PageAnon(page)) { |
4743 | /* we don't move shared anon */ | 4742 | /* we don't move shared anon */ |
4744 | if (!move_anon() || page_mapcount(page) > 2) | 4743 | if (!move_anon() || page_mapcount(page) > 2) |
4745 | return NULL; | 4744 | return NULL; |
4746 | } else if (!move_file()) | 4745 | } else if (!move_file()) |
4747 | /* we ignore mapcount for file pages */ | 4746 | /* we ignore mapcount for file pages */ |
4748 | return NULL; | 4747 | return NULL; |
4749 | if (!get_page_unless_zero(page)) | 4748 | if (!get_page_unless_zero(page)) |
4750 | return NULL; | 4749 | return NULL; |
4751 | 4750 | ||
4752 | return page; | 4751 | return page; |
4753 | } | 4752 | } |
4754 | 4753 | ||
4755 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | 4754 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, |
4756 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | 4755 | unsigned long addr, pte_t ptent, swp_entry_t *entry) |
4757 | { | 4756 | { |
4758 | int usage_count; | 4757 | int usage_count; |
4759 | struct page *page = NULL; | 4758 | struct page *page = NULL; |
4760 | swp_entry_t ent = pte_to_swp_entry(ptent); | 4759 | swp_entry_t ent = pte_to_swp_entry(ptent); |
4761 | 4760 | ||
4762 | if (!move_anon() || non_swap_entry(ent)) | 4761 | if (!move_anon() || non_swap_entry(ent)) |
4763 | return NULL; | 4762 | return NULL; |
4764 | usage_count = mem_cgroup_count_swap_user(ent, &page); | 4763 | usage_count = mem_cgroup_count_swap_user(ent, &page); |
4765 | if (usage_count > 1) { /* we don't move shared anon */ | 4764 | if (usage_count > 1) { /* we don't move shared anon */ |
4766 | if (page) | 4765 | if (page) |
4767 | put_page(page); | 4766 | put_page(page); |
4768 | return NULL; | 4767 | return NULL; |
4769 | } | 4768 | } |
4770 | if (do_swap_account) | 4769 | if (do_swap_account) |
4771 | entry->val = ent.val; | 4770 | entry->val = ent.val; |
4772 | 4771 | ||
4773 | return page; | 4772 | return page; |
4774 | } | 4773 | } |
4775 | 4774 | ||
4776 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | 4775 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, |
4777 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | 4776 | unsigned long addr, pte_t ptent, swp_entry_t *entry) |
4778 | { | 4777 | { |
4779 | struct page *page = NULL; | 4778 | struct page *page = NULL; |
4780 | struct inode *inode; | 4779 | struct inode *inode; |
4781 | struct address_space *mapping; | 4780 | struct address_space *mapping; |
4782 | pgoff_t pgoff; | 4781 | pgoff_t pgoff; |
4783 | 4782 | ||
4784 | if (!vma->vm_file) /* anonymous vma */ | 4783 | if (!vma->vm_file) /* anonymous vma */ |
4785 | return NULL; | 4784 | return NULL; |
4786 | if (!move_file()) | 4785 | if (!move_file()) |
4787 | return NULL; | 4786 | return NULL; |
4788 | 4787 | ||
4789 | inode = vma->vm_file->f_path.dentry->d_inode; | 4788 | inode = vma->vm_file->f_path.dentry->d_inode; |
4790 | mapping = vma->vm_file->f_mapping; | 4789 | mapping = vma->vm_file->f_mapping; |
4791 | if (pte_none(ptent)) | 4790 | if (pte_none(ptent)) |
4792 | pgoff = linear_page_index(vma, addr); | 4791 | pgoff = linear_page_index(vma, addr); |
4793 | else /* pte_file(ptent) is true */ | 4792 | else /* pte_file(ptent) is true */ |
4794 | pgoff = pte_to_pgoff(ptent); | 4793 | pgoff = pte_to_pgoff(ptent); |
4795 | 4794 | ||
4796 | /* page is moved even if it's not RSS of this task(page-faulted). */ | 4795 | /* page is moved even if it's not RSS of this task(page-faulted). */ |
4797 | if (!mapping_cap_swap_backed(mapping)) { /* normal file */ | 4796 | if (!mapping_cap_swap_backed(mapping)) { /* normal file */ |
4798 | page = find_get_page(mapping, pgoff); | 4797 | page = find_get_page(mapping, pgoff); |
4799 | } else { /* shmem/tmpfs file. we should take account of swap too. */ | 4798 | } else { /* shmem/tmpfs file. we should take account of swap too. */ |
4800 | swp_entry_t ent; | 4799 | swp_entry_t ent; |
4801 | mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); | 4800 | mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); |
4802 | if (do_swap_account) | 4801 | if (do_swap_account) |
4803 | entry->val = ent.val; | 4802 | entry->val = ent.val; |
4804 | } | 4803 | } |
4805 | 4804 | ||
4806 | return page; | 4805 | return page; |
4807 | } | 4806 | } |
4808 | 4807 | ||
4809 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | 4808 | static int is_target_pte_for_mc(struct vm_area_struct *vma, |
4810 | unsigned long addr, pte_t ptent, union mc_target *target) | 4809 | unsigned long addr, pte_t ptent, union mc_target *target) |
4811 | { | 4810 | { |
4812 | struct page *page = NULL; | 4811 | struct page *page = NULL; |
4813 | struct page_cgroup *pc; | 4812 | struct page_cgroup *pc; |
4814 | int ret = 0; | 4813 | int ret = 0; |
4815 | swp_entry_t ent = { .val = 0 }; | 4814 | swp_entry_t ent = { .val = 0 }; |
4816 | 4815 | ||
4817 | if (pte_present(ptent)) | 4816 | if (pte_present(ptent)) |
4818 | page = mc_handle_present_pte(vma, addr, ptent); | 4817 | page = mc_handle_present_pte(vma, addr, ptent); |
4819 | else if (is_swap_pte(ptent)) | 4818 | else if (is_swap_pte(ptent)) |
4820 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); | 4819 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); |
4821 | else if (pte_none(ptent) || pte_file(ptent)) | 4820 | else if (pte_none(ptent) || pte_file(ptent)) |
4822 | page = mc_handle_file_pte(vma, addr, ptent, &ent); | 4821 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
4823 | 4822 | ||
4824 | if (!page && !ent.val) | 4823 | if (!page && !ent.val) |
4825 | return 0; | 4824 | return 0; |
4826 | if (page) { | 4825 | if (page) { |
4827 | pc = lookup_page_cgroup(page); | 4826 | pc = lookup_page_cgroup(page); |
4828 | /* | 4827 | /* |
4829 | * Do only loose check w/o page_cgroup lock. | 4828 | * Do only loose check w/o page_cgroup lock. |
4830 | * mem_cgroup_move_account() checks the pc is valid or not under | 4829 | * mem_cgroup_move_account() checks the pc is valid or not under |
4831 | * the lock. | 4830 | * the lock. |
4832 | */ | 4831 | */ |
4833 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | 4832 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { |
4834 | ret = MC_TARGET_PAGE; | 4833 | ret = MC_TARGET_PAGE; |
4835 | if (target) | 4834 | if (target) |
4836 | target->page = page; | 4835 | target->page = page; |
4837 | } | 4836 | } |
4838 | if (!ret || !target) | 4837 | if (!ret || !target) |
4839 | put_page(page); | 4838 | put_page(page); |
4840 | } | 4839 | } |
4841 | /* There is a swap entry and a page doesn't exist or isn't charged */ | 4840 | /* There is a swap entry and a page doesn't exist or isn't charged */ |
4842 | if (ent.val && !ret && | 4841 | if (ent.val && !ret && |
4843 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | 4842 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { |
4844 | ret = MC_TARGET_SWAP; | 4843 | ret = MC_TARGET_SWAP; |
4845 | if (target) | 4844 | if (target) |
4846 | target->ent = ent; | 4845 | target->ent = ent; |
4847 | } | 4846 | } |
4848 | return ret; | 4847 | return ret; |
4849 | } | 4848 | } |
4850 | 4849 | ||
4851 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | 4850 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, |
4852 | unsigned long addr, unsigned long end, | 4851 | unsigned long addr, unsigned long end, |
4853 | struct mm_walk *walk) | 4852 | struct mm_walk *walk) |
4854 | { | 4853 | { |
4855 | struct vm_area_struct *vma = walk->private; | 4854 | struct vm_area_struct *vma = walk->private; |
4856 | pte_t *pte; | 4855 | pte_t *pte; |
4857 | spinlock_t *ptl; | 4856 | spinlock_t *ptl; |
4858 | 4857 | ||
4859 | split_huge_page_pmd(walk->mm, pmd); | 4858 | split_huge_page_pmd(walk->mm, pmd); |
4860 | 4859 | ||
4861 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 4860 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
4862 | for (; addr != end; pte++, addr += PAGE_SIZE) | 4861 | for (; addr != end; pte++, addr += PAGE_SIZE) |
4863 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | 4862 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) |
4864 | mc.precharge++; /* increment precharge temporarily */ | 4863 | mc.precharge++; /* increment precharge temporarily */ |
4865 | pte_unmap_unlock(pte - 1, ptl); | 4864 | pte_unmap_unlock(pte - 1, ptl); |
4866 | cond_resched(); | 4865 | cond_resched(); |
4867 | 4866 | ||
4868 | return 0; | 4867 | return 0; |
4869 | } | 4868 | } |
4870 | 4869 | ||
4871 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | 4870 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) |
4872 | { | 4871 | { |
4873 | unsigned long precharge; | 4872 | unsigned long precharge; |
4874 | struct vm_area_struct *vma; | 4873 | struct vm_area_struct *vma; |
4875 | 4874 | ||
4876 | down_read(&mm->mmap_sem); | 4875 | down_read(&mm->mmap_sem); |
4877 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4876 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
4878 | struct mm_walk mem_cgroup_count_precharge_walk = { | 4877 | struct mm_walk mem_cgroup_count_precharge_walk = { |
4879 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | 4878 | .pmd_entry = mem_cgroup_count_precharge_pte_range, |
4880 | .mm = mm, | 4879 | .mm = mm, |
4881 | .private = vma, | 4880 | .private = vma, |
4882 | }; | 4881 | }; |
4883 | if (is_vm_hugetlb_page(vma)) | 4882 | if (is_vm_hugetlb_page(vma)) |
4884 | continue; | 4883 | continue; |
4885 | walk_page_range(vma->vm_start, vma->vm_end, | 4884 | walk_page_range(vma->vm_start, vma->vm_end, |
4886 | &mem_cgroup_count_precharge_walk); | 4885 | &mem_cgroup_count_precharge_walk); |
4887 | } | 4886 | } |
4888 | up_read(&mm->mmap_sem); | 4887 | up_read(&mm->mmap_sem); |
4889 | 4888 | ||
4890 | precharge = mc.precharge; | 4889 | precharge = mc.precharge; |
4891 | mc.precharge = 0; | 4890 | mc.precharge = 0; |
4892 | 4891 | ||
4893 | return precharge; | 4892 | return precharge; |
4894 | } | 4893 | } |
4895 | 4894 | ||
4896 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | 4895 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) |
4897 | { | 4896 | { |
4898 | unsigned long precharge = mem_cgroup_count_precharge(mm); | 4897 | unsigned long precharge = mem_cgroup_count_precharge(mm); |
4899 | 4898 | ||
4900 | VM_BUG_ON(mc.moving_task); | 4899 | VM_BUG_ON(mc.moving_task); |
4901 | mc.moving_task = current; | 4900 | mc.moving_task = current; |
4902 | return mem_cgroup_do_precharge(precharge); | 4901 | return mem_cgroup_do_precharge(precharge); |
4903 | } | 4902 | } |
4904 | 4903 | ||
4905 | /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ | 4904 | /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ |
4906 | static void __mem_cgroup_clear_mc(void) | 4905 | static void __mem_cgroup_clear_mc(void) |
4907 | { | 4906 | { |
4908 | struct mem_cgroup *from = mc.from; | 4907 | struct mem_cgroup *from = mc.from; |
4909 | struct mem_cgroup *to = mc.to; | 4908 | struct mem_cgroup *to = mc.to; |
4910 | 4909 | ||
4911 | /* we must uncharge all the leftover precharges from mc.to */ | 4910 | /* we must uncharge all the leftover precharges from mc.to */ |
4912 | if (mc.precharge) { | 4911 | if (mc.precharge) { |
4913 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | 4912 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); |
4914 | mc.precharge = 0; | 4913 | mc.precharge = 0; |
4915 | } | 4914 | } |
4916 | /* | 4915 | /* |
4917 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | 4916 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so |
4918 | * we must uncharge here. | 4917 | * we must uncharge here. |
4919 | */ | 4918 | */ |
4920 | if (mc.moved_charge) { | 4919 | if (mc.moved_charge) { |
4921 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | 4920 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); |
4922 | mc.moved_charge = 0; | 4921 | mc.moved_charge = 0; |
4923 | } | 4922 | } |
4924 | /* we must fixup refcnts and charges */ | 4923 | /* we must fixup refcnts and charges */ |
4925 | if (mc.moved_swap) { | 4924 | if (mc.moved_swap) { |
4926 | /* uncharge swap account from the old cgroup */ | 4925 | /* uncharge swap account from the old cgroup */ |
4927 | if (!mem_cgroup_is_root(mc.from)) | 4926 | if (!mem_cgroup_is_root(mc.from)) |
4928 | res_counter_uncharge(&mc.from->memsw, | 4927 | res_counter_uncharge(&mc.from->memsw, |
4929 | PAGE_SIZE * mc.moved_swap); | 4928 | PAGE_SIZE * mc.moved_swap); |
4930 | __mem_cgroup_put(mc.from, mc.moved_swap); | 4929 | __mem_cgroup_put(mc.from, mc.moved_swap); |
4931 | 4930 | ||
4932 | if (!mem_cgroup_is_root(mc.to)) { | 4931 | if (!mem_cgroup_is_root(mc.to)) { |
4933 | /* | 4932 | /* |
4934 | * we charged both to->res and to->memsw, so we should | 4933 | * we charged both to->res and to->memsw, so we should |
4935 | * uncharge to->res. | 4934 | * uncharge to->res. |
4936 | */ | 4935 | */ |
4937 | res_counter_uncharge(&mc.to->res, | 4936 | res_counter_uncharge(&mc.to->res, |
4938 | PAGE_SIZE * mc.moved_swap); | 4937 | PAGE_SIZE * mc.moved_swap); |
4939 | } | 4938 | } |
4940 | /* we've already done mem_cgroup_get(mc.to) */ | 4939 | /* we've already done mem_cgroup_get(mc.to) */ |
4941 | mc.moved_swap = 0; | 4940 | mc.moved_swap = 0; |
4942 | } | 4941 | } |
4943 | memcg_oom_recover(from); | 4942 | memcg_oom_recover(from); |
4944 | memcg_oom_recover(to); | 4943 | memcg_oom_recover(to); |
4945 | wake_up_all(&mc.waitq); | 4944 | wake_up_all(&mc.waitq); |
4946 | } | 4945 | } |
4947 | 4946 | ||
4948 | static void mem_cgroup_clear_mc(void) | 4947 | static void mem_cgroup_clear_mc(void) |
4949 | { | 4948 | { |
4950 | struct mem_cgroup *from = mc.from; | 4949 | struct mem_cgroup *from = mc.from; |
4951 | 4950 | ||
4952 | /* | 4951 | /* |
4953 | * we must clear moving_task before waking up waiters at the end of | 4952 | * we must clear moving_task before waking up waiters at the end of |
4954 | * task migration. | 4953 | * task migration. |
4955 | */ | 4954 | */ |
4956 | mc.moving_task = NULL; | 4955 | mc.moving_task = NULL; |
4957 | __mem_cgroup_clear_mc(); | 4956 | __mem_cgroup_clear_mc(); |
4958 | spin_lock(&mc.lock); | 4957 | spin_lock(&mc.lock); |
4959 | mc.from = NULL; | 4958 | mc.from = NULL; |
4960 | mc.to = NULL; | 4959 | mc.to = NULL; |
4961 | spin_unlock(&mc.lock); | 4960 | spin_unlock(&mc.lock); |
4962 | mem_cgroup_end_move(from); | 4961 | mem_cgroup_end_move(from); |
4963 | } | 4962 | } |
4964 | 4963 | ||
4965 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | 4964 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, |
4966 | struct cgroup *cgroup, | 4965 | struct cgroup *cgroup, |
4967 | struct task_struct *p) | 4966 | struct task_struct *p) |
4968 | { | 4967 | { |
4969 | int ret = 0; | 4968 | int ret = 0; |
4970 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); | 4969 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); |
4971 | 4970 | ||
4972 | if (mem->move_charge_at_immigrate) { | 4971 | if (mem->move_charge_at_immigrate) { |
4973 | struct mm_struct *mm; | 4972 | struct mm_struct *mm; |
4974 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 4973 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
4975 | 4974 | ||
4976 | VM_BUG_ON(from == mem); | 4975 | VM_BUG_ON(from == mem); |
4977 | 4976 | ||
4978 | mm = get_task_mm(p); | 4977 | mm = get_task_mm(p); |
4979 | if (!mm) | 4978 | if (!mm) |
4980 | return 0; | 4979 | return 0; |
4981 | /* We move charges only when we move an owner of the mm */ | 4980 | /* We move charges only when we move an owner of the mm */ |
4982 | if (mm->owner == p) { | 4981 | if (mm->owner == p) { |
4983 | VM_BUG_ON(mc.from); | 4982 | VM_BUG_ON(mc.from); |
4984 | VM_BUG_ON(mc.to); | 4983 | VM_BUG_ON(mc.to); |
4985 | VM_BUG_ON(mc.precharge); | 4984 | VM_BUG_ON(mc.precharge); |
4986 | VM_BUG_ON(mc.moved_charge); | 4985 | VM_BUG_ON(mc.moved_charge); |
4987 | VM_BUG_ON(mc.moved_swap); | 4986 | VM_BUG_ON(mc.moved_swap); |
4988 | mem_cgroup_start_move(from); | 4987 | mem_cgroup_start_move(from); |
4989 | spin_lock(&mc.lock); | 4988 | spin_lock(&mc.lock); |
4990 | mc.from = from; | 4989 | mc.from = from; |
4991 | mc.to = mem; | 4990 | mc.to = mem; |
4992 | spin_unlock(&mc.lock); | 4991 | spin_unlock(&mc.lock); |
4993 | /* We set mc.moving_task later */ | 4992 | /* We set mc.moving_task later */ |
4994 | 4993 | ||
4995 | ret = mem_cgroup_precharge_mc(mm); | 4994 | ret = mem_cgroup_precharge_mc(mm); |
4996 | if (ret) | 4995 | if (ret) |
4997 | mem_cgroup_clear_mc(); | 4996 | mem_cgroup_clear_mc(); |
4998 | } | 4997 | } |
4999 | mmput(mm); | 4998 | mmput(mm); |
5000 | } | 4999 | } |
5001 | return ret; | 5000 | return ret; |
5002 | } | 5001 | } |
5003 | 5002 | ||
5004 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | 5003 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, |
5005 | struct cgroup *cgroup, | 5004 | struct cgroup *cgroup, |
5006 | struct task_struct *p) | 5005 | struct task_struct *p) |
5007 | { | 5006 | { |
5008 | mem_cgroup_clear_mc(); | 5007 | mem_cgroup_clear_mc(); |
5009 | } | 5008 | } |
5010 | 5009 | ||
5011 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | 5010 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, |
5012 | unsigned long addr, unsigned long end, | 5011 | unsigned long addr, unsigned long end, |
5013 | struct mm_walk *walk) | 5012 | struct mm_walk *walk) |
5014 | { | 5013 | { |
5015 | int ret = 0; | 5014 | int ret = 0; |
5016 | struct vm_area_struct *vma = walk->private; | 5015 | struct vm_area_struct *vma = walk->private; |
5017 | pte_t *pte; | 5016 | pte_t *pte; |
5018 | spinlock_t *ptl; | 5017 | spinlock_t *ptl; |
5019 | 5018 | ||
5020 | split_huge_page_pmd(walk->mm, pmd); | 5019 | split_huge_page_pmd(walk->mm, pmd); |
5021 | retry: | 5020 | retry: |
5022 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 5021 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
5023 | for (; addr != end; addr += PAGE_SIZE) { | 5022 | for (; addr != end; addr += PAGE_SIZE) { |
5024 | pte_t ptent = *(pte++); | 5023 | pte_t ptent = *(pte++); |
5025 | union mc_target target; | 5024 | union mc_target target; |
5026 | int type; | 5025 | int type; |
5027 | struct page *page; | 5026 | struct page *page; |
5028 | struct page_cgroup *pc; | 5027 | struct page_cgroup *pc; |
5029 | swp_entry_t ent; | 5028 | swp_entry_t ent; |
5030 | 5029 | ||
5031 | if (!mc.precharge) | 5030 | if (!mc.precharge) |
5032 | break; | 5031 | break; |
5033 | 5032 | ||
5034 | type = is_target_pte_for_mc(vma, addr, ptent, &target); | 5033 | type = is_target_pte_for_mc(vma, addr, ptent, &target); |
5035 | switch (type) { | 5034 | switch (type) { |
5036 | case MC_TARGET_PAGE: | 5035 | case MC_TARGET_PAGE: |
5037 | page = target.page; | 5036 | page = target.page; |
5038 | if (isolate_lru_page(page)) | 5037 | if (isolate_lru_page(page)) |
5039 | goto put; | 5038 | goto put; |
5040 | pc = lookup_page_cgroup(page); | 5039 | pc = lookup_page_cgroup(page); |
5041 | if (!mem_cgroup_move_account(page, 1, pc, | 5040 | if (!mem_cgroup_move_account(page, 1, pc, |
5042 | mc.from, mc.to, false)) { | 5041 | mc.from, mc.to, false)) { |
5043 | mc.precharge--; | 5042 | mc.precharge--; |
5044 | /* we uncharge from mc.from later. */ | 5043 | /* we uncharge from mc.from later. */ |
5045 | mc.moved_charge++; | 5044 | mc.moved_charge++; |
5046 | } | 5045 | } |
5047 | putback_lru_page(page); | 5046 | putback_lru_page(page); |
5048 | put: /* is_target_pte_for_mc() gets the page */ | 5047 | put: /* is_target_pte_for_mc() gets the page */ |
5049 | put_page(page); | 5048 | put_page(page); |
5050 | break; | 5049 | break; |
5051 | case MC_TARGET_SWAP: | 5050 | case MC_TARGET_SWAP: |
5052 | ent = target.ent; | 5051 | ent = target.ent; |
5053 | if (!mem_cgroup_move_swap_account(ent, | 5052 | if (!mem_cgroup_move_swap_account(ent, |
5054 | mc.from, mc.to, false)) { | 5053 | mc.from, mc.to, false)) { |
5055 | mc.precharge--; | 5054 | mc.precharge--; |
5056 | /* we fixup refcnts and charges later. */ | 5055 | /* we fixup refcnts and charges later. */ |
5057 | mc.moved_swap++; | 5056 | mc.moved_swap++; |
5058 | } | 5057 | } |
5059 | break; | 5058 | break; |
5060 | default: | 5059 | default: |
5061 | break; | 5060 | break; |
5062 | } | 5061 | } |
5063 | } | 5062 | } |
5064 | pte_unmap_unlock(pte - 1, ptl); | 5063 | pte_unmap_unlock(pte - 1, ptl); |
5065 | cond_resched(); | 5064 | cond_resched(); |
5066 | 5065 | ||
5067 | if (addr != end) { | 5066 | if (addr != end) { |
5068 | /* | 5067 | /* |
5069 | * We have consumed all precharges we got in can_attach(). | 5068 | * We have consumed all precharges we got in can_attach(). |
5070 | * We try to charge one by one, but don't do any additional | 5069 | * We try to charge one by one, but don't do any additional |
5071 | * charges to mc.to if we have failed to charge once in the attach() | 5070 | * charges to mc.to if we have failed to charge once in the attach() |
5072 | * phase. | 5071 | * phase. |
5073 | */ | 5072 | */ |
5074 | ret = mem_cgroup_do_precharge(1); | 5073 | ret = mem_cgroup_do_precharge(1); |
5075 | if (!ret) | 5074 | if (!ret) |
5076 | goto retry; | 5075 | goto retry; |
5077 | } | 5076 | } |
5078 | 5077 | ||
5079 | return ret; | 5078 | return ret; |
5080 | } | 5079 | } |
5081 | 5080 | ||
5082 | static void mem_cgroup_move_charge(struct mm_struct *mm) | 5081 | static void mem_cgroup_move_charge(struct mm_struct *mm) |
5083 | { | 5082 | { |
5084 | struct vm_area_struct *vma; | 5083 | struct vm_area_struct *vma; |
5085 | 5084 | ||
5086 | lru_add_drain_all(); | 5085 | lru_add_drain_all(); |
5087 | retry: | 5086 | retry: |
5088 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { | 5087 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { |
5089 | /* | 5088 | /* |
5090 | * Someone who is holding the mmap_sem might be waiting on the | 5089 | * Someone who is holding the mmap_sem might be waiting on the |
5091 | * waitq. So we cancel all extra charges, wake up all waiters, | 5090 | * waitq. So we cancel all extra charges, wake up all waiters, |
5092 | * and retry. Because we cancel precharges, we might not be able | 5091 | * and retry. Because we cancel precharges, we might not be able |
5093 | * to move enough charges, but moving charge is a best-effort | 5092 | * to move enough charges, but moving charge is a best-effort |
5094 | * feature anyway, so it wouldn't be a big problem. | 5093 | * feature anyway, so it wouldn't be a big problem. |
5095 | */ | 5094 | */ |
5096 | __mem_cgroup_clear_mc(); | 5095 | __mem_cgroup_clear_mc(); |
5097 | cond_resched(); | 5096 | cond_resched(); |
5098 | goto retry; | 5097 | goto retry; |
5099 | } | 5098 | } |
5100 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 5099 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
5101 | int ret; | 5100 | int ret; |
5102 | struct mm_walk mem_cgroup_move_charge_walk = { | 5101 | struct mm_walk mem_cgroup_move_charge_walk = { |
5103 | .pmd_entry = mem_cgroup_move_charge_pte_range, | 5102 | .pmd_entry = mem_cgroup_move_charge_pte_range, |
5104 | .mm = mm, | 5103 | .mm = mm, |
5105 | .private = vma, | 5104 | .private = vma, |
5106 | }; | 5105 | }; |
5107 | if (is_vm_hugetlb_page(vma)) | 5106 | if (is_vm_hugetlb_page(vma)) |
5108 | continue; | 5107 | continue; |
5109 | ret = walk_page_range(vma->vm_start, vma->vm_end, | 5108 | ret = walk_page_range(vma->vm_start, vma->vm_end, |
5110 | &mem_cgroup_move_charge_walk); | 5109 | &mem_cgroup_move_charge_walk); |
5111 | if (ret) | 5110 | if (ret) |
5112 | /* | 5111 | /* |
5113 | * means we have consumed all precharges and failed to | 5112 | * means we have consumed all precharges and failed to |
5114 | * do an additional charge. Just abandon here. | 5113 | * do an additional charge. Just abandon here. |
5115 | */ | 5114 | */ |
5116 | break; | 5115 | break; |
5117 | } | 5116 | } |
5118 | up_read(&mm->mmap_sem); | 5117 | up_read(&mm->mmap_sem); |
5119 | } | 5118 | } |
5120 | 5119 | ||
5121 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 5120 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
5122 | struct cgroup *cont, | 5121 | struct cgroup *cont, |
5123 | struct cgroup *old_cont, | 5122 | struct cgroup *old_cont, |
5124 | struct task_struct *p) | 5123 | struct task_struct *p) |
5125 | { | 5124 | { |
5126 | struct mm_struct *mm; | 5125 | struct mm_struct *mm; |
5127 | 5126 | ||
5128 | if (!mc.to) | 5127 | if (!mc.to) |
5129 | /* no need to move charge */ | 5128 | /* no need to move charge */ |
5130 | return; | 5129 | return; |
5131 | 5130 | ||
5132 | mm = get_task_mm(p); | 5131 | mm = get_task_mm(p); |
5133 | if (mm) { | 5132 | if (mm) { |
5134 | mem_cgroup_move_charge(mm); | 5133 | mem_cgroup_move_charge(mm); |
5135 | mmput(mm); | 5134 | mmput(mm); |
5136 | } | 5135 | } |
5137 | mem_cgroup_clear_mc(); | 5136 | mem_cgroup_clear_mc(); |
5138 | } | 5137 | } |
5139 | #else /* !CONFIG_MMU */ | 5138 | #else /* !CONFIG_MMU */ |
5140 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | 5139 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, |
5141 | struct cgroup *cgroup, | 5140 | struct cgroup *cgroup, |
5142 | struct task_struct *p) | 5141 | struct task_struct *p) |
5143 | { | 5142 | { |
5144 | return 0; | 5143 | return 0; |
5145 | } | 5144 | } |
5146 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | 5145 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, |
5147 | struct cgroup *cgroup, | 5146 | struct cgroup *cgroup, |
5148 | struct task_struct *p) | 5147 | struct task_struct *p) |
5149 | { | 5148 | { |
5150 | } | 5149 | } |
5151 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 5150 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
5152 | struct cgroup *cont, | 5151 | struct cgroup *cont, |
5153 | struct cgroup *old_cont, | 5152 | struct cgroup *old_cont, |
5154 | struct task_struct *p) | 5153 | struct task_struct *p) |
5155 | { | 5154 | { |
5156 | } | 5155 | } |
5157 | #endif | 5156 | #endif |
5158 | 5157 | ||
5159 | struct cgroup_subsys mem_cgroup_subsys = { | 5158 | struct cgroup_subsys mem_cgroup_subsys = { |
5160 | .name = "memory", | 5159 | .name = "memory", |
5161 | .subsys_id = mem_cgroup_subsys_id, | 5160 | .subsys_id = mem_cgroup_subsys_id, |
5162 | .create = mem_cgroup_create, | 5161 | .create = mem_cgroup_create, |
5163 | .pre_destroy = mem_cgroup_pre_destroy, | 5162 | .pre_destroy = mem_cgroup_pre_destroy, |
5164 | .destroy = mem_cgroup_destroy, | 5163 | .destroy = mem_cgroup_destroy, |
5165 | .populate = mem_cgroup_populate, | 5164 | .populate = mem_cgroup_populate, |
5166 | .can_attach = mem_cgroup_can_attach, | 5165 | .can_attach = mem_cgroup_can_attach, |
5167 | .cancel_attach = mem_cgroup_cancel_attach, | 5166 | .cancel_attach = mem_cgroup_cancel_attach, |
5168 | .attach = mem_cgroup_move_task, | 5167 | .attach = mem_cgroup_move_task, |
5169 | .early_init = 0, | 5168 | .early_init = 0, |
5170 | .use_id = 1, | 5169 | .use_id = 1, |
5171 | }; | 5170 | }; |
5172 | 5171 | ||
5173 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 5172 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
5174 | static int __init enable_swap_account(char *s) | 5173 | static int __init enable_swap_account(char *s) |
5175 | { | 5174 | { |
5176 | /* consider enabled if no parameter or 1 is given */ | 5175 | /* consider enabled if no parameter or 1 is given */ |
5177 | if (!strcmp(s, "1")) | 5176 | if (!strcmp(s, "1")) |
5178 | really_do_swap_account = 1; | 5177 | really_do_swap_account = 1; |
5179 | else if (!strcmp(s, "0")) | 5178 | else if (!strcmp(s, "0")) |
5180 | really_do_swap_account = 0; | 5179 | really_do_swap_account = 0; |
5181 | return 1; | 5180 | return 1; |
5182 | } | 5181 | } |
5183 | __setup("swapaccount=", enable_swap_account); | 5182 | __setup("swapaccount=", enable_swap_account); |
5184 | 5183 | ||
5185 | #endif | 5184 | #endif |
5186 | 5185 ||