Commit c8dad2bb6307f5b00f804a686917105206a4d5c9
Committed by Linus Torvalds
1 parent: f817ed4853
Exists in master and in 4 other branches
memcg: reduce size of mem_cgroup by using nr_cpu_ids
As Jan Blunck <jblunck@suse.de> pointed out, allocating the per-cpu stats for memcg with a size of NR_CPUS is wasteful. This patch changes mem_cgroup's cpustat allocation to be based on nr_cpu_ids rather than NR_CPUS.

Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
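The hunks shown below replace the fixed cpustat[NR_CPUS] array with a zero-length array at the end of struct mem_cgroup and drop the static init_mem_cgroup; the matching allocation-side change appears further down in the diff, past the portion shown here. As a rough sketch of the sizing idea only (the function name and the kmalloc/vmalloc split are illustrative, not a quote of the patch), the per-cpu stat area is allocated together with struct mem_cgroup and sized by nr_cpu_ids, the number of possible CPU ids on the running system, instead of the compile-time NR_CPUS maximum. The snippet relies on the slab.h and vmalloc.h includes already present in memcontrol.c.

/* Sketch: size the flexible cpustat[] tail by nr_cpu_ids, not NR_CPUS. */
static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;
	size_t size = sizeof(struct mem_cgroup) +
		      nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);

	/* Small allocations go through the slab; large ones fall back to vmalloc. */
	if (size < PAGE_SIZE)
		mem = kmalloc(size, GFP_KERNEL);
	else
		mem = vmalloc(size);

	if (mem)
		memset(mem, 0, size);
	return mem;
}

On a kernel built with NR_CPUS=4096 but booted on a machine with a handful of CPUs, this shrinks each mem_cgroup from thousands of cacheline-aligned per-cpu slots to only as many as can actually exist.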
Showing 1 changed file with 18 additions and 17 deletions
mm/memcontrol.c
1 | /* memcontrol.c - Memory Controller | 1 | /* memcontrol.c - Memory Controller |
2 | * | 2 | * |
3 | * Copyright IBM Corporation, 2007 | 3 | * Copyright IBM Corporation, 2007 |
4 | * Author Balbir Singh <balbir@linux.vnet.ibm.com> | 4 | * Author Balbir Singh <balbir@linux.vnet.ibm.com> |
5 | * | 5 | * |
6 | * Copyright 2007 OpenVZ SWsoft Inc | 6 | * Copyright 2007 OpenVZ SWsoft Inc |
7 | * Author: Pavel Emelianov <xemul@openvz.org> | 7 | * Author: Pavel Emelianov <xemul@openvz.org> |
8 | * | 8 | * |
9 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
11 | * the Free Software Foundation; either version 2 of the License, or | 11 | * the Free Software Foundation; either version 2 of the License, or |
12 | * (at your option) any later version. | 12 | * (at your option) any later version. |
13 | * | 13 | * |
14 | * This program is distributed in the hope that it will be useful, | 14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. | 17 | * GNU General Public License for more details. |
18 | */ | 18 | */ |
19 | 19 | ||
20 | #include <linux/res_counter.h> | 20 | #include <linux/res_counter.h> |
21 | #include <linux/memcontrol.h> | 21 | #include <linux/memcontrol.h> |
22 | #include <linux/cgroup.h> | 22 | #include <linux/cgroup.h> |
23 | #include <linux/mm.h> | 23 | #include <linux/mm.h> |
24 | #include <linux/smp.h> | 24 | #include <linux/smp.h> |
25 | #include <linux/page-flags.h> | 25 | #include <linux/page-flags.h> |
26 | #include <linux/backing-dev.h> | 26 | #include <linux/backing-dev.h> |
27 | #include <linux/bit_spinlock.h> | 27 | #include <linux/bit_spinlock.h> |
28 | #include <linux/rcupdate.h> | 28 | #include <linux/rcupdate.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/swap.h> | 30 | #include <linux/swap.h> |
31 | #include <linux/spinlock.h> | 31 | #include <linux/spinlock.h> |
32 | #include <linux/fs.h> | 32 | #include <linux/fs.h> |
33 | #include <linux/seq_file.h> | 33 | #include <linux/seq_file.h> |
34 | #include <linux/vmalloc.h> | 34 | #include <linux/vmalloc.h> |
35 | #include <linux/mm_inline.h> | 35 | #include <linux/mm_inline.h> |
36 | #include <linux/page_cgroup.h> | 36 | #include <linux/page_cgroup.h> |
37 | 37 | ||
38 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
39 | 39 | ||
40 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 40 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
41 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 41 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
42 | 42 | ||
43 | /* | 43 | /* |
44 | * Statistics for memory cgroup. | 44 | * Statistics for memory cgroup. |
45 | */ | 45 | */ |
46 | enum mem_cgroup_stat_index { | 46 | enum mem_cgroup_stat_index { |
47 | /* | 47 | /* |
48 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | 48 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. |
49 | */ | 49 | */ |
50 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 50 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
51 | MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ | 51 | MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ |
52 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 52 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
53 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 53 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
54 | 54 | ||
55 | MEM_CGROUP_STAT_NSTATS, | 55 | MEM_CGROUP_STAT_NSTATS, |
56 | }; | 56 | }; |
57 | 57 | ||
58 | struct mem_cgroup_stat_cpu { | 58 | struct mem_cgroup_stat_cpu { |
59 | s64 count[MEM_CGROUP_STAT_NSTATS]; | 59 | s64 count[MEM_CGROUP_STAT_NSTATS]; |
60 | } ____cacheline_aligned_in_smp; | 60 | } ____cacheline_aligned_in_smp; |
61 | 61 | ||
62 | struct mem_cgroup_stat { | 62 | struct mem_cgroup_stat { |
63 | struct mem_cgroup_stat_cpu cpustat[NR_CPUS]; | 63 | struct mem_cgroup_stat_cpu cpustat[0]; |
64 | }; | 64 | }; |
65 | 65 | ||
66 | /* | 66 | /* |
67 | * For accounting under irq disable, no need for increment preempt count. | 67 | * For accounting under irq disable, no need for increment preempt count. |
68 | */ | 68 | */ |
69 | static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, | 69 | static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, |
70 | enum mem_cgroup_stat_index idx, int val) | 70 | enum mem_cgroup_stat_index idx, int val) |
71 | { | 71 | { |
72 | stat->count[idx] += val; | 72 | stat->count[idx] += val; |
73 | } | 73 | } |
74 | 74 | ||
75 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | 75 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, |
76 | enum mem_cgroup_stat_index idx) | 76 | enum mem_cgroup_stat_index idx) |
77 | { | 77 | { |
78 | int cpu; | 78 | int cpu; |
79 | s64 ret = 0; | 79 | s64 ret = 0; |
80 | for_each_possible_cpu(cpu) | 80 | for_each_possible_cpu(cpu) |
81 | ret += stat->cpustat[cpu].count[idx]; | 81 | ret += stat->cpustat[cpu].count[idx]; |
82 | return ret; | 82 | return ret; |
83 | } | 83 | } |
84 | 84 | ||
85 | /* | 85 | /* |
86 | * per-zone information in memory controller. | 86 | * per-zone information in memory controller. |
87 | */ | 87 | */ |
88 | struct mem_cgroup_per_zone { | 88 | struct mem_cgroup_per_zone { |
89 | /* | 89 | /* |
90 | * spin_lock to protect the per cgroup LRU | 90 | * spin_lock to protect the per cgroup LRU |
91 | */ | 91 | */ |
92 | spinlock_t lru_lock; | 92 | spinlock_t lru_lock; |
93 | struct list_head lists[NR_LRU_LISTS]; | 93 | struct list_head lists[NR_LRU_LISTS]; |
94 | unsigned long count[NR_LRU_LISTS]; | 94 | unsigned long count[NR_LRU_LISTS]; |
95 | }; | 95 | }; |
96 | /* Macro for accessing counter */ | 96 | /* Macro for accessing counter */ |
97 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | 97 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) |
98 | 98 | ||
99 | struct mem_cgroup_per_node { | 99 | struct mem_cgroup_per_node { |
100 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 100 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
101 | }; | 101 | }; |
102 | 102 | ||
103 | struct mem_cgroup_lru_info { | 103 | struct mem_cgroup_lru_info { |
104 | struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; | 104 | struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; |
105 | }; | 105 | }; |
106 | 106 | ||
107 | /* | 107 | /* |
108 | * The memory controller data structure. The memory controller controls both | 108 | * The memory controller data structure. The memory controller controls both |
109 | * page cache and RSS per cgroup. We would eventually like to provide | 109 | * page cache and RSS per cgroup. We would eventually like to provide |
110 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | 110 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
111 | * to help the administrator determine what knobs to tune. | 111 | * to help the administrator determine what knobs to tune. |
112 | * | 112 | * |
113 | * TODO: Add a water mark for the memory controller. Reclaim will begin when | 113 | * TODO: Add a water mark for the memory controller. Reclaim will begin when |
114 | * we hit the water mark. May be even add a low water mark, such that | 114 | * we hit the water mark. May be even add a low water mark, such that |
115 | * no reclaim occurs from a cgroup at it's low water mark, this is | 115 | * no reclaim occurs from a cgroup at it's low water mark, this is |
116 | * a feature that will be implemented much later in the future. | 116 | * a feature that will be implemented much later in the future. |
117 | */ | 117 | */ |
118 | struct mem_cgroup { | 118 | struct mem_cgroup { |
119 | struct cgroup_subsys_state css; | 119 | struct cgroup_subsys_state css; |
120 | /* | 120 | /* |
121 | * the counter to account for memory usage | 121 | * the counter to account for memory usage |
122 | */ | 122 | */ |
123 | struct res_counter res; | 123 | struct res_counter res; |
124 | /* | 124 | /* |
125 | * Per cgroup active and inactive list, similar to the | 125 | * Per cgroup active and inactive list, similar to the |
126 | * per zone LRU lists. | 126 | * per zone LRU lists. |
127 | */ | 127 | */ |
128 | struct mem_cgroup_lru_info info; | 128 | struct mem_cgroup_lru_info info; |
129 | 129 | ||
130 | int prev_priority; /* for recording reclaim priority */ | 130 | int prev_priority; /* for recording reclaim priority */ |
131 | /* | 131 | /* |
132 | * statistics. | 132 | * statistics. This must be placed at the end of memcg. |
133 | */ | 133 | */ |
134 | struct mem_cgroup_stat stat; | 134 | struct mem_cgroup_stat stat; |
135 | }; | 135 | }; |
136 | static struct mem_cgroup init_mem_cgroup; | ||
137 | 136 | ||
138 | enum charge_type { | 137 | enum charge_type { |
139 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 138 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
140 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 139 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
141 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | 140 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ |
142 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | 141 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ |
143 | NR_CHARGE_TYPE, | 142 | NR_CHARGE_TYPE, |
144 | }; | 143 | }; |
145 | 144 | ||
146 | /* only for here (for easy reading.) */ | 145 | /* only for here (for easy reading.) */ |
147 | #define PCGF_CACHE (1UL << PCG_CACHE) | 146 | #define PCGF_CACHE (1UL << PCG_CACHE) |
148 | #define PCGF_USED (1UL << PCG_USED) | 147 | #define PCGF_USED (1UL << PCG_USED) |
149 | #define PCGF_ACTIVE (1UL << PCG_ACTIVE) | 148 | #define PCGF_ACTIVE (1UL << PCG_ACTIVE) |
150 | #define PCGF_LOCK (1UL << PCG_LOCK) | 149 | #define PCGF_LOCK (1UL << PCG_LOCK) |
151 | #define PCGF_FILE (1UL << PCG_FILE) | 150 | #define PCGF_FILE (1UL << PCG_FILE) |
152 | static const unsigned long | 151 | static const unsigned long |
153 | pcg_default_flags[NR_CHARGE_TYPE] = { | 152 | pcg_default_flags[NR_CHARGE_TYPE] = { |
154 | PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ | 153 | PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ |
155 | PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ | 154 | PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ |
156 | PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | 155 | PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ |
157 | 0, /* FORCE */ | 156 | 0, /* FORCE */ |
158 | }; | 157 | }; |
159 | 158 | ||
160 | /* | 159 | /* |
161 | * Always modified under lru lock. Then, not necessary to preempt_disable() | 160 | * Always modified under lru lock. Then, not necessary to preempt_disable() |
162 | */ | 161 | */ |
163 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 162 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
164 | struct page_cgroup *pc, | 163 | struct page_cgroup *pc, |
165 | bool charge) | 164 | bool charge) |
166 | { | 165 | { |
167 | int val = (charge)? 1 : -1; | 166 | int val = (charge)? 1 : -1; |
168 | struct mem_cgroup_stat *stat = &mem->stat; | 167 | struct mem_cgroup_stat *stat = &mem->stat; |
169 | struct mem_cgroup_stat_cpu *cpustat; | 168 | struct mem_cgroup_stat_cpu *cpustat; |
170 | 169 | ||
171 | VM_BUG_ON(!irqs_disabled()); | 170 | VM_BUG_ON(!irqs_disabled()); |
172 | 171 | ||
173 | cpustat = &stat->cpustat[smp_processor_id()]; | 172 | cpustat = &stat->cpustat[smp_processor_id()]; |
174 | if (PageCgroupCache(pc)) | 173 | if (PageCgroupCache(pc)) |
175 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | 174 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); |
176 | else | 175 | else |
177 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); | 176 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); |
178 | 177 | ||
179 | if (charge) | 178 | if (charge) |
180 | __mem_cgroup_stat_add_safe(cpustat, | 179 | __mem_cgroup_stat_add_safe(cpustat, |
181 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); | 180 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); |
182 | else | 181 | else |
183 | __mem_cgroup_stat_add_safe(cpustat, | 182 | __mem_cgroup_stat_add_safe(cpustat, |
184 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 183 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
185 | } | 184 | } |
186 | 185 | ||
187 | static struct mem_cgroup_per_zone * | 186 | static struct mem_cgroup_per_zone * |
188 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 187 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
189 | { | 188 | { |
190 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 189 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; |
191 | } | 190 | } |
192 | 191 | ||
193 | static struct mem_cgroup_per_zone * | 192 | static struct mem_cgroup_per_zone * |
194 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 193 | page_cgroup_zoneinfo(struct page_cgroup *pc) |
195 | { | 194 | { |
196 | struct mem_cgroup *mem = pc->mem_cgroup; | 195 | struct mem_cgroup *mem = pc->mem_cgroup; |
197 | int nid = page_cgroup_nid(pc); | 196 | int nid = page_cgroup_nid(pc); |
198 | int zid = page_cgroup_zid(pc); | 197 | int zid = page_cgroup_zid(pc); |
199 | 198 | ||
200 | return mem_cgroup_zoneinfo(mem, nid, zid); | 199 | return mem_cgroup_zoneinfo(mem, nid, zid); |
201 | } | 200 | } |
202 | 201 | ||
203 | static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, | 202 | static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, |
204 | enum lru_list idx) | 203 | enum lru_list idx) |
205 | { | 204 | { |
206 | int nid, zid; | 205 | int nid, zid; |
207 | struct mem_cgroup_per_zone *mz; | 206 | struct mem_cgroup_per_zone *mz; |
208 | u64 total = 0; | 207 | u64 total = 0; |
209 | 208 | ||
210 | for_each_online_node(nid) | 209 | for_each_online_node(nid) |
211 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 210 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
212 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | 211 | mz = mem_cgroup_zoneinfo(mem, nid, zid); |
213 | total += MEM_CGROUP_ZSTAT(mz, idx); | 212 | total += MEM_CGROUP_ZSTAT(mz, idx); |
214 | } | 213 | } |
215 | return total; | 214 | return total; |
216 | } | 215 | } |
217 | 216 | ||
218 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 217 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
219 | { | 218 | { |
220 | return container_of(cgroup_subsys_state(cont, | 219 | return container_of(cgroup_subsys_state(cont, |
221 | mem_cgroup_subsys_id), struct mem_cgroup, | 220 | mem_cgroup_subsys_id), struct mem_cgroup, |
222 | css); | 221 | css); |
223 | } | 222 | } |
224 | 223 | ||
225 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | 224 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
226 | { | 225 | { |
227 | /* | 226 | /* |
228 | * mm_update_next_owner() may clear mm->owner to NULL | 227 | * mm_update_next_owner() may clear mm->owner to NULL |
229 | * if it races with swapoff, page migration, etc. | 228 | * if it races with swapoff, page migration, etc. |
230 | * So this can be called with p == NULL. | 229 | * So this can be called with p == NULL. |
231 | */ | 230 | */ |
232 | if (unlikely(!p)) | 231 | if (unlikely(!p)) |
233 | return NULL; | 232 | return NULL; |
234 | 233 | ||
235 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | 234 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), |
236 | struct mem_cgroup, css); | 235 | struct mem_cgroup, css); |
237 | } | 236 | } |
238 | 237 | ||
239 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, | 238 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, |
240 | struct page_cgroup *pc) | 239 | struct page_cgroup *pc) |
241 | { | 240 | { |
242 | int lru = LRU_BASE; | 241 | int lru = LRU_BASE; |
243 | 242 | ||
244 | if (PageCgroupUnevictable(pc)) | 243 | if (PageCgroupUnevictable(pc)) |
245 | lru = LRU_UNEVICTABLE; | 244 | lru = LRU_UNEVICTABLE; |
246 | else { | 245 | else { |
247 | if (PageCgroupActive(pc)) | 246 | if (PageCgroupActive(pc)) |
248 | lru += LRU_ACTIVE; | 247 | lru += LRU_ACTIVE; |
249 | if (PageCgroupFile(pc)) | 248 | if (PageCgroupFile(pc)) |
250 | lru += LRU_FILE; | 249 | lru += LRU_FILE; |
251 | } | 250 | } |
252 | 251 | ||
253 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 252 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
254 | 253 | ||
255 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); | 254 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); |
256 | list_del(&pc->lru); | 255 | list_del(&pc->lru); |
257 | } | 256 | } |
258 | 257 | ||
259 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, | 258 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, |
260 | struct page_cgroup *pc, bool hot) | 259 | struct page_cgroup *pc, bool hot) |
261 | { | 260 | { |
262 | int lru = LRU_BASE; | 261 | int lru = LRU_BASE; |
263 | 262 | ||
264 | if (PageCgroupUnevictable(pc)) | 263 | if (PageCgroupUnevictable(pc)) |
265 | lru = LRU_UNEVICTABLE; | 264 | lru = LRU_UNEVICTABLE; |
266 | else { | 265 | else { |
267 | if (PageCgroupActive(pc)) | 266 | if (PageCgroupActive(pc)) |
268 | lru += LRU_ACTIVE; | 267 | lru += LRU_ACTIVE; |
269 | if (PageCgroupFile(pc)) | 268 | if (PageCgroupFile(pc)) |
270 | lru += LRU_FILE; | 269 | lru += LRU_FILE; |
271 | } | 270 | } |
272 | 271 | ||
273 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 272 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
274 | if (hot) | 273 | if (hot) |
275 | list_add(&pc->lru, &mz->lists[lru]); | 274 | list_add(&pc->lru, &mz->lists[lru]); |
276 | else | 275 | else |
277 | list_add_tail(&pc->lru, &mz->lists[lru]); | 276 | list_add_tail(&pc->lru, &mz->lists[lru]); |
278 | 277 | ||
279 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); | 278 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); |
280 | } | 279 | } |
281 | 280 | ||
282 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) | 281 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) |
283 | { | 282 | { |
284 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | 283 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); |
285 | int active = PageCgroupActive(pc); | 284 | int active = PageCgroupActive(pc); |
286 | int file = PageCgroupFile(pc); | 285 | int file = PageCgroupFile(pc); |
287 | int unevictable = PageCgroupUnevictable(pc); | 286 | int unevictable = PageCgroupUnevictable(pc); |
288 | enum lru_list from = unevictable ? LRU_UNEVICTABLE : | 287 | enum lru_list from = unevictable ? LRU_UNEVICTABLE : |
289 | (LRU_FILE * !!file + !!active); | 288 | (LRU_FILE * !!file + !!active); |
290 | 289 | ||
291 | if (lru == from) | 290 | if (lru == from) |
292 | return; | 291 | return; |
293 | 292 | ||
294 | MEM_CGROUP_ZSTAT(mz, from) -= 1; | 293 | MEM_CGROUP_ZSTAT(mz, from) -= 1; |
295 | /* | 294 | /* |
296 | * However this is done under mz->lru_lock, another flags, which | 295 | * However this is done under mz->lru_lock, another flags, which |
297 | * are not related to LRU, will be modified from out-of-lock. | 296 | * are not related to LRU, will be modified from out-of-lock. |
298 | * We have to use atomic set/clear flags. | 297 | * We have to use atomic set/clear flags. |
299 | */ | 298 | */ |
300 | if (is_unevictable_lru(lru)) { | 299 | if (is_unevictable_lru(lru)) { |
301 | ClearPageCgroupActive(pc); | 300 | ClearPageCgroupActive(pc); |
302 | SetPageCgroupUnevictable(pc); | 301 | SetPageCgroupUnevictable(pc); |
303 | } else { | 302 | } else { |
304 | if (is_active_lru(lru)) | 303 | if (is_active_lru(lru)) |
305 | SetPageCgroupActive(pc); | 304 | SetPageCgroupActive(pc); |
306 | else | 305 | else |
307 | ClearPageCgroupActive(pc); | 306 | ClearPageCgroupActive(pc); |
308 | ClearPageCgroupUnevictable(pc); | 307 | ClearPageCgroupUnevictable(pc); |
309 | } | 308 | } |
310 | 309 | ||
311 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 310 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
312 | list_move(&pc->lru, &mz->lists[lru]); | 311 | list_move(&pc->lru, &mz->lists[lru]); |
313 | } | 312 | } |
314 | 313 | ||
315 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 314 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
316 | { | 315 | { |
317 | int ret; | 316 | int ret; |
318 | 317 | ||
319 | task_lock(task); | 318 | task_lock(task); |
320 | ret = task->mm && mm_match_cgroup(task->mm, mem); | 319 | ret = task->mm && mm_match_cgroup(task->mm, mem); |
321 | task_unlock(task); | 320 | task_unlock(task); |
322 | return ret; | 321 | return ret; |
323 | } | 322 | } |
324 | 323 | ||
325 | /* | 324 | /* |
326 | * This routine assumes that the appropriate zone's lru lock is already held | 325 | * This routine assumes that the appropriate zone's lru lock is already held |
327 | */ | 326 | */ |
328 | void mem_cgroup_move_lists(struct page *page, enum lru_list lru) | 327 | void mem_cgroup_move_lists(struct page *page, enum lru_list lru) |
329 | { | 328 | { |
330 | struct page_cgroup *pc; | 329 | struct page_cgroup *pc; |
331 | struct mem_cgroup_per_zone *mz; | 330 | struct mem_cgroup_per_zone *mz; |
332 | unsigned long flags; | 331 | unsigned long flags; |
333 | 332 | ||
334 | if (mem_cgroup_subsys.disabled) | 333 | if (mem_cgroup_subsys.disabled) |
335 | return; | 334 | return; |
336 | 335 | ||
337 | /* | 336 | /* |
338 | * We cannot lock_page_cgroup while holding zone's lru_lock, | 337 | * We cannot lock_page_cgroup while holding zone's lru_lock, |
339 | * because other holders of lock_page_cgroup can be interrupted | 338 | * because other holders of lock_page_cgroup can be interrupted |
340 | * with an attempt to rotate_reclaimable_page. But we cannot | 339 | * with an attempt to rotate_reclaimable_page. But we cannot |
341 | * safely get to page_cgroup without it, so just try_lock it: | 340 | * safely get to page_cgroup without it, so just try_lock it: |
342 | * mem_cgroup_isolate_pages allows for page left on wrong list. | 341 | * mem_cgroup_isolate_pages allows for page left on wrong list. |
343 | */ | 342 | */ |
344 | pc = lookup_page_cgroup(page); | 343 | pc = lookup_page_cgroup(page); |
345 | if (!trylock_page_cgroup(pc)) | 344 | if (!trylock_page_cgroup(pc)) |
346 | return; | 345 | return; |
347 | if (pc && PageCgroupUsed(pc)) { | 346 | if (pc && PageCgroupUsed(pc)) { |
348 | mz = page_cgroup_zoneinfo(pc); | 347 | mz = page_cgroup_zoneinfo(pc); |
349 | spin_lock_irqsave(&mz->lru_lock, flags); | 348 | spin_lock_irqsave(&mz->lru_lock, flags); |
350 | __mem_cgroup_move_lists(pc, lru); | 349 | __mem_cgroup_move_lists(pc, lru); |
351 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 350 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
352 | } | 351 | } |
353 | unlock_page_cgroup(pc); | 352 | unlock_page_cgroup(pc); |
354 | } | 353 | } |
355 | 354 | ||
356 | /* | 355 | /* |
357 | * Calculate mapped_ratio under memory controller. This will be used in | 356 | * Calculate mapped_ratio under memory controller. This will be used in |
358 | * vmscan.c for deteremining we have to reclaim mapped pages. | 357 | * vmscan.c for deteremining we have to reclaim mapped pages. |
359 | */ | 358 | */ |
360 | int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) | 359 | int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) |
361 | { | 360 | { |
362 | long total, rss; | 361 | long total, rss; |
363 | 362 | ||
364 | /* | 363 | /* |
365 | * usage is recorded in bytes. But, here, we assume the number of | 364 | * usage is recorded in bytes. But, here, we assume the number of |
366 | * physical pages can be represented by "long" on any arch. | 365 | * physical pages can be represented by "long" on any arch. |
367 | */ | 366 | */ |
368 | total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; | 367 | total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; |
369 | rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 368 | rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); |
370 | return (int)((rss * 100L) / total); | 369 | return (int)((rss * 100L) / total); |
371 | } | 370 | } |
372 | 371 | ||
373 | /* | 372 | /* |
374 | * prev_priority control...this will be used in memory reclaim path. | 373 | * prev_priority control...this will be used in memory reclaim path. |
375 | */ | 374 | */ |
376 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) | 375 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) |
377 | { | 376 | { |
378 | return mem->prev_priority; | 377 | return mem->prev_priority; |
379 | } | 378 | } |
380 | 379 | ||
381 | void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) | 380 | void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) |
382 | { | 381 | { |
383 | if (priority < mem->prev_priority) | 382 | if (priority < mem->prev_priority) |
384 | mem->prev_priority = priority; | 383 | mem->prev_priority = priority; |
385 | } | 384 | } |
386 | 385 | ||
387 | void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) | 386 | void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) |
388 | { | 387 | { |
389 | mem->prev_priority = priority; | 388 | mem->prev_priority = priority; |
390 | } | 389 | } |
391 | 390 | ||
392 | /* | 391 | /* |
393 | * Calculate # of pages to be scanned in this priority/zone. | 392 | * Calculate # of pages to be scanned in this priority/zone. |
394 | * See also vmscan.c | 393 | * See also vmscan.c |
395 | * | 394 | * |
396 | * priority starts from "DEF_PRIORITY" and decremented in each loop. | 395 | * priority starts from "DEF_PRIORITY" and decremented in each loop. |
397 | * (see include/linux/mmzone.h) | 396 | * (see include/linux/mmzone.h) |
398 | */ | 397 | */ |
399 | 398 | ||
400 | long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, | 399 | long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, |
401 | int priority, enum lru_list lru) | 400 | int priority, enum lru_list lru) |
402 | { | 401 | { |
403 | long nr_pages; | 402 | long nr_pages; |
404 | int nid = zone->zone_pgdat->node_id; | 403 | int nid = zone->zone_pgdat->node_id; |
405 | int zid = zone_idx(zone); | 404 | int zid = zone_idx(zone); |
406 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | 405 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); |
407 | 406 | ||
408 | nr_pages = MEM_CGROUP_ZSTAT(mz, lru); | 407 | nr_pages = MEM_CGROUP_ZSTAT(mz, lru); |
409 | 408 | ||
410 | return (nr_pages >> priority); | 409 | return (nr_pages >> priority); |
411 | } | 410 | } |
412 | 411 | ||
413 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | 412 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, |
414 | struct list_head *dst, | 413 | struct list_head *dst, |
415 | unsigned long *scanned, int order, | 414 | unsigned long *scanned, int order, |
416 | int mode, struct zone *z, | 415 | int mode, struct zone *z, |
417 | struct mem_cgroup *mem_cont, | 416 | struct mem_cgroup *mem_cont, |
418 | int active, int file) | 417 | int active, int file) |
419 | { | 418 | { |
420 | unsigned long nr_taken = 0; | 419 | unsigned long nr_taken = 0; |
421 | struct page *page; | 420 | struct page *page; |
422 | unsigned long scan; | 421 | unsigned long scan; |
423 | LIST_HEAD(pc_list); | 422 | LIST_HEAD(pc_list); |
424 | struct list_head *src; | 423 | struct list_head *src; |
425 | struct page_cgroup *pc, *tmp; | 424 | struct page_cgroup *pc, *tmp; |
426 | int nid = z->zone_pgdat->node_id; | 425 | int nid = z->zone_pgdat->node_id; |
427 | int zid = zone_idx(z); | 426 | int zid = zone_idx(z); |
428 | struct mem_cgroup_per_zone *mz; | 427 | struct mem_cgroup_per_zone *mz; |
429 | int lru = LRU_FILE * !!file + !!active; | 428 | int lru = LRU_FILE * !!file + !!active; |
430 | 429 | ||
431 | BUG_ON(!mem_cont); | 430 | BUG_ON(!mem_cont); |
432 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 431 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); |
433 | src = &mz->lists[lru]; | 432 | src = &mz->lists[lru]; |
434 | 433 | ||
435 | spin_lock(&mz->lru_lock); | 434 | spin_lock(&mz->lru_lock); |
436 | scan = 0; | 435 | scan = 0; |
437 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | 436 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { |
438 | if (scan >= nr_to_scan) | 437 | if (scan >= nr_to_scan) |
439 | break; | 438 | break; |
440 | if (unlikely(!PageCgroupUsed(pc))) | 439 | if (unlikely(!PageCgroupUsed(pc))) |
441 | continue; | 440 | continue; |
442 | page = pc->page; | 441 | page = pc->page; |
443 | 442 | ||
444 | if (unlikely(!PageLRU(page))) | 443 | if (unlikely(!PageLRU(page))) |
445 | continue; | 444 | continue; |
446 | 445 | ||
447 | /* | 446 | /* |
448 | * TODO: play better with lumpy reclaim, grabbing anything. | 447 | * TODO: play better with lumpy reclaim, grabbing anything. |
449 | */ | 448 | */ |
450 | if (PageUnevictable(page) || | 449 | if (PageUnevictable(page) || |
451 | (PageActive(page) && !active) || | 450 | (PageActive(page) && !active) || |
452 | (!PageActive(page) && active)) { | 451 | (!PageActive(page) && active)) { |
453 | __mem_cgroup_move_lists(pc, page_lru(page)); | 452 | __mem_cgroup_move_lists(pc, page_lru(page)); |
454 | continue; | 453 | continue; |
455 | } | 454 | } |
456 | 455 | ||
457 | scan++; | 456 | scan++; |
458 | list_move(&pc->lru, &pc_list); | 457 | list_move(&pc->lru, &pc_list); |
459 | 458 | ||
460 | if (__isolate_lru_page(page, mode, file) == 0) { | 459 | if (__isolate_lru_page(page, mode, file) == 0) { |
461 | list_move(&page->lru, dst); | 460 | list_move(&page->lru, dst); |
462 | nr_taken++; | 461 | nr_taken++; |
463 | } | 462 | } |
464 | } | 463 | } |
465 | 464 | ||
466 | list_splice(&pc_list, src); | 465 | list_splice(&pc_list, src); |
467 | spin_unlock(&mz->lru_lock); | 466 | spin_unlock(&mz->lru_lock); |
468 | 467 | ||
469 | *scanned = scan; | 468 | *scanned = scan; |
470 | return nr_taken; | 469 | return nr_taken; |
471 | } | 470 | } |
472 | 471 | ||
473 | /* | 472 | /* |
474 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 473 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
475 | * oom-killer can be invoked. | 474 | * oom-killer can be invoked. |
476 | */ | 475 | */ |
477 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 476 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
478 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) | 477 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) |
479 | { | 478 | { |
480 | struct mem_cgroup *mem; | 479 | struct mem_cgroup *mem; |
481 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 480 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
482 | /* | 481 | /* |
483 | * We always charge the cgroup the mm_struct belongs to. | 482 | * We always charge the cgroup the mm_struct belongs to. |
484 | * The mm_struct's mem_cgroup changes on task migration if the | 483 | * The mm_struct's mem_cgroup changes on task migration if the |
485 | * thread group leader migrates. It's possible that mm is not | 484 | * thread group leader migrates. It's possible that mm is not |
486 | * set, if so charge the init_mm (happens for pagecache usage). | 485 | * set, if so charge the init_mm (happens for pagecache usage). |
487 | */ | 486 | */ |
488 | if (likely(!*memcg)) { | 487 | if (likely(!*memcg)) { |
489 | rcu_read_lock(); | 488 | rcu_read_lock(); |
490 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 489 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
491 | if (unlikely(!mem)) { | 490 | if (unlikely(!mem)) { |
492 | rcu_read_unlock(); | 491 | rcu_read_unlock(); |
493 | return 0; | 492 | return 0; |
494 | } | 493 | } |
495 | /* | 494 | /* |
496 | * For every charge from the cgroup, increment reference count | 495 | * For every charge from the cgroup, increment reference count |
497 | */ | 496 | */ |
498 | css_get(&mem->css); | 497 | css_get(&mem->css); |
499 | *memcg = mem; | 498 | *memcg = mem; |
500 | rcu_read_unlock(); | 499 | rcu_read_unlock(); |
501 | } else { | 500 | } else { |
502 | mem = *memcg; | 501 | mem = *memcg; |
503 | css_get(&mem->css); | 502 | css_get(&mem->css); |
504 | } | 503 | } |
505 | 504 | ||
506 | 505 | ||
507 | while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { | 506 | while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { |
508 | if (!(gfp_mask & __GFP_WAIT)) | 507 | if (!(gfp_mask & __GFP_WAIT)) |
509 | goto nomem; | 508 | goto nomem; |
510 | 509 | ||
511 | if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) | 510 | if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) |
512 | continue; | 511 | continue; |
513 | 512 | ||
514 | /* | 513 | /* |
515 | * try_to_free_mem_cgroup_pages() might not give us a full | 514 | * try_to_free_mem_cgroup_pages() might not give us a full |
516 | * picture of reclaim. Some pages are reclaimed and might be | 515 | * picture of reclaim. Some pages are reclaimed and might be |
517 | * moved to swap cache or just unmapped from the cgroup. | 516 | * moved to swap cache or just unmapped from the cgroup. |
518 | * Check the limit again to see if the reclaim reduced the | 517 | * Check the limit again to see if the reclaim reduced the |
519 | * current usage of the cgroup before giving up | 518 | * current usage of the cgroup before giving up |
520 | */ | 519 | */ |
521 | if (res_counter_check_under_limit(&mem->res)) | 520 | if (res_counter_check_under_limit(&mem->res)) |
522 | continue; | 521 | continue; |
523 | 522 | ||
524 | if (!nr_retries--) { | 523 | if (!nr_retries--) { |
525 | if (oom) | 524 | if (oom) |
526 | mem_cgroup_out_of_memory(mem, gfp_mask); | 525 | mem_cgroup_out_of_memory(mem, gfp_mask); |
527 | goto nomem; | 526 | goto nomem; |
528 | } | 527 | } |
529 | } | 528 | } |
530 | return 0; | 529 | return 0; |
531 | nomem: | 530 | nomem: |
532 | css_put(&mem->css); | 531 | css_put(&mem->css); |
533 | return -ENOMEM; | 532 | return -ENOMEM; |
534 | } | 533 | } |
535 | 534 | ||
536 | /** | 535 | /** |
537 | * mem_cgroup_try_charge - get charge of PAGE_SIZE. | 536 | * mem_cgroup_try_charge - get charge of PAGE_SIZE. |
538 | * @mm: an mm_struct which is charged against. (when *memcg is NULL) | 537 | * @mm: an mm_struct which is charged against. (when *memcg is NULL) |
539 | * @gfp_mask: gfp_mask for reclaim. | 538 | * @gfp_mask: gfp_mask for reclaim. |
540 | * @memcg: a pointer to memory cgroup which is charged against. | 539 | * @memcg: a pointer to memory cgroup which is charged against. |
541 | * | 540 | * |
542 | * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated | 541 | * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated |
543 | * memory cgroup from @mm is got and stored in *memcg. | 542 | * memory cgroup from @mm is got and stored in *memcg. |
544 | * | 543 | * |
545 | * Returns 0 if success. -ENOMEM at failure. | 544 | * Returns 0 if success. -ENOMEM at failure. |
546 | * This call can invoke OOM-Killer. | 545 | * This call can invoke OOM-Killer. |
547 | */ | 546 | */ |
548 | 547 | ||
549 | int mem_cgroup_try_charge(struct mm_struct *mm, | 548 | int mem_cgroup_try_charge(struct mm_struct *mm, |
550 | gfp_t mask, struct mem_cgroup **memcg) | 549 | gfp_t mask, struct mem_cgroup **memcg) |
551 | { | 550 | { |
552 | return __mem_cgroup_try_charge(mm, mask, memcg, true); | 551 | return __mem_cgroup_try_charge(mm, mask, memcg, true); |
553 | } | 552 | } |
554 | 553 | ||
555 | /* | 554 | /* |
556 | * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be | 555 | * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be |
557 | * USED state. If already USED, uncharge and return. | 556 | * USED state. If already USED, uncharge and return. |
558 | */ | 557 | */ |
559 | 558 | ||
560 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | 559 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, |
561 | struct page_cgroup *pc, | 560 | struct page_cgroup *pc, |
562 | enum charge_type ctype) | 561 | enum charge_type ctype) |
563 | { | 562 | { |
564 | struct mem_cgroup_per_zone *mz; | 563 | struct mem_cgroup_per_zone *mz; |
565 | unsigned long flags; | 564 | unsigned long flags; |
566 | 565 | ||
567 | /* try_charge() can return NULL to *memcg, taking care of it. */ | 566 | /* try_charge() can return NULL to *memcg, taking care of it. */ |
568 | if (!mem) | 567 | if (!mem) |
569 | return; | 568 | return; |
570 | 569 | ||
571 | lock_page_cgroup(pc); | 570 | lock_page_cgroup(pc); |
572 | if (unlikely(PageCgroupUsed(pc))) { | 571 | if (unlikely(PageCgroupUsed(pc))) { |
573 | unlock_page_cgroup(pc); | 572 | unlock_page_cgroup(pc); |
574 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 573 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
575 | css_put(&mem->css); | 574 | css_put(&mem->css); |
576 | return; | 575 | return; |
577 | } | 576 | } |
578 | pc->mem_cgroup = mem; | 577 | pc->mem_cgroup = mem; |
579 | /* | 578 | /* |
580 | * If a page is accounted as a page cache, insert to inactive list. | 579 | * If a page is accounted as a page cache, insert to inactive list. |
581 | * If anon, insert to active list. | 580 | * If anon, insert to active list. |
582 | */ | 581 | */ |
583 | pc->flags = pcg_default_flags[ctype]; | 582 | pc->flags = pcg_default_flags[ctype]; |
584 | 583 | ||
585 | mz = page_cgroup_zoneinfo(pc); | 584 | mz = page_cgroup_zoneinfo(pc); |
586 | 585 | ||
587 | spin_lock_irqsave(&mz->lru_lock, flags); | 586 | spin_lock_irqsave(&mz->lru_lock, flags); |
588 | __mem_cgroup_add_list(mz, pc, true); | 587 | __mem_cgroup_add_list(mz, pc, true); |
589 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 588 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
590 | unlock_page_cgroup(pc); | 589 | unlock_page_cgroup(pc); |
591 | } | 590 | } |
592 | 591 | ||
593 | /** | 592 | /** |
594 | * mem_cgroup_move_account - move account of the page | 593 | * mem_cgroup_move_account - move account of the page |
595 | * @pc: page_cgroup of the page. | 594 | * @pc: page_cgroup of the page. |
596 | * @from: mem_cgroup which the page is moved from. | 595 | * @from: mem_cgroup which the page is moved from. |
597 | * @to: mem_cgroup which the page is moved to. @from != @to. | 596 | * @to: mem_cgroup which the page is moved to. @from != @to. |
598 | * | 597 | * |
599 | * The caller must confirm following. | 598 | * The caller must confirm following. |
600 | * 1. disable irq. | 599 | * 1. disable irq. |
601 | * 2. lru_lock of old mem_cgroup(@from) should be held. | 600 | * 2. lru_lock of old mem_cgroup(@from) should be held. |
602 | * | 601 | * |
603 | * returns 0 at success, | 602 | * returns 0 at success, |
604 | * returns -EBUSY when lock is busy or "pc" is unstable. | 603 | * returns -EBUSY when lock is busy or "pc" is unstable. |
605 | * | 604 | * |
606 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | 605 | * This function does "uncharge" from old cgroup but doesn't do "charge" to |
607 | * new cgroup. It should be done by a caller. | 606 | * new cgroup. It should be done by a caller. |
608 | */ | 607 | */ |
609 | 608 | ||
610 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 609 | static int mem_cgroup_move_account(struct page_cgroup *pc, |
611 | struct mem_cgroup *from, struct mem_cgroup *to) | 610 | struct mem_cgroup *from, struct mem_cgroup *to) |
612 | { | 611 | { |
613 | struct mem_cgroup_per_zone *from_mz, *to_mz; | 612 | struct mem_cgroup_per_zone *from_mz, *to_mz; |
614 | int nid, zid; | 613 | int nid, zid; |
615 | int ret = -EBUSY; | 614 | int ret = -EBUSY; |
616 | 615 | ||
617 | VM_BUG_ON(!irqs_disabled()); | 616 | VM_BUG_ON(!irqs_disabled()); |
618 | VM_BUG_ON(from == to); | 617 | VM_BUG_ON(from == to); |
619 | 618 | ||
620 | nid = page_cgroup_nid(pc); | 619 | nid = page_cgroup_nid(pc); |
621 | zid = page_cgroup_zid(pc); | 620 | zid = page_cgroup_zid(pc); |
622 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); | 621 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); |
623 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); | 622 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); |
624 | 623 | ||
625 | 624 | ||
626 | if (!trylock_page_cgroup(pc)) | 625 | if (!trylock_page_cgroup(pc)) |
627 | return ret; | 626 | return ret; |
628 | 627 | ||
629 | if (!PageCgroupUsed(pc)) | 628 | if (!PageCgroupUsed(pc)) |
630 | goto out; | 629 | goto out; |
631 | 630 | ||
632 | if (pc->mem_cgroup != from) | 631 | if (pc->mem_cgroup != from) |
633 | goto out; | 632 | goto out; |
634 | 633 | ||
635 | if (spin_trylock(&to_mz->lru_lock)) { | 634 | if (spin_trylock(&to_mz->lru_lock)) { |
636 | __mem_cgroup_remove_list(from_mz, pc); | 635 | __mem_cgroup_remove_list(from_mz, pc); |
637 | css_put(&from->css); | 636 | css_put(&from->css); |
638 | res_counter_uncharge(&from->res, PAGE_SIZE); | 637 | res_counter_uncharge(&from->res, PAGE_SIZE); |
639 | pc->mem_cgroup = to; | 638 | pc->mem_cgroup = to; |
640 | css_get(&to->css); | 639 | css_get(&to->css); |
641 | __mem_cgroup_add_list(to_mz, pc, false); | 640 | __mem_cgroup_add_list(to_mz, pc, false); |
642 | ret = 0; | 641 | ret = 0; |
643 | spin_unlock(&to_mz->lru_lock); | 642 | spin_unlock(&to_mz->lru_lock); |
644 | } | 643 | } |
645 | out: | 644 | out: |
646 | unlock_page_cgroup(pc); | 645 | unlock_page_cgroup(pc); |
647 | return ret; | 646 | return ret; |
648 | } | 647 | } |
649 | 648 | ||
650 | /* | 649 | /* |
651 | * move charges to its parent. | 650 | * move charges to its parent. |
652 | */ | 651 | */ |
653 | 652 | ||
654 | static int mem_cgroup_move_parent(struct page_cgroup *pc, | 653 | static int mem_cgroup_move_parent(struct page_cgroup *pc, |
655 | struct mem_cgroup *child, | 654 | struct mem_cgroup *child, |
656 | gfp_t gfp_mask) | 655 | gfp_t gfp_mask) |
657 | { | 656 | { |
658 | struct cgroup *cg = child->css.cgroup; | 657 | struct cgroup *cg = child->css.cgroup; |
659 | struct cgroup *pcg = cg->parent; | 658 | struct cgroup *pcg = cg->parent; |
660 | struct mem_cgroup *parent; | 659 | struct mem_cgroup *parent; |
661 | struct mem_cgroup_per_zone *mz; | 660 | struct mem_cgroup_per_zone *mz; |
662 | unsigned long flags; | 661 | unsigned long flags; |
663 | int ret; | 662 | int ret; |
664 | 663 | ||
665 | /* Is ROOT ? */ | 664 | /* Is ROOT ? */ |
666 | if (!pcg) | 665 | if (!pcg) |
667 | return -EINVAL; | 666 | return -EINVAL; |
668 | 667 | ||
669 | parent = mem_cgroup_from_cont(pcg); | 668 | parent = mem_cgroup_from_cont(pcg); |
670 | 669 | ||
671 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | 670 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); |
672 | if (ret) | 671 | if (ret) |
673 | return ret; | 672 | return ret; |
674 | 673 | ||
675 | mz = mem_cgroup_zoneinfo(child, | 674 | mz = mem_cgroup_zoneinfo(child, |
676 | page_cgroup_nid(pc), page_cgroup_zid(pc)); | 675 | page_cgroup_nid(pc), page_cgroup_zid(pc)); |
677 | 676 | ||
678 | spin_lock_irqsave(&mz->lru_lock, flags); | 677 | spin_lock_irqsave(&mz->lru_lock, flags); |
679 | ret = mem_cgroup_move_account(pc, child, parent); | 678 | ret = mem_cgroup_move_account(pc, child, parent); |
680 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 679 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
681 | 680 | ||
682 | /* drop extra refcnt */ | 681 | /* drop extra refcnt */ |
683 | css_put(&parent->css); | 682 | css_put(&parent->css); |
684 | /* uncharge if move fails */ | 683 | /* uncharge if move fails */ |
685 | if (ret) | 684 | if (ret) |
686 | res_counter_uncharge(&parent->res, PAGE_SIZE); | 685 | res_counter_uncharge(&parent->res, PAGE_SIZE); |
687 | 686 | ||
688 | return ret; | 687 | return ret; |
689 | } | 688 | } |
690 | 689 | ||
691 | /* | 690 | /* |
692 | * Charge the memory controller for page usage. | 691 | * Charge the memory controller for page usage. |
693 | * Return | 692 | * Return |
694 | * 0 if the charge was successful | 693 | * 0 if the charge was successful |
695 | * < 0 if the cgroup is over its limit | 694 | * < 0 if the cgroup is over its limit |
696 | */ | 695 | */ |
697 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | 696 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, |
698 | gfp_t gfp_mask, enum charge_type ctype, | 697 | gfp_t gfp_mask, enum charge_type ctype, |
699 | struct mem_cgroup *memcg) | 698 | struct mem_cgroup *memcg) |
700 | { | 699 | { |
701 | struct mem_cgroup *mem; | 700 | struct mem_cgroup *mem; |
702 | struct page_cgroup *pc; | 701 | struct page_cgroup *pc; |
703 | int ret; | 702 | int ret; |
704 | 703 | ||
705 | pc = lookup_page_cgroup(page); | 704 | pc = lookup_page_cgroup(page); |
706 | /* can happen at boot */ | 705 | /* can happen at boot */ |
707 | if (unlikely(!pc)) | 706 | if (unlikely(!pc)) |
708 | return 0; | 707 | return 0; |
709 | prefetchw(pc); | 708 | prefetchw(pc); |
710 | 709 | ||
711 | mem = memcg; | 710 | mem = memcg; |
712 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | 711 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); |
713 | if (ret) | 712 | if (ret) |
714 | return ret; | 713 | return ret; |
715 | 714 | ||
716 | __mem_cgroup_commit_charge(mem, pc, ctype); | 715 | __mem_cgroup_commit_charge(mem, pc, ctype); |
717 | return 0; | 716 | return 0; |
718 | } | 717 | } |
719 | 718 | ||
720 | int mem_cgroup_newpage_charge(struct page *page, | 719 | int mem_cgroup_newpage_charge(struct page *page, |
721 | struct mm_struct *mm, gfp_t gfp_mask) | 720 | struct mm_struct *mm, gfp_t gfp_mask) |
722 | { | 721 | { |
723 | if (mem_cgroup_subsys.disabled) | 722 | if (mem_cgroup_subsys.disabled) |
724 | return 0; | 723 | return 0; |
725 | if (PageCompound(page)) | 724 | if (PageCompound(page)) |
726 | return 0; | 725 | return 0; |
727 | /* | 726 | /* |
728 | * If already mapped, we don't have to account. | 727 | * If already mapped, we don't have to account. |
729 | * If page cache, page->mapping has address_space. | 728 | * If page cache, page->mapping has address_space. |
730 | * But page->mapping may have out-of-use anon_vma pointer, | 729 | * But page->mapping may have out-of-use anon_vma pointer, |
731 | * detecit it by PageAnon() check. newly-mapped-anon's page->mapping | 730 | * detecit it by PageAnon() check. newly-mapped-anon's page->mapping |
732 | * is NULL. | 731 | * is NULL. |
733 | */ | 732 | */ |
734 | if (page_mapped(page) || (page->mapping && !PageAnon(page))) | 733 | if (page_mapped(page) || (page->mapping && !PageAnon(page))) |
735 | return 0; | 734 | return 0; |
736 | if (unlikely(!mm)) | 735 | if (unlikely(!mm)) |
737 | mm = &init_mm; | 736 | mm = &init_mm; |
738 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 737 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
739 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); | 738 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); |
740 | } | 739 | } |
741 | 740 | ||
742 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 741 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
743 | gfp_t gfp_mask) | 742 | gfp_t gfp_mask) |
744 | { | 743 | { |
745 | if (mem_cgroup_subsys.disabled) | 744 | if (mem_cgroup_subsys.disabled) |
746 | return 0; | 745 | return 0; |
747 | if (PageCompound(page)) | 746 | if (PageCompound(page)) |
748 | return 0; | 747 | return 0; |
749 | /* | 748 | /* |
750 | * Corner case handling. This is called from add_to_page_cache() | 749 | * Corner case handling. This is called from add_to_page_cache() |
751 | * in usual. But some FS (shmem) precharges this page before calling it | 750 | * in usual. But some FS (shmem) precharges this page before calling it |
752 | * and call add_to_page_cache() with GFP_NOWAIT. | 751 | * and call add_to_page_cache() with GFP_NOWAIT. |
753 | * | 752 | * |
754 | * For GFP_NOWAIT case, the page may be pre-charged before calling | 753 | * For GFP_NOWAIT case, the page may be pre-charged before calling |
755 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call | 754 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call |
756 | * charge twice. (It works but has to pay a bit larger cost.) | 755 | * charge twice. (It works but has to pay a bit larger cost.) |
757 | */ | 756 | */ |
758 | if (!(gfp_mask & __GFP_WAIT)) { | 757 | if (!(gfp_mask & __GFP_WAIT)) { |
759 | struct page_cgroup *pc; | 758 | struct page_cgroup *pc; |
760 | 759 | ||
761 | 760 | ||
762 | pc = lookup_page_cgroup(page); | 761 | pc = lookup_page_cgroup(page); |
763 | if (!pc) | 762 | if (!pc) |
764 | return 0; | 763 | return 0; |
765 | lock_page_cgroup(pc); | 764 | lock_page_cgroup(pc); |
766 | if (PageCgroupUsed(pc)) { | 765 | if (PageCgroupUsed(pc)) { |
767 | unlock_page_cgroup(pc); | 766 | unlock_page_cgroup(pc); |
768 | return 0; | 767 | return 0; |
769 | } | 768 | } |
770 | unlock_page_cgroup(pc); | 769 | unlock_page_cgroup(pc); |
771 | } | 770 | } |
772 | 771 | ||
773 | if (unlikely(!mm)) | 772 | if (unlikely(!mm)) |
774 | mm = &init_mm; | 773 | mm = &init_mm; |
775 | 774 | ||
776 | if (page_is_file_cache(page)) | 775 | if (page_is_file_cache(page)) |
777 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 776 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
778 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); | 777 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); |
779 | else | 778 | else |
780 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 779 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
781 | MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); | 780 | MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); |
782 | } | 781 | } |
783 | 782 | ||
784 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | 783 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) |
785 | { | 784 | { |
786 | struct page_cgroup *pc; | 785 | struct page_cgroup *pc; |
787 | 786 | ||
788 | if (mem_cgroup_subsys.disabled) | 787 | if (mem_cgroup_subsys.disabled) |
789 | return; | 788 | return; |
790 | if (!ptr) | 789 | if (!ptr) |
791 | return; | 790 | return; |
792 | pc = lookup_page_cgroup(page); | 791 | pc = lookup_page_cgroup(page); |
793 | __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 792 | __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); |
794 | } | 793 | } |
795 | 794 | ||
796 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | 795 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) |
797 | { | 796 | { |
798 | if (mem_cgroup_subsys.disabled) | 797 | if (mem_cgroup_subsys.disabled) |
799 | return; | 798 | return; |
800 | if (!mem) | 799 | if (!mem) |
801 | return; | 800 | return; |
802 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 801 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
803 | css_put(&mem->css); | 802 | css_put(&mem->css); |
804 | } | 803 | } |
805 | 804 | ||
806 | 805 | ||
807 | /* | 806 | /* |
808 | * uncharge if !page_mapped(page) | 807 | * uncharge if !page_mapped(page) |
809 | */ | 808 | */ |
810 | static void | 809 | static void |
811 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 810 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
812 | { | 811 | { |
813 | struct page_cgroup *pc; | 812 | struct page_cgroup *pc; |
814 | struct mem_cgroup *mem; | 813 | struct mem_cgroup *mem; |
815 | struct mem_cgroup_per_zone *mz; | 814 | struct mem_cgroup_per_zone *mz; |
816 | unsigned long flags; | 815 | unsigned long flags; |
817 | 816 | ||
818 | if (mem_cgroup_subsys.disabled) | 817 | if (mem_cgroup_subsys.disabled) |
819 | return; | 818 | return; |
820 | 819 | ||
821 | /* | 820 | /* |
822 | * Check if our page_cgroup is valid | 821 | * Check if our page_cgroup is valid |
823 | */ | 822 | */ |
824 | pc = lookup_page_cgroup(page); | 823 | pc = lookup_page_cgroup(page); |
825 | if (unlikely(!pc || !PageCgroupUsed(pc))) | 824 | if (unlikely(!pc || !PageCgroupUsed(pc))) |
826 | return; | 825 | return; |
827 | 826 | ||
828 | lock_page_cgroup(pc); | 827 | lock_page_cgroup(pc); |
829 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) | 828 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) |
830 | || !PageCgroupUsed(pc)) { | 829 | || !PageCgroupUsed(pc)) { |
831 | /* This happens at race in zap_pte_range() and do_swap_page()*/ | 830 | /* This happens at race in zap_pte_range() and do_swap_page()*/ |
832 | unlock_page_cgroup(pc); | 831 | unlock_page_cgroup(pc); |
833 | return; | 832 | return; |
834 | } | 833 | } |
835 | ClearPageCgroupUsed(pc); | 834 | ClearPageCgroupUsed(pc); |
836 | mem = pc->mem_cgroup; | 835 | mem = pc->mem_cgroup; |
837 | 836 | ||
838 | mz = page_cgroup_zoneinfo(pc); | 837 | mz = page_cgroup_zoneinfo(pc); |
839 | spin_lock_irqsave(&mz->lru_lock, flags); | 838 | spin_lock_irqsave(&mz->lru_lock, flags); |
840 | __mem_cgroup_remove_list(mz, pc); | 839 | __mem_cgroup_remove_list(mz, pc); |
841 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 840 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
842 | unlock_page_cgroup(pc); | 841 | unlock_page_cgroup(pc); |
843 | 842 | ||
844 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 843 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
845 | css_put(&mem->css); | 844 | css_put(&mem->css); |
846 | 845 | ||
847 | return; | 846 | return; |
848 | } | 847 | } |
849 | 848 | ||
850 | void mem_cgroup_uncharge_page(struct page *page) | 849 | void mem_cgroup_uncharge_page(struct page *page) |
851 | { | 850 | { |
852 | /* early check. */ | 851 | /* early check. */ |
853 | if (page_mapped(page)) | 852 | if (page_mapped(page)) |
854 | return; | 853 | return; |
855 | if (page->mapping && !PageAnon(page)) | 854 | if (page->mapping && !PageAnon(page)) |
856 | return; | 855 | return; |
857 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 856 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); |
858 | } | 857 | } |
859 | 858 | ||
860 | void mem_cgroup_uncharge_cache_page(struct page *page) | 859 | void mem_cgroup_uncharge_cache_page(struct page *page) |
861 | { | 860 | { |
862 | VM_BUG_ON(page_mapped(page)); | 861 | VM_BUG_ON(page_mapped(page)); |
863 | VM_BUG_ON(page->mapping); | 862 | VM_BUG_ON(page->mapping); |
864 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 863 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
865 | } | 864 | } |
866 | 865 | ||
867 | /* | 866 | /* |
868 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old | 867 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old |
869 | * page belongs to. | 868 | * page belongs to. |
870 | */ | 869 | */ |
871 | int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | 870 | int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) |
872 | { | 871 | { |
873 | struct page_cgroup *pc; | 872 | struct page_cgroup *pc; |
874 | struct mem_cgroup *mem = NULL; | 873 | struct mem_cgroup *mem = NULL; |
875 | int ret = 0; | 874 | int ret = 0; |
876 | 875 | ||
877 | if (mem_cgroup_subsys.disabled) | 876 | if (mem_cgroup_subsys.disabled) |
878 | return 0; | 877 | return 0; |
879 | 878 | ||
880 | pc = lookup_page_cgroup(page); | 879 | pc = lookup_page_cgroup(page); |
881 | lock_page_cgroup(pc); | 880 | lock_page_cgroup(pc); |
882 | if (PageCgroupUsed(pc)) { | 881 | if (PageCgroupUsed(pc)) { |
883 | mem = pc->mem_cgroup; | 882 | mem = pc->mem_cgroup; |
884 | css_get(&mem->css); | 883 | css_get(&mem->css); |
885 | } | 884 | } |
886 | unlock_page_cgroup(pc); | 885 | unlock_page_cgroup(pc); |
887 | 886 | ||
888 | if (mem) { | 887 | if (mem) { |
889 | ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem); | 888 | ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem); |
890 | css_put(&mem->css); | 889 | css_put(&mem->css); |
891 | } | 890 | } |
892 | *ptr = mem; | 891 | *ptr = mem; |
893 | return ret; | 892 | return ret; |
894 | } | 893 | } |
895 | 894 | ||
896 | /* remove redundant charge if migration failed*/ | 895 | /* remove redundant charge if migration failed*/ |
897 | void mem_cgroup_end_migration(struct mem_cgroup *mem, | 896 | void mem_cgroup_end_migration(struct mem_cgroup *mem, |
898 | struct page *oldpage, struct page *newpage) | 897 | struct page *oldpage, struct page *newpage) |
899 | { | 898 | { |
900 | struct page *target, *unused; | 899 | struct page *target, *unused; |
901 | struct page_cgroup *pc; | 900 | struct page_cgroup *pc; |
902 | enum charge_type ctype; | 901 | enum charge_type ctype; |
903 | 902 | ||
904 | if (!mem) | 903 | if (!mem) |
905 | return; | 904 | return; |
906 | 905 | ||
907 | /* at migration success, oldpage->mapping is NULL. */ | 906 | /* at migration success, oldpage->mapping is NULL. */ |
908 | if (oldpage->mapping) { | 907 | if (oldpage->mapping) { |
909 | target = oldpage; | 908 | target = oldpage; |
910 | unused = NULL; | 909 | unused = NULL; |
911 | } else { | 910 | } else { |
912 | target = newpage; | 911 | target = newpage; |
913 | unused = oldpage; | 912 | unused = oldpage; |
914 | } | 913 | } |
915 | 914 | ||
916 | if (PageAnon(target)) | 915 | if (PageAnon(target)) |
917 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | 916 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; |
918 | else if (page_is_file_cache(target)) | 917 | else if (page_is_file_cache(target)) |
919 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 918 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
920 | else | 919 | else |
921 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 920 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
922 | 921 | ||
923 | /* unused page is not on radix-tree now. */ | 922 | /* unused page is not on radix-tree now. */ |
924 | if (unused && ctype != MEM_CGROUP_CHARGE_TYPE_MAPPED) | 923 | if (unused && ctype != MEM_CGROUP_CHARGE_TYPE_MAPPED) |
925 | __mem_cgroup_uncharge_common(unused, ctype); | 924 | __mem_cgroup_uncharge_common(unused, ctype); |
926 | 925 | ||
927 | pc = lookup_page_cgroup(target); | 926 | pc = lookup_page_cgroup(target); |
928 | /* | 927 | /* |
929 | * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. | 928 | * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. |
930 | * So, double-counting is effectively avoided. | 929 | * So, double-counting is effectively avoided. |
931 | */ | 930 | */ |
932 | __mem_cgroup_commit_charge(mem, pc, ctype); | 931 | __mem_cgroup_commit_charge(mem, pc, ctype); |
933 | 932 | ||
934 | /* | 933 | /* |
935 | * Both of oldpage and newpage are still under lock_page(). | 934 | * Both of oldpage and newpage are still under lock_page(). |
936 | * Then, we don't have to care about race in radix-tree. | 935 | * Then, we don't have to care about race in radix-tree. |
937 | * But we have to be careful that this page is unmapped or not. | 936 | * But we have to be careful that this page is unmapped or not. |
938 | * | 937 | * |
939 | * There is a case for !page_mapped(). At the start of | 938 | * There is a case for !page_mapped(). At the start of |
940 | * migration, oldpage was mapped. But now, it's zapped. | 939 | * migration, oldpage was mapped. But now, it's zapped. |
941 | * But we know *target* page is not freed/reused under us. | 940 | * But we know *target* page is not freed/reused under us. |
942 | * mem_cgroup_uncharge_page() does all necessary checks. | 941 | * mem_cgroup_uncharge_page() does all necessary checks. |
943 | */ | 942 | */ |
944 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 943 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) |
945 | mem_cgroup_uncharge_page(target); | 944 | mem_cgroup_uncharge_page(target); |
946 | } | 945 | } |
947 | 946 | ||
948 | /* | 947 | /* |
949 | * A call to try to shrink memory usage under specified resource controller. | 948 | * A call to try to shrink memory usage under specified resource controller. |
950 | * This is typically used for page reclaiming for shmem for reducing side | 949 | * This is typically used for page reclaiming for shmem for reducing side |
951 | * effect of page allocation from shmem, which is used by some mem_cgroup. | 950 | * effect of page allocation from shmem, which is used by some mem_cgroup. |
952 | */ | 951 | */ |
953 | int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) | 952 | int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) |
954 | { | 953 | { |
955 | struct mem_cgroup *mem; | 954 | struct mem_cgroup *mem; |
956 | int progress = 0; | 955 | int progress = 0; |
957 | int retry = MEM_CGROUP_RECLAIM_RETRIES; | 956 | int retry = MEM_CGROUP_RECLAIM_RETRIES; |
958 | 957 | ||
959 | if (mem_cgroup_subsys.disabled) | 958 | if (mem_cgroup_subsys.disabled) |
960 | return 0; | 959 | return 0; |
961 | if (!mm) | 960 | if (!mm) |
962 | return 0; | 961 | return 0; |
963 | 962 | ||
964 | rcu_read_lock(); | 963 | rcu_read_lock(); |
965 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 964 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
966 | if (unlikely(!mem)) { | 965 | if (unlikely(!mem)) { |
967 | rcu_read_unlock(); | 966 | rcu_read_unlock(); |
968 | return 0; | 967 | return 0; |
969 | } | 968 | } |
970 | css_get(&mem->css); | 969 | css_get(&mem->css); |
971 | rcu_read_unlock(); | 970 | rcu_read_unlock(); |
972 | 971 | ||
973 | do { | 972 | do { |
974 | progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); | 973 | progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); |
975 | progress += res_counter_check_under_limit(&mem->res); | 974 | progress += res_counter_check_under_limit(&mem->res); |
976 | } while (!progress && --retry); | 975 | } while (!progress && --retry); |
977 | 976 | ||
978 | css_put(&mem->css); | 977 | css_put(&mem->css); |
979 | if (!retry) | 978 | if (!retry) |
980 | return -ENOMEM; | 979 | return -ENOMEM; |
981 | return 0; | 980 | return 0; |
982 | } | 981 | } |
983 | 982 | ||
984 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | 983 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
985 | unsigned long long val) | 984 | unsigned long long val) |
986 | { | 985 | { |
987 | 986 | ||
988 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; | 987 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; |
989 | int progress; | 988 | int progress; |
990 | int ret = 0; | 989 | int ret = 0; |
991 | 990 | ||
992 | while (res_counter_set_limit(&memcg->res, val)) { | 991 | while (res_counter_set_limit(&memcg->res, val)) { |
993 | if (signal_pending(current)) { | 992 | if (signal_pending(current)) { |
994 | ret = -EINTR; | 993 | ret = -EINTR; |
995 | break; | 994 | break; |
996 | } | 995 | } |
997 | if (!retry_count) { | 996 | if (!retry_count) { |
998 | ret = -EBUSY; | 997 | ret = -EBUSY; |
999 | break; | 998 | break; |
1000 | } | 999 | } |
1001 | progress = try_to_free_mem_cgroup_pages(memcg, | 1000 | progress = try_to_free_mem_cgroup_pages(memcg, |
1002 | GFP_HIGHUSER_MOVABLE); | 1001 | GFP_HIGHUSER_MOVABLE); |
1003 | if (!progress) | 1002 | if (!progress) |
1004 | retry_count--; | 1003 | retry_count--; |
1005 | } | 1004 | } |
1006 | return ret; | 1005 | return ret; |
1007 | } | 1006 | } |
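/*
 * Editorial sketch (not part of the commit): mem_cgroup_resize_limit()
 * above keeps retrying res_counter_set_limit(), reclaiming pages between
 * attempts, and only spends one of its retries when a reclaim pass makes
 * no progress (the signal_pending()/-EINTR early exit is left out here).
 * A self-contained userspace model of that bounded-retry shape; every
 * name and number below is illustrative only.
 */
#include <stdio.h>

static unsigned long usage = 8;		/* pages currently charged (pretend) */

static int try_set_limit(unsigned long val)
{
	return usage <= val;		/* kernel: fails while usage > new limit */
}

static int reclaim_some(void)
{
	if (!usage)
		return 0;
	usage--;			/* stands in for try_to_free_mem_cgroup_pages() */
	return 1;
}

int main(void)
{
	int retry = 5;			/* stands in for MEM_CGROUP_RECLAIM_RETRIES */
	unsigned long new_limit = 4;

	while (!try_set_limit(new_limit)) {
		if (!retry) {
			puts("-EBUSY");	/* gave up, like the loop above */
			return 1;
		}
		if (!reclaim_some())
			retry--;	/* no progress: spend a retry */
	}
	printf("limit lowered, usage is now %lu pages\n", usage);
	return 0;
}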
1008 | 1007 | ||
1009 | 1008 | ||
1010 | /* | 1009 | /* |
1011 | * This routine traverse page_cgroup in given list and drop them all. | 1010 | * This routine traverse page_cgroup in given list and drop them all. |
1012 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 1011 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
1013 | */ | 1012 | */ |
1014 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | 1013 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, |
1015 | struct mem_cgroup_per_zone *mz, | 1014 | struct mem_cgroup_per_zone *mz, |
1016 | enum lru_list lru) | 1015 | enum lru_list lru) |
1017 | { | 1016 | { |
1018 | struct page_cgroup *pc, *busy; | 1017 | struct page_cgroup *pc, *busy; |
1019 | unsigned long flags; | 1018 | unsigned long flags; |
1020 | unsigned long loop; | 1019 | unsigned long loop; |
1021 | struct list_head *list; | 1020 | struct list_head *list; |
1022 | int ret = 0; | 1021 | int ret = 0; |
1023 | 1022 | ||
1024 | list = &mz->lists[lru]; | 1023 | list = &mz->lists[lru]; |
1025 | 1024 | ||
1026 | loop = MEM_CGROUP_ZSTAT(mz, lru); | 1025 | loop = MEM_CGROUP_ZSTAT(mz, lru); |
1027 | /* give some margin against EBUSY etc...*/ | 1026 | /* give some margin against EBUSY etc...*/ |
1028 | loop += 256; | 1027 | loop += 256; |
1029 | busy = NULL; | 1028 | busy = NULL; |
1030 | while (loop--) { | 1029 | while (loop--) { |
1031 | ret = 0; | 1030 | ret = 0; |
1032 | spin_lock_irqsave(&mz->lru_lock, flags); | 1031 | spin_lock_irqsave(&mz->lru_lock, flags); |
1033 | if (list_empty(list)) { | 1032 | if (list_empty(list)) { |
1034 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1033 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
1035 | break; | 1034 | break; |
1036 | } | 1035 | } |
1037 | pc = list_entry(list->prev, struct page_cgroup, lru); | 1036 | pc = list_entry(list->prev, struct page_cgroup, lru); |
1038 | if (busy == pc) { | 1037 | if (busy == pc) { |
1039 | list_move(&pc->lru, list); | 1038 | list_move(&pc->lru, list); |
1040 | busy = 0; | 1039 | busy = 0; |
1041 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1040 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
1042 | continue; | 1041 | continue; |
1043 | } | 1042 | } |
1044 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1043 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
1045 | 1044 | ||
1046 | ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE); | 1045 | ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE); |
1047 | if (ret == -ENOMEM) | 1046 | if (ret == -ENOMEM) |
1048 | break; | 1047 | break; |
1049 | 1048 | ||
1050 | if (ret == -EBUSY || ret == -EINVAL) { | 1049 | if (ret == -EBUSY || ret == -EINVAL) { |
1051 | /* found lock contention or "pc" is obsolete. */ | 1050 | /* found lock contention or "pc" is obsolete. */ |
1052 | busy = pc; | 1051 | busy = pc; |
1053 | cond_resched(); | 1052 | cond_resched(); |
1054 | } else | 1053 | } else |
1055 | busy = NULL; | 1054 | busy = NULL; |
1056 | } | 1055 | } |
1057 | if (!ret && !list_empty(list)) | 1056 | if (!ret && !list_empty(list)) |
1058 | return -EBUSY; | 1057 | return -EBUSY; |
1059 | return ret; | 1058 | return ret; |
1060 | } | 1059 | } |
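/*
 * Editorial sketch (not part of the commit): mem_cgroup_force_empty_list()
 * above always takes the entry at the tail of the LRU list; if that entry
 * is the same one that just failed with -EBUSY/-EINVAL, it is rotated to
 * the head of the list so the scan can make progress on the rest, and the
 * whole walk is bounded by a generous margin (the "loop += 256" above).
 * A tiny userspace model of that "remember the busy one and rotate it
 * away" pattern, using an illustrative array instead of the kernel's
 * list_head:
 */
#include <stdio.h>

#define N 4

int main(void)
{
	int pages[N] = { 1, 2, 3, 4 };	/* page 3 pretends to be busy once */
	int len = N, busy = 0, failed_once = 0;
	int loop = N + 2;		/* margin, like "loop += 256" above */

	while (loop-- && len) {
		int pc = pages[len - 1];		/* tail entry */

		if (busy == pc) {
			/* rotate the busy entry to the head and retry */
			for (int i = len - 1; i > 0; i--)
				pages[i] = pages[i - 1];
			pages[0] = pc;
			busy = 0;
			continue;
		}
		if (pc == 3 && !failed_once) {		/* simulate one -EBUSY */
			failed_once = 1;
			busy = pc;
			continue;
		}
		printf("moved page %d to parent\n", pc);
		len--;					/* entry left the list */
	}
	return 0;
}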
1061 | 1060 | ||
1062 | /* | 1061 | /* |
1063 | * make mem_cgroup's charge to be 0 if there is no task. | 1062 | * make mem_cgroup's charge to be 0 if there is no task. |
1064 | * This enables deleting this mem_cgroup. | 1063 | * This enables deleting this mem_cgroup. |
1065 | */ | 1064 | */ |
1066 | static int mem_cgroup_force_empty(struct mem_cgroup *mem) | 1065 | static int mem_cgroup_force_empty(struct mem_cgroup *mem) |
1067 | { | 1066 | { |
1068 | int ret; | 1067 | int ret; |
1069 | int node, zid, shrink; | 1068 | int node, zid, shrink; |
1070 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1069 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1071 | 1070 | ||
1072 | css_get(&mem->css); | 1071 | css_get(&mem->css); |
1073 | 1072 | ||
1074 | shrink = 0; | 1073 | shrink = 0; |
1075 | move_account: | 1074 | move_account: |
1076 | while (mem->res.usage > 0) { | 1075 | while (mem->res.usage > 0) { |
1077 | ret = -EBUSY; | 1076 | ret = -EBUSY; |
1078 | if (atomic_read(&mem->css.cgroup->count) > 0) | 1077 | if (atomic_read(&mem->css.cgroup->count) > 0) |
1079 | goto out; | 1078 | goto out; |
1080 | 1079 | ||
1081 | /* This is for making all *used* pages to be on LRU. */ | 1080 | /* This is for making all *used* pages to be on LRU. */ |
1082 | lru_add_drain_all(); | 1081 | lru_add_drain_all(); |
1083 | ret = 0; | 1082 | ret = 0; |
1084 | for_each_node_state(node, N_POSSIBLE) { | 1083 | for_each_node_state(node, N_POSSIBLE) { |
1085 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 1084 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
1086 | struct mem_cgroup_per_zone *mz; | 1085 | struct mem_cgroup_per_zone *mz; |
1087 | enum lru_list l; | 1086 | enum lru_list l; |
1088 | mz = mem_cgroup_zoneinfo(mem, node, zid); | 1087 | mz = mem_cgroup_zoneinfo(mem, node, zid); |
1089 | for_each_lru(l) { | 1088 | for_each_lru(l) { |
1090 | ret = mem_cgroup_force_empty_list(mem, | 1089 | ret = mem_cgroup_force_empty_list(mem, |
1091 | mz, l); | 1090 | mz, l); |
1092 | if (ret) | 1091 | if (ret) |
1093 | break; | 1092 | break; |
1094 | } | 1093 | } |
1095 | } | 1094 | } |
1096 | if (ret) | 1095 | if (ret) |
1097 | break; | 1096 | break; |
1098 | } | 1097 | } |
1099 | /* it seems parent cgroup doesn't have enough mem */ | 1098 | /* it seems parent cgroup doesn't have enough mem */ |
1100 | if (ret == -ENOMEM) | 1099 | if (ret == -ENOMEM) |
1101 | goto try_to_free; | 1100 | goto try_to_free; |
1102 | cond_resched(); | 1101 | cond_resched(); |
1103 | } | 1102 | } |
1104 | ret = 0; | 1103 | ret = 0; |
1105 | out: | 1104 | out: |
1106 | css_put(&mem->css); | 1105 | css_put(&mem->css); |
1107 | return ret; | 1106 | return ret; |
1108 | 1107 | ||
1109 | try_to_free: | 1108 | try_to_free: |
1110 | /* returns EBUSY if we come here twice. */ | 1109 | /* returns EBUSY if we come here twice. */ |
1111 | if (shrink) { | 1110 | if (shrink) { |
1112 | ret = -EBUSY; | 1111 | ret = -EBUSY; |
1113 | goto out; | 1112 | goto out; |
1114 | } | 1113 | } |
1115 | /* try to free all pages in this cgroup */ | 1114 | /* try to free all pages in this cgroup */ |
1116 | shrink = 1; | 1115 | shrink = 1; |
1117 | while (nr_retries && mem->res.usage > 0) { | 1116 | while (nr_retries && mem->res.usage > 0) { |
1118 | int progress; | 1117 | int progress; |
1119 | progress = try_to_free_mem_cgroup_pages(mem, | 1118 | progress = try_to_free_mem_cgroup_pages(mem, |
1120 | GFP_HIGHUSER_MOVABLE); | 1119 | GFP_HIGHUSER_MOVABLE); |
1121 | if (!progress) | 1120 | if (!progress) |
1122 | nr_retries--; | 1121 | nr_retries--; |
1123 | 1122 | ||
1124 | } | 1123 | } |
1125 | /* try move_account...there may be some *locked* pages. */ | 1124 | /* try move_account...there may be some *locked* pages. */ |
1126 | if (mem->res.usage) | 1125 | if (mem->res.usage) |
1127 | goto move_account; | 1126 | goto move_account; |
1128 | ret = 0; | 1127 | ret = 0; |
1129 | goto out; | 1128 | goto out; |
1130 | } | 1129 | } |
1131 | 1130 | ||
1132 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 1131 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
1133 | { | 1132 | { |
1134 | return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, | 1133 | return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, |
1135 | cft->private); | 1134 | cft->private); |
1136 | } | 1135 | } |
1137 | /* | 1136 | /* |
1138 | * The user of this function is... | 1137 | * The user of this function is... |
1139 | * RES_LIMIT. | 1138 | * RES_LIMIT. |
1140 | */ | 1139 | */ |
1141 | static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | 1140 | static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, |
1142 | const char *buffer) | 1141 | const char *buffer) |
1143 | { | 1142 | { |
1144 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 1143 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
1145 | unsigned long long val; | 1144 | unsigned long long val; |
1146 | int ret; | 1145 | int ret; |
1147 | 1146 | ||
1148 | switch (cft->private) { | 1147 | switch (cft->private) { |
1149 | case RES_LIMIT: | 1148 | case RES_LIMIT: |
1150 | /* This function does all necessary parse...reuse it */ | 1149 | /* This function does all necessary parse...reuse it */ |
1151 | ret = res_counter_memparse_write_strategy(buffer, &val); | 1150 | ret = res_counter_memparse_write_strategy(buffer, &val); |
1152 | if (!ret) | 1151 | if (!ret) |
1153 | ret = mem_cgroup_resize_limit(memcg, val); | 1152 | ret = mem_cgroup_resize_limit(memcg, val); |
1154 | break; | 1153 | break; |
1155 | default: | 1154 | default: |
1156 | ret = -EINVAL; /* should be BUG() ? */ | 1155 | ret = -EINVAL; /* should be BUG() ? */ |
1157 | break; | 1156 | break; |
1158 | } | 1157 | } |
1159 | return ret; | 1158 | return ret; |
1160 | } | 1159 | } |
1161 | 1160 | ||
1162 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 1161 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
1163 | { | 1162 | { |
1164 | struct mem_cgroup *mem; | 1163 | struct mem_cgroup *mem; |
1165 | 1164 | ||
1166 | mem = mem_cgroup_from_cont(cont); | 1165 | mem = mem_cgroup_from_cont(cont); |
1167 | switch (event) { | 1166 | switch (event) { |
1168 | case RES_MAX_USAGE: | 1167 | case RES_MAX_USAGE: |
1169 | res_counter_reset_max(&mem->res); | 1168 | res_counter_reset_max(&mem->res); |
1170 | break; | 1169 | break; |
1171 | case RES_FAILCNT: | 1170 | case RES_FAILCNT: |
1172 | res_counter_reset_failcnt(&mem->res); | 1171 | res_counter_reset_failcnt(&mem->res); |
1173 | break; | 1172 | break; |
1174 | } | 1173 | } |
1175 | return 0; | 1174 | return 0; |
1176 | } | 1175 | } |
1177 | 1176 | ||
1178 | static const struct mem_cgroup_stat_desc { | 1177 | static const struct mem_cgroup_stat_desc { |
1179 | const char *msg; | 1178 | const char *msg; |
1180 | u64 unit; | 1179 | u64 unit; |
1181 | } mem_cgroup_stat_desc[] = { | 1180 | } mem_cgroup_stat_desc[] = { |
1182 | [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, | 1181 | [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, |
1183 | [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, | 1182 | [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, |
1184 | [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, | 1183 | [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, |
1185 | [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, | 1184 | [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, |
1186 | }; | 1185 | }; |
1187 | 1186 | ||
1188 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 1187 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, |
1189 | struct cgroup_map_cb *cb) | 1188 | struct cgroup_map_cb *cb) |
1190 | { | 1189 | { |
1191 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 1190 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); |
1192 | struct mem_cgroup_stat *stat = &mem_cont->stat; | 1191 | struct mem_cgroup_stat *stat = &mem_cont->stat; |
1193 | int i; | 1192 | int i; |
1194 | 1193 | ||
1195 | for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { | 1194 | for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { |
1196 | s64 val; | 1195 | s64 val; |
1197 | 1196 | ||
1198 | val = mem_cgroup_read_stat(stat, i); | 1197 | val = mem_cgroup_read_stat(stat, i); |
1199 | val *= mem_cgroup_stat_desc[i].unit; | 1198 | val *= mem_cgroup_stat_desc[i].unit; |
1200 | cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); | 1199 | cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); |
1201 | } | 1200 | } |
1202 | /* showing # of active pages */ | 1201 | /* showing # of active pages */ |
1203 | { | 1202 | { |
1204 | unsigned long active_anon, inactive_anon; | 1203 | unsigned long active_anon, inactive_anon; |
1205 | unsigned long active_file, inactive_file; | 1204 | unsigned long active_file, inactive_file; |
1206 | unsigned long unevictable; | 1205 | unsigned long unevictable; |
1207 | 1206 | ||
1208 | inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, | 1207 | inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, |
1209 | LRU_INACTIVE_ANON); | 1208 | LRU_INACTIVE_ANON); |
1210 | active_anon = mem_cgroup_get_all_zonestat(mem_cont, | 1209 | active_anon = mem_cgroup_get_all_zonestat(mem_cont, |
1211 | LRU_ACTIVE_ANON); | 1210 | LRU_ACTIVE_ANON); |
1212 | inactive_file = mem_cgroup_get_all_zonestat(mem_cont, | 1211 | inactive_file = mem_cgroup_get_all_zonestat(mem_cont, |
1213 | LRU_INACTIVE_FILE); | 1212 | LRU_INACTIVE_FILE); |
1214 | active_file = mem_cgroup_get_all_zonestat(mem_cont, | 1213 | active_file = mem_cgroup_get_all_zonestat(mem_cont, |
1215 | LRU_ACTIVE_FILE); | 1214 | LRU_ACTIVE_FILE); |
1216 | unevictable = mem_cgroup_get_all_zonestat(mem_cont, | 1215 | unevictable = mem_cgroup_get_all_zonestat(mem_cont, |
1217 | LRU_UNEVICTABLE); | 1216 | LRU_UNEVICTABLE); |
1218 | 1217 | ||
1219 | cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); | 1218 | cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); |
1220 | cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); | 1219 | cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); |
1221 | cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); | 1220 | cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); |
1222 | cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); | 1221 | cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); |
1223 | cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); | 1222 | cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); |
1224 | 1223 | ||
1225 | } | 1224 | } |
1226 | return 0; | 1225 | return 0; |
1227 | } | 1226 | } |
1228 | 1227 | ||
1229 | static struct cftype mem_cgroup_files[] = { | 1228 | static struct cftype mem_cgroup_files[] = { |
1230 | { | 1229 | { |
1231 | .name = "usage_in_bytes", | 1230 | .name = "usage_in_bytes", |
1232 | .private = RES_USAGE, | 1231 | .private = RES_USAGE, |
1233 | .read_u64 = mem_cgroup_read, | 1232 | .read_u64 = mem_cgroup_read, |
1234 | }, | 1233 | }, |
1235 | { | 1234 | { |
1236 | .name = "max_usage_in_bytes", | 1235 | .name = "max_usage_in_bytes", |
1237 | .private = RES_MAX_USAGE, | 1236 | .private = RES_MAX_USAGE, |
1238 | .trigger = mem_cgroup_reset, | 1237 | .trigger = mem_cgroup_reset, |
1239 | .read_u64 = mem_cgroup_read, | 1238 | .read_u64 = mem_cgroup_read, |
1240 | }, | 1239 | }, |
1241 | { | 1240 | { |
1242 | .name = "limit_in_bytes", | 1241 | .name = "limit_in_bytes", |
1243 | .private = RES_LIMIT, | 1242 | .private = RES_LIMIT, |
1244 | .write_string = mem_cgroup_write, | 1243 | .write_string = mem_cgroup_write, |
1245 | .read_u64 = mem_cgroup_read, | 1244 | .read_u64 = mem_cgroup_read, |
1246 | }, | 1245 | }, |
1247 | { | 1246 | { |
1248 | .name = "failcnt", | 1247 | .name = "failcnt", |
1249 | .private = RES_FAILCNT, | 1248 | .private = RES_FAILCNT, |
1250 | .trigger = mem_cgroup_reset, | 1249 | .trigger = mem_cgroup_reset, |
1251 | .read_u64 = mem_cgroup_read, | 1250 | .read_u64 = mem_cgroup_read, |
1252 | }, | 1251 | }, |
1253 | { | 1252 | { |
1254 | .name = "stat", | 1253 | .name = "stat", |
1255 | .read_map = mem_control_stat_show, | 1254 | .read_map = mem_control_stat_show, |
1256 | }, | 1255 | }, |
1257 | }; | 1256 | }; |
1258 | 1257 | ||
1259 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 1258 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) |
1260 | { | 1259 | { |
1261 | struct mem_cgroup_per_node *pn; | 1260 | struct mem_cgroup_per_node *pn; |
1262 | struct mem_cgroup_per_zone *mz; | 1261 | struct mem_cgroup_per_zone *mz; |
1263 | enum lru_list l; | 1262 | enum lru_list l; |
1264 | int zone, tmp = node; | 1263 | int zone, tmp = node; |
1265 | /* | 1264 | /* |
1266 | * This routine is called against possible nodes. | 1265 | * This routine is called against possible nodes. |
1267 | * But it's BUG to call kmalloc() against offline node. | 1266 | * But it's BUG to call kmalloc() against offline node. |
1268 | * | 1267 | * |
1269 | * TODO: this routine can waste much memory for nodes which will | 1268 | * TODO: this routine can waste much memory for nodes which will |
1270 | * never be onlined. It's better to use memory hotplug callback | 1269 | * never be onlined. It's better to use memory hotplug callback |
1271 | * function. | 1270 | * function. |
1272 | */ | 1271 | */ |
1273 | if (!node_state(node, N_NORMAL_MEMORY)) | 1272 | if (!node_state(node, N_NORMAL_MEMORY)) |
1274 | tmp = -1; | 1273 | tmp = -1; |
1275 | pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); | 1274 | pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); |
1276 | if (!pn) | 1275 | if (!pn) |
1277 | return 1; | 1276 | return 1; |
1278 | 1277 | ||
1279 | mem->info.nodeinfo[node] = pn; | 1278 | mem->info.nodeinfo[node] = pn; |
1280 | memset(pn, 0, sizeof(*pn)); | 1279 | memset(pn, 0, sizeof(*pn)); |
1281 | 1280 | ||
1282 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 1281 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
1283 | mz = &pn->zoneinfo[zone]; | 1282 | mz = &pn->zoneinfo[zone]; |
1284 | spin_lock_init(&mz->lru_lock); | 1283 | spin_lock_init(&mz->lru_lock); |
1285 | for_each_lru(l) | 1284 | for_each_lru(l) |
1286 | INIT_LIST_HEAD(&mz->lists[l]); | 1285 | INIT_LIST_HEAD(&mz->lists[l]); |
1287 | } | 1286 | } |
1288 | return 0; | 1287 | return 0; |
1289 | } | 1288 | } |
1290 | 1289 | ||
1291 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 1290 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) |
1292 | { | 1291 | { |
1293 | kfree(mem->info.nodeinfo[node]); | 1292 | kfree(mem->info.nodeinfo[node]); |
1294 | } | 1293 | } |
1295 | 1294 | ||
1295 | static int mem_cgroup_size(void) | ||
1296 | { | ||
1297 | int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); | ||
1298 | return sizeof(struct mem_cgroup) + cpustat_size; | ||
1299 | } | ||
1300 | |||
1296 | static struct mem_cgroup *mem_cgroup_alloc(void) | 1301 | static struct mem_cgroup *mem_cgroup_alloc(void) |
1297 | { | 1302 | { |
1298 | struct mem_cgroup *mem; | 1303 | struct mem_cgroup *mem; |
1304 | int size = mem_cgroup_size(); | ||
1299 | 1305 | ||
1300 | if (sizeof(*mem) < PAGE_SIZE) | 1306 | if (size < PAGE_SIZE) |
1301 | mem = kmalloc(sizeof(*mem), GFP_KERNEL); | 1307 | mem = kmalloc(size, GFP_KERNEL); |
1302 | else | 1308 | else |
1303 | mem = vmalloc(sizeof(*mem)); | 1309 | mem = vmalloc(size); |
1304 | 1310 | ||
1305 | if (mem) | 1311 | if (mem) |
1306 | memset(mem, 0, sizeof(*mem)); | 1312 | memset(mem, 0, size); |
1307 | return mem; | 1313 | return mem; |
1308 | } | 1314 | } |
1309 | 1315 | ||
1310 | static void mem_cgroup_free(struct mem_cgroup *mem) | 1316 | static void mem_cgroup_free(struct mem_cgroup *mem) |
1311 | { | 1317 | { |
1312 | if (sizeof(*mem) < PAGE_SIZE) | 1318 | if (mem_cgroup_size() < PAGE_SIZE) |
1313 | kfree(mem); | 1319 | kfree(mem); |
1314 | else | 1320 | else |
1315 | vfree(mem); | 1321 | vfree(mem); |
1316 | } | 1322 | } |
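/*
 * Editorial sketch (not part of the commit): mem_cgroup_size() above sizes
 * the allocation as sizeof(struct mem_cgroup) plus one mem_cgroup_stat_cpu
 * slot per possible CPU id (nr_cpu_ids), rather than the compile-time
 * NR_CPUS maximum; nr_cpu_ids is never larger than NR_CPUS and is usually
 * much smaller.  A self-contained userspace model of the saving, with a
 * made-up structure layout and made-up CPU counts purely for illustration:
 */
#include <stdio.h>
#include <stdlib.h>

#define FAKE_NR_CPUS	4096	/* illustrative compile-time maximum */
#define STAT_ITEMS	4

struct fake_stat_cpu {
	long count[STAT_ITEMS];
};

struct fake_memcg {
	long res_usage;				/* stands in for the res_counter etc. */
	struct fake_stat_cpu cpustat[];		/* per-cpu stats as a flexible tail */
};

static size_t memcg_size(int cpus)
{
	return sizeof(struct fake_memcg) +
	       cpus * sizeof(struct fake_stat_cpu);
}

int main(void)
{
	int nr_cpu_ids = 8;		/* CPUs actually possible on this box */
	size_t old_size = memcg_size(FAKE_NR_CPUS);
	size_t new_size = memcg_size(nr_cpu_ids);
	struct fake_memcg *mem = calloc(1, new_size);	/* like kmalloc + memset */

	if (!mem)
		return 1;
	printf("NR_CPUS-based: %zu bytes, nr_cpu_ids-based: %zu bytes\n",
	       old_size, new_size);
	free(mem);
	return 0;
}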
1317 | 1323 | ||
1318 | 1324 | ||
1319 | static struct cgroup_subsys_state * | 1325 | static struct cgroup_subsys_state * |
1320 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 1326 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
1321 | { | 1327 | { |
1322 | struct mem_cgroup *mem; | 1328 | struct mem_cgroup *mem; |
1323 | int node; | 1329 | int node; |
1324 | 1330 | ||
1325 | if (unlikely((cont->parent) == NULL)) { | 1331 | mem = mem_cgroup_alloc(); |
1326 | mem = &init_mem_cgroup; | 1332 | if (!mem) |
1327 | } else { | 1333 | return ERR_PTR(-ENOMEM); |
1328 | mem = mem_cgroup_alloc(); | ||
1329 | if (!mem) | ||
1330 | return ERR_PTR(-ENOMEM); | ||
1331 | } | ||
1332 | 1334 | ||
1333 | res_counter_init(&mem->res); | 1335 | res_counter_init(&mem->res); |
1334 | 1336 | ||
1335 | for_each_node_state(node, N_POSSIBLE) | 1337 | for_each_node_state(node, N_POSSIBLE) |
1336 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | 1338 | if (alloc_mem_cgroup_per_zone_info(mem, node)) |
1337 | goto free_out; | 1339 | goto free_out; |
1338 | 1340 | ||
1339 | return &mem->css; | 1341 | return &mem->css; |
1340 | free_out: | 1342 | free_out: |
1341 | for_each_node_state(node, N_POSSIBLE) | 1343 | for_each_node_state(node, N_POSSIBLE) |
1342 | free_mem_cgroup_per_zone_info(mem, node); | 1344 | free_mem_cgroup_per_zone_info(mem, node); |
1343 | if (cont->parent != NULL) | 1345 | mem_cgroup_free(mem); |
1344 | mem_cgroup_free(mem); | ||
1345 | return ERR_PTR(-ENOMEM); | 1346 | return ERR_PTR(-ENOMEM); |
1346 | } | 1347 | } |
1347 | 1348 | ||
1348 | static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | 1349 | static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, |
1349 | struct cgroup *cont) | 1350 | struct cgroup *cont) |
1350 | { | 1351 | { |
1351 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 1352 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
1352 | mem_cgroup_force_empty(mem); | 1353 | mem_cgroup_force_empty(mem); |
1353 | } | 1354 | } |
1354 | 1355 | ||
1355 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, | 1356 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, |
1356 | struct cgroup *cont) | 1357 | struct cgroup *cont) |
1357 | { | 1358 | { |
1358 | int node; | 1359 | int node; |
1359 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 1360 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
1360 | 1361 | ||
1361 | for_each_node_state(node, N_POSSIBLE) | 1362 | for_each_node_state(node, N_POSSIBLE) |
1362 | free_mem_cgroup_per_zone_info(mem, node); | 1363 | free_mem_cgroup_per_zone_info(mem, node); |
1363 | 1364 | ||
1364 | mem_cgroup_free(mem_cgroup_from_cont(cont)); | 1365 | mem_cgroup_free(mem_cgroup_from_cont(cont)); |
1365 | } | 1366 | } |
1366 | 1367 | ||
1367 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | 1368 | static int mem_cgroup_populate(struct cgroup_subsys *ss, |
1368 | struct cgroup *cont) | 1369 | struct cgroup *cont) |
1369 | { | 1370 | { |
1370 | return cgroup_add_files(cont, ss, mem_cgroup_files, | 1371 | return cgroup_add_files(cont, ss, mem_cgroup_files, |
1371 | ARRAY_SIZE(mem_cgroup_files)); | 1372 | ARRAY_SIZE(mem_cgroup_files)); |
1372 | } | 1373 | } |
1373 | 1374 | ||
1374 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 1375 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
1375 | struct cgroup *cont, | 1376 | struct cgroup *cont, |
1376 | struct cgroup *old_cont, | 1377 | struct cgroup *old_cont, |
1377 | struct task_struct *p) | 1378 | struct task_struct *p) |
1378 | { | 1379 | { |
1379 | struct mm_struct *mm; | 1380 | struct mm_struct *mm; |
1380 | struct mem_cgroup *mem, *old_mem; | 1381 | struct mem_cgroup *mem, *old_mem; |
1381 | 1382 | ||
1382 | mm = get_task_mm(p); | 1383 | mm = get_task_mm(p); |
1383 | if (mm == NULL) | 1384 | if (mm == NULL) |
1384 | return; | 1385 | return; |
1385 | 1386 | ||
1386 | mem = mem_cgroup_from_cont(cont); | 1387 | mem = mem_cgroup_from_cont(cont); |
1387 | old_mem = mem_cgroup_from_cont(old_cont); | 1388 | old_mem = mem_cgroup_from_cont(old_cont); |
1388 | 1389 | ||
1389 | /* | 1390 | /* |
1390 | * Only thread group leaders are allowed to migrate, the mm_struct is | 1391 | * Only thread group leaders are allowed to migrate, the mm_struct is |
1391 | * in effect owned by the leader | 1392 | * in effect owned by the leader |
1392 | */ | 1393 | */ |
1393 | if (!thread_group_leader(p)) | 1394 | if (!thread_group_leader(p)) |
1394 | goto out; | 1395 | goto out; |
1395 | 1396 | ||
1396 | out: | 1397 | out: |
1397 | mmput(mm); | 1398 | mmput(mm); |
1398 | } | 1399 | } |
1399 | 1400 | ||
1400 | struct cgroup_subsys mem_cgroup_subsys = { | 1401 | struct cgroup_subsys mem_cgroup_subsys = { |
1401 | .name = "memory", | 1402 | .name = "memory", |
1402 | .subsys_id = mem_cgroup_subsys_id, | 1403 | .subsys_id = mem_cgroup_subsys_id, |
1403 | .create = mem_cgroup_create, | 1404 | .create = mem_cgroup_create, |
1404 | .pre_destroy = mem_cgroup_pre_destroy, | 1405 | .pre_destroy = mem_cgroup_pre_destroy, |