Commit 08e552c69c6930d64722de3ec18c51844d06ee28
Committed by: Linus Torvalds
Parent: 8c7c6e34a1
Exists in: master and 4 other branches
memcg: synchronized LRU
A big patch changing memcg's LRU semantics. Now:

 - page_cgroup is linked to mem_cgroup's own LRU (per zone).
 - The LRU of page_cgroup is not synchronized with the global LRU.
 - page and page_cgroup are one-to-one and statically allocated.
 - To find which LRU a page_cgroup is on, you have to check pc->mem_cgroup, as in
	lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
 - SwapCache is handled.

When we handle the LRU list of a page_cgroup, we currently do the following:

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc); .....................(1)
	mz = page_cgroup_zoneinfo(pc);
	spin_lock(&mz->lru_lock);
	.....add to LRU
	spin_unlock(&mz->lru_lock);
	unlock_page_cgroup(pc);

But (1) is a spin lock, and we have to worry about deadlock against zone->lru_lock, so trylock() is used at (1) for now. Without (1), we can't trust that "mz" is correct.

This patch is an attempt to remove that dirty nesting of locks. It changes mz->lru_lock to be zone->lru_lock, so the sequence above becomes:

	spin_lock(&zone->lru_lock);	# in vmscan.c or swap.c via global LRU
	mem_cgroup_add/remove/etc_lru() {
		pc = lookup_page_cgroup(page);
		mz = page_cgroup_zoneinfo(pc);
		if (PageCgroupUsed(pc)) {
			.....add to LRU
		}
	}
	spin_unlock(&zone->lru_lock);	# in vmscan.c or swap.c via global LRU

This is much simpler.

(*) We're safe even if we don't take lock_page_cgroup(pc), because:
 1. pc->mem_cgroup is modified only at charge and at account_move().
 2. At charge, the PCG_USED bit is not set before pc->mem_cgroup is fixed.
 3. At account_move(), the page is isolated and not on any LRU.

Pros:
 - easier to maintain.
 - memcg can make use of the laziness of pagevecs.
 - we don't have to duplicate the LRU/Active/Unevictable bits in page_cgroup.
 - the LRU status of memcg stays synchronized with the global LRU's.
 - the number of locks is reduced.
 - account_move() is simplified very much.

Cons:
 - may increase the cost of LRU rotation. (No impact if memcg is not configured.)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
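[Editor's note] The lockless rule in (*) above is a publish/subscribe pattern: charge publishes pc->mem_cgroup before setting PCG_USED (the patch adds an smp_wmb() between the two stores), and the LRU side observes PCG_USED via an smp_rmb() before trusting pc->mem_cgroup. A minimal userspace sketch of that pairing, with C11 atomics standing in for the kernel barriers; all names are simplified mirrors of the patch, not kernel code:

	#include <stdatomic.h>
	#include <stddef.h>

	struct mem_cgroup { int id; };

	struct page_cgroup {
		struct mem_cgroup *mem_cgroup;
		atomic_ulong flags;		/* bit 0 plays the role of PCG_USED */
	};

	#define PCG_USED (1UL << 0)

	/* charge side: fix pc->mem_cgroup, then make USED visible */
	static void charge(struct page_cgroup *pc, struct mem_cgroup *mem)
	{
		pc->mem_cgroup = mem;
		/* release = the patch's smp_wmb() followed by the flags store */
		atomic_fetch_or_explicit(&pc->flags, PCG_USED, memory_order_release);
	}

	/* LRU side: only trust pc->mem_cgroup after seeing USED */
	static struct mem_cgroup *lru_side_lookup(struct page_cgroup *pc)
	{
		/* acquire = the patch's flags load plus smp_rmb() */
		unsigned long f = atomic_load_explicit(&pc->flags, memory_order_acquire);

		if (!(f & PCG_USED))
			return NULL;	/* unused page_cgroup: don't touch memcg LRU */
		return pc->mem_cgroup;
	}

	int main(void)
	{
		static struct mem_cgroup mem = { .id = 1 };
		static struct page_cgroup pc;

		atomic_init(&pc.flags, 0);
		charge(&pc, &mem);
		return lru_side_lookup(&pc) == &mem ? 0 : 1;
	}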
Showing 8 changed files with 178 additions and 206 deletions
fs/splice.c
include/linux/memcontrol.h
... | ... | @@ -40,7 +40,12 @@ |
40 | 40 | |
41 | 41 | extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
42 | 42 | gfp_t gfp_mask); |
43 | -extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru); | |
43 | +extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru); | |
44 | +extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru); | |
45 | +extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru); | |
46 | +extern void mem_cgroup_del_lru(struct page *page); | |
47 | +extern void mem_cgroup_move_lists(struct page *page, | |
48 | + enum lru_list from, enum lru_list to); | |
44 | 49 | extern void mem_cgroup_uncharge_page(struct page *page); |
45 | 50 | extern void mem_cgroup_uncharge_cache_page(struct page *page); |
46 | 51 | extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask); |
... | ... | @@ -131,7 +136,27 @@ |
131 | 136 | return 0; |
132 | 137 | } |
133 | 138 | |
134 | -static inline void mem_cgroup_move_lists(struct page *page, bool active) | |
139 | +static inline void mem_cgroup_add_lru_list(struct page *page, int lru) | |
140 | +{ | |
141 | +} | |
142 | + | |
143 | +static inline void mem_cgroup_del_lru_list(struct page *page, int lru) | |
144 | +{ | |
145 | + return ; | |
146 | +} | |
147 | + | |
148 | +static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru) | |
149 | +{ | |
150 | + return ; | |
151 | +} | |
152 | + | |
153 | +static inline void mem_cgroup_del_lru(struct page *page) | |
154 | +{ | |
155 | + return ; | |
156 | +} | |
157 | + | |
158 | +static inline void | |
159 | +mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to) | |
135 | 160 | { |
136 | 161 | } |
137 | 162 |
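[Editor's note] The second hunk above fills in the disabled-memcg side of the header: every new LRU hook gets an empty static inline stub, so call sites in mm_inline.h need no #ifdefs and cost nothing when the controller is compiled out. A compile-testable illustration of that standard kernel pattern, with stub types standing in for the real ones:

	struct page;
	enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON };

	#ifdef CONFIG_CGROUP_MEM_RES_CTLR
	extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru);
	#else
	/* memcg disabled: the call compiles away to nothing */
	static inline void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
	{
	}
	#endif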
include/linux/mm_inline.h
... | ... | @@ -28,6 +28,7 @@ |
28 | 28 | { |
29 | 29 | list_add(&page->lru, &zone->lru[l].list); |
30 | 30 | __inc_zone_state(zone, NR_LRU_BASE + l); |
31 | + mem_cgroup_add_lru_list(page, l); | |
31 | 32 | } |
32 | 33 | |
33 | 34 | static inline void |
... | ... | @@ -35,6 +36,7 @@ |
35 | 36 | { |
36 | 37 | list_del(&page->lru); |
37 | 38 | __dec_zone_state(zone, NR_LRU_BASE + l); |
39 | + mem_cgroup_del_lru_list(page, l); | |
38 | 40 | } |
39 | 41 | |
40 | 42 | static inline void |
... | ... | @@ -54,6 +56,7 @@ |
54 | 56 | l += page_is_file_cache(page); |
55 | 57 | } |
56 | 58 | __dec_zone_state(zone, NR_LRU_BASE + l); |
59 | + mem_cgroup_del_lru_list(page, l); | |
57 | 60 | } |
58 | 61 | |
59 | 62 | /** |
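[Editor's note] These three hunks are the whole synchronization story on the add/delete side: each global-LRU list operation already runs under zone->lru_lock, so calling the memcg hook at the same spot keeps the per-cgroup LRU in lockstep with no extra locking. A runnable userspace sketch of that mirroring invariant (all types, lists, and counters are simplified stand-ins, not the kernel's):

	#include <stdio.h>

	struct page { int on_global_lru; int on_memcg_lru; };
	struct zone { int nr_lru; /* stands in for the NR_LRU_BASE + l counters */ };

	/* the hooks this patch introduces (no-ops when memcg is disabled) */
	static void mem_cgroup_add_lru_list(struct page *page) { page->on_memcg_lru = 1; }
	static void mem_cgroup_del_lru_list(struct page *page) { page->on_memcg_lru = 0; }

	/* callers hold zone->lru_lock, as in vmscan.c / swap.c */
	static void add_page_to_lru_list(struct zone *zone, struct page *page)
	{
		page->on_global_lru = 1;	/* list_add(&page->lru, ...) */
		zone->nr_lru++;			/* __inc_zone_state(...) */
		mem_cgroup_add_lru_list(page);	/* memcg LRU mirrors the global one */
	}

	static void del_page_from_lru_list(struct zone *zone, struct page *page)
	{
		page->on_global_lru = 0;	/* list_del(&page->lru) */
		zone->nr_lru--;			/* __dec_zone_state(...) */
		mem_cgroup_del_lru_list(page);
	}

	int main(void)
	{
		struct zone z = { 0 };
		struct page p = { 0, 0 };

		add_page_to_lru_list(&z, &p);
		del_page_from_lru_list(&z, &p);
		printf("global=%d memcg=%d nr=%d\n",
		       p.on_global_lru, p.on_memcg_lru, z.nr_lru);
		return 0;
	}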
include/linux/page_cgroup.h
... | ... | @@ -26,10 +26,6 @@ |
26 | 26 | PCG_LOCK, /* page cgroup is locked */ |
27 | 27 | PCG_CACHE, /* charged as cache */ |
28 | 28 | PCG_USED, /* this object is in use. */ |
29 | - /* flags for LRU placement */ | |
30 | - PCG_ACTIVE, /* page is active in this cgroup */ | |
31 | - PCG_FILE, /* page is file system backed */ | |
32 | - PCG_UNEVICTABLE, /* page is unevictableable */ | |
33 | 29 | }; |
34 | 30 | |
35 | 31 | #define TESTPCGFLAG(uname, lname) \ |
... | ... | @@ -49,19 +45,6 @@ |
49 | 45 | |
50 | 46 | TESTPCGFLAG(Used, USED) |
51 | 47 | CLEARPCGFLAG(Used, USED) |
52 | - | |
53 | -/* LRU management flags (from global-lru definition) */ | |
54 | -TESTPCGFLAG(File, FILE) | |
55 | -SETPCGFLAG(File, FILE) | |
56 | -CLEARPCGFLAG(File, FILE) | |
57 | - | |
58 | -TESTPCGFLAG(Active, ACTIVE) | |
59 | -SETPCGFLAG(Active, ACTIVE) | |
60 | -CLEARPCGFLAG(Active, ACTIVE) | |
61 | - | |
62 | -TESTPCGFLAG(Unevictable, UNEVICTABLE) | |
63 | -SETPCGFLAG(Unevictable, UNEVICTABLE) | |
64 | -CLEARPCGFLAG(Unevictable, UNEVICTABLE) | |
65 | 48 | |
66 | 49 | static inline int page_cgroup_nid(struct page_cgroup *pc) |
67 | 50 | { |
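[Editor's note] The removed PCG_ACTIVE/PCG_FILE/PCG_UNEVICTABLE bits were pure duplicates of global LRU state, which the new scheme derives from the page itself (see page_lru() in mem_cgroup_del_lru() below). For readers new to the *PCGFLAG generators kept above, this is roughly what one expansion produces, sketched with plain bit operations where the kernel macros use atomic test_bit()/set_bit()/clear_bit():

	#include <stdbool.h>

	struct page_cgroup { unsigned long flags; };
	enum { PCG_LOCK, PCG_CACHE, PCG_USED };	/* bit indices, as in the enum above */

	/* TESTPCGFLAG(Used, USED) generates roughly: */
	static inline bool PageCgroupUsed(struct page_cgroup *pc)
	{
		return pc->flags & (1UL << PCG_USED);
	}

	/* CLEARPCGFLAG(Used, USED) generates roughly: */
	static inline void ClearPageCgroupUsed(struct page_cgroup *pc)
	{
		pc->flags &= ~(1UL << PCG_USED);	/* kernel version is atomic clear_bit() */
	}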
mm/memcontrol.c
... | ... | @@ -36,6 +36,7 @@ |
36 | 36 | #include <linux/vmalloc.h> |
37 | 37 | #include <linux/mm_inline.h> |
38 | 38 | #include <linux/page_cgroup.h> |
39 | +#include "internal.h" | |
39 | 40 | |
40 | 41 | #include <asm/uaccess.h> |
41 | 42 | |
... | ... | @@ -100,7 +101,6 @@ |
100 | 101 | /* |
101 | 102 | * spin_lock to protect the per cgroup LRU |
102 | 103 | */ |
103 | - spinlock_t lru_lock; | |
104 | 104 | struct list_head lists[NR_LRU_LISTS]; |
105 | 105 | unsigned long count[NR_LRU_LISTS]; |
106 | 106 | }; |
107 | 107 | |
108 | 108 | |
... | ... | @@ -163,14 +163,12 @@ |
163 | 163 | /* only for here (for easy reading.) */ |
164 | 164 | #define PCGF_CACHE (1UL << PCG_CACHE) |
165 | 165 | #define PCGF_USED (1UL << PCG_USED) |
166 | -#define PCGF_ACTIVE (1UL << PCG_ACTIVE) | |
167 | 166 | #define PCGF_LOCK (1UL << PCG_LOCK) |
168 | -#define PCGF_FILE (1UL << PCG_FILE) | |
169 | 167 | static const unsigned long |
170 | 168 | pcg_default_flags[NR_CHARGE_TYPE] = { |
171 | - PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ | |
172 | - PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ | |
173 | - PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | |
169 | + PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ | |
170 | + PCGF_USED | PCGF_LOCK, /* Anon */ | |
171 | + PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | |
174 | 172 | 0, /* FORCE */ |
175 | 173 | }; |
176 | 174 | |
... | ... | @@ -185,9 +183,6 @@ |
185 | 183 | static void mem_cgroup_get(struct mem_cgroup *mem); |
186 | 184 | static void mem_cgroup_put(struct mem_cgroup *mem); |
187 | 185 | |
188 | -/* | |
189 | - * Always modified under lru lock. Then, not necessary to preempt_disable() | |
190 | - */ | |
191 | 186 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
192 | 187 | struct page_cgroup *pc, |
193 | 188 | bool charge) |
194 | 189 | |
... | ... | @@ -195,10 +190,9 @@ |
195 | 190 | int val = (charge)? 1 : -1; |
196 | 191 | struct mem_cgroup_stat *stat = &mem->stat; |
197 | 192 | struct mem_cgroup_stat_cpu *cpustat; |
193 | + int cpu = get_cpu(); | |
198 | 194 | |
199 | - VM_BUG_ON(!irqs_disabled()); | |
200 | - | |
201 | - cpustat = &stat->cpustat[smp_processor_id()]; | |
195 | + cpustat = &stat->cpustat[cpu]; | |
202 | 196 | if (PageCgroupCache(pc)) |
203 | 197 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); |
204 | 198 | else |
... | ... | @@ -210,6 +204,7 @@ |
210 | 204 | else |
211 | 205 | __mem_cgroup_stat_add_safe(cpustat, |
212 | 206 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
207 | + put_cpu(); | |
213 | 208 | } |
214 | 209 | |
215 | 210 | static struct mem_cgroup_per_zone * |
... | ... | @@ -264,82 +259,97 @@ |
264 | 259 | struct mem_cgroup, css); |
265 | 260 | } |
266 | 261 | |
267 | -static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, | |
268 | - struct page_cgroup *pc) | |
262 | +/* | |
263 | + * Following LRU functions are allowed to be used without PCG_LOCK. | |
264 | + * Operations are called by routine of global LRU independently from memcg. | |
265 | + * What we have to take care of here is validness of pc->mem_cgroup. | |
266 | + * | |
267 | + * Changes to pc->mem_cgroup happens when | |
268 | + * 1. charge | |
269 | + * 2. moving account | |
270 | + * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. | |
271 | + * It is added to LRU before charge. | |
272 | + * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. | |
273 | + * When moving account, the page is not on LRU. It's isolated. | |
274 | + */ | |
275 | + | |
276 | +void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | |
269 | 277 | { |
270 | - int lru = LRU_BASE; | |
278 | + struct page_cgroup *pc; | |
279 | + struct mem_cgroup *mem; | |
280 | + struct mem_cgroup_per_zone *mz; | |
271 | 281 | |
272 | - if (PageCgroupUnevictable(pc)) | |
273 | - lru = LRU_UNEVICTABLE; | |
274 | - else { | |
275 | - if (PageCgroupActive(pc)) | |
276 | - lru += LRU_ACTIVE; | |
277 | - if (PageCgroupFile(pc)) | |
278 | - lru += LRU_FILE; | |
279 | - } | |
280 | - | |
282 | + if (mem_cgroup_subsys.disabled) | |
283 | + return; | |
284 | + pc = lookup_page_cgroup(page); | |
285 | + /* can happen while we handle swapcache. */ | |
286 | + if (list_empty(&pc->lru)) | |
287 | + return; | |
288 | + mz = page_cgroup_zoneinfo(pc); | |
289 | + mem = pc->mem_cgroup; | |
281 | 290 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
291 | + list_del_init(&pc->lru); | |
292 | + return; | |
293 | +} | |
282 | 294 | |
283 | - mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); | |
284 | - list_del(&pc->lru); | |
295 | +void mem_cgroup_del_lru(struct page *page) | |
296 | +{ | |
297 | + mem_cgroup_del_lru_list(page, page_lru(page)); | |
285 | 298 | } |
286 | 299 | |
287 | -static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, | |
288 | - struct page_cgroup *pc, bool hot) | |
300 | +void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | |
289 | 301 | { |
290 | - int lru = LRU_BASE; | |
302 | + struct mem_cgroup_per_zone *mz; | |
303 | + struct page_cgroup *pc; | |
291 | 304 | |
292 | - if (PageCgroupUnevictable(pc)) | |
293 | - lru = LRU_UNEVICTABLE; | |
294 | - else { | |
295 | - if (PageCgroupActive(pc)) | |
296 | - lru += LRU_ACTIVE; | |
297 | - if (PageCgroupFile(pc)) | |
298 | - lru += LRU_FILE; | |
299 | - } | |
305 | + if (mem_cgroup_subsys.disabled) | |
306 | + return; | |
300 | 307 | |
301 | - MEM_CGROUP_ZSTAT(mz, lru) += 1; | |
302 | - if (hot) | |
303 | - list_add(&pc->lru, &mz->lists[lru]); | |
304 | - else | |
305 | - list_add_tail(&pc->lru, &mz->lists[lru]); | |
306 | - | |
307 | - mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); | |
308 | + pc = lookup_page_cgroup(page); | |
309 | + smp_rmb(); | |
310 | + /* unused page is not rotated. */ | |
311 | + if (!PageCgroupUsed(pc)) | |
312 | + return; | |
313 | + mz = page_cgroup_zoneinfo(pc); | |
314 | + list_move(&pc->lru, &mz->lists[lru]); | |
308 | 315 | } |
309 | 316 | |
310 | -static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) | |
317 | +void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |
311 | 318 | { |
312 | - struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | |
313 | - int active = PageCgroupActive(pc); | |
314 | - int file = PageCgroupFile(pc); | |
315 | - int unevictable = PageCgroupUnevictable(pc); | |
316 | - enum lru_list from = unevictable ? LRU_UNEVICTABLE : | |
317 | - (LRU_FILE * !!file + !!active); | |
319 | + struct page_cgroup *pc; | |
320 | + struct mem_cgroup_per_zone *mz; | |
318 | 321 | |
319 | - if (lru == from) | |
322 | + if (mem_cgroup_subsys.disabled) | |
320 | 323 | return; |
324 | + pc = lookup_page_cgroup(page); | |
325 | + /* barrier to sync with "charge" */ | |
326 | + smp_rmb(); | |
327 | + if (!PageCgroupUsed(pc)) | |
328 | + return; | |
321 | 329 | |
322 | - MEM_CGROUP_ZSTAT(mz, from) -= 1; | |
323 | - /* | |
324 | - * However this is done under mz->lru_lock, another flags, which | |
325 | - * are not related to LRU, will be modified from out-of-lock. | |
326 | - * We have to use atomic set/clear flags. | |
327 | - */ | |
328 | - if (is_unevictable_lru(lru)) { | |
329 | - ClearPageCgroupActive(pc); | |
330 | - SetPageCgroupUnevictable(pc); | |
331 | - } else { | |
332 | - if (is_active_lru(lru)) | |
333 | - SetPageCgroupActive(pc); | |
334 | - else | |
335 | - ClearPageCgroupActive(pc); | |
336 | - ClearPageCgroupUnevictable(pc); | |
337 | - } | |
338 | - | |
330 | + mz = page_cgroup_zoneinfo(pc); | |
339 | 331 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
340 | - list_move(&pc->lru, &mz->lists[lru]); | |
332 | + list_add(&pc->lru, &mz->lists[lru]); | |
341 | 333 | } |
334 | +/* | |
335 | + * To add swapcache into LRU. Be careful to all this function. | |
336 | + * zone->lru_lock shouldn't be held and irq must not be disabled. | |
337 | + */ | |
338 | +static void mem_cgroup_lru_fixup(struct page *page) | |
339 | +{ | |
340 | + if (!isolate_lru_page(page)) | |
341 | + putback_lru_page(page); | |
342 | +} | |
342 | 343 | |
344 | +void mem_cgroup_move_lists(struct page *page, | |
345 | + enum lru_list from, enum lru_list to) | |
346 | +{ | |
347 | + if (mem_cgroup_subsys.disabled) | |
348 | + return; | |
349 | + mem_cgroup_del_lru_list(page, from); | |
350 | + mem_cgroup_add_lru_list(page, to); | |
351 | +} | |
352 | + | |
343 | 353 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
344 | 354 | { |
345 | 355 | int ret; |
... | ... | @@ -351,37 +361,6 @@ |
351 | 361 | } |
352 | 362 | |
353 | 363 | /* |
354 | - * This routine assumes that the appropriate zone's lru lock is already held | |
355 | - */ | |
356 | -void mem_cgroup_move_lists(struct page *page, enum lru_list lru) | |
357 | -{ | |
358 | - struct page_cgroup *pc; | |
359 | - struct mem_cgroup_per_zone *mz; | |
360 | - unsigned long flags; | |
361 | - | |
362 | - if (mem_cgroup_subsys.disabled) | |
363 | - return; | |
364 | - | |
365 | - /* | |
366 | - * We cannot lock_page_cgroup while holding zone's lru_lock, | |
367 | - * because other holders of lock_page_cgroup can be interrupted | |
368 | - * with an attempt to rotate_reclaimable_page. But we cannot | |
369 | - * safely get to page_cgroup without it, so just try_lock it: | |
370 | - * mem_cgroup_isolate_pages allows for page left on wrong list. | |
371 | - */ | |
372 | - pc = lookup_page_cgroup(page); | |
373 | - if (!trylock_page_cgroup(pc)) | |
374 | - return; | |
375 | - if (pc && PageCgroupUsed(pc)) { | |
376 | - mz = page_cgroup_zoneinfo(pc); | |
377 | - spin_lock_irqsave(&mz->lru_lock, flags); | |
378 | - __mem_cgroup_move_lists(pc, lru); | |
379 | - spin_unlock_irqrestore(&mz->lru_lock, flags); | |
380 | - } | |
381 | - unlock_page_cgroup(pc); | |
382 | -} | |
383 | - | |
384 | -/* | |
385 | 364 | * Calculate mapped_ratio under memory controller. This will be used in |
386 | 365 | * vmscan.c for deteremining we have to reclaim mapped pages. |
387 | 366 | */ |
... | ... | @@ -460,40 +439,24 @@ |
460 | 439 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); |
461 | 440 | src = &mz->lists[lru]; |
462 | 441 | |
463 | - spin_lock(&mz->lru_lock); | |
464 | 442 | scan = 0; |
465 | 443 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { |
466 | 444 | if (scan >= nr_to_scan) |
467 | 445 | break; |
446 | + | |
447 | + page = pc->page; | |
468 | 448 | if (unlikely(!PageCgroupUsed(pc))) |
469 | 449 | continue; |
470 | - page = pc->page; | |
471 | - | |
472 | 450 | if (unlikely(!PageLRU(page))) |
473 | 451 | continue; |
474 | 452 | |
475 | - /* | |
476 | - * TODO: play better with lumpy reclaim, grabbing anything. | |
477 | - */ | |
478 | - if (PageUnevictable(page) || | |
479 | - (PageActive(page) && !active) || | |
480 | - (!PageActive(page) && active)) { | |
481 | - __mem_cgroup_move_lists(pc, page_lru(page)); | |
482 | - continue; | |
483 | - } | |
484 | - | |
485 | 453 | scan++; |
486 | - list_move(&pc->lru, &pc_list); | |
487 | - | |
488 | 454 | if (__isolate_lru_page(page, mode, file) == 0) { |
489 | 455 | list_move(&page->lru, dst); |
490 | 456 | nr_taken++; |
491 | 457 | } |
492 | 458 | } |
493 | 459 | |
494 | - list_splice(&pc_list, src); | |
495 | - spin_unlock(&mz->lru_lock); | |
496 | - | |
497 | 460 | *scanned = scan; |
498 | 461 | return nr_taken; |
499 | 462 | } |
... | ... | @@ -608,9 +571,6 @@ |
608 | 571 | struct page_cgroup *pc, |
609 | 572 | enum charge_type ctype) |
610 | 573 | { |
611 | - struct mem_cgroup_per_zone *mz; | |
612 | - unsigned long flags; | |
613 | - | |
614 | 574 | /* try_charge() can return NULL to *memcg, taking care of it. */ |
615 | 575 | if (!mem) |
616 | 576 | return; |
617 | 577 | |
618 | 578 | |
... | ... | @@ -625,17 +585,11 @@ |
625 | 585 | return; |
626 | 586 | } |
627 | 587 | pc->mem_cgroup = mem; |
628 | - /* | |
629 | - * If a page is accounted as a page cache, insert to inactive list. | |
630 | - * If anon, insert to active list. | |
631 | - */ | |
588 | + smp_wmb(); | |
632 | 589 | pc->flags = pcg_default_flags[ctype]; |
633 | 590 | |
634 | - mz = page_cgroup_zoneinfo(pc); | |
591 | + mem_cgroup_charge_statistics(mem, pc, true); | |
635 | 592 | |
636 | - spin_lock_irqsave(&mz->lru_lock, flags); | |
637 | - __mem_cgroup_add_list(mz, pc, true); | |
638 | - spin_unlock_irqrestore(&mz->lru_lock, flags); | |
639 | 593 | unlock_page_cgroup(pc); |
640 | 594 | } |
641 | 595 | |
... | ... | @@ -646,8 +600,7 @@ |
646 | 600 | * @to: mem_cgroup which the page is moved to. @from != @to. |
647 | 601 | * |
648 | 602 | * The caller must confirm following. |
649 | - * 1. disable irq. | |
650 | - * 2. lru_lock of old mem_cgroup(@from) should be held. | |
603 | + * - page is not on LRU (isolate_page() is useful.) | |
651 | 604 | * |
652 | 605 | * returns 0 at success, |
653 | 606 | * returns -EBUSY when lock is busy or "pc" is unstable. |
654 | 607 | |
655 | 608 | |
... | ... | @@ -663,15 +616,14 @@ |
663 | 616 | int nid, zid; |
664 | 617 | int ret = -EBUSY; |
665 | 618 | |
666 | - VM_BUG_ON(!irqs_disabled()); | |
667 | 619 | VM_BUG_ON(from == to); |
620 | + VM_BUG_ON(PageLRU(pc->page)); | |
668 | 621 | |
669 | 622 | nid = page_cgroup_nid(pc); |
670 | 623 | zid = page_cgroup_zid(pc); |
671 | 624 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); |
672 | 625 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); |
673 | 626 | |
674 | - | |
675 | 627 | if (!trylock_page_cgroup(pc)) |
676 | 628 | return ret; |
677 | 629 | |
... | ... | @@ -681,18 +633,15 @@ |
681 | 633 | if (pc->mem_cgroup != from) |
682 | 634 | goto out; |
683 | 635 | |
684 | - if (spin_trylock(&to_mz->lru_lock)) { | |
685 | - __mem_cgroup_remove_list(from_mz, pc); | |
686 | - css_put(&from->css); | |
687 | - res_counter_uncharge(&from->res, PAGE_SIZE); | |
688 | - if (do_swap_account) | |
689 | - res_counter_uncharge(&from->memsw, PAGE_SIZE); | |
690 | - pc->mem_cgroup = to; | |
691 | - css_get(&to->css); | |
692 | - __mem_cgroup_add_list(to_mz, pc, false); | |
693 | - ret = 0; | |
694 | - spin_unlock(&to_mz->lru_lock); | |
695 | - } | |
636 | + css_put(&from->css); | |
637 | + res_counter_uncharge(&from->res, PAGE_SIZE); | |
638 | + mem_cgroup_charge_statistics(from, pc, false); | |
639 | + if (do_swap_account) | |
640 | + res_counter_uncharge(&from->memsw, PAGE_SIZE); | |
641 | + pc->mem_cgroup = to; | |
642 | + mem_cgroup_charge_statistics(to, pc, true); | |
643 | + css_get(&to->css); | |
644 | + ret = 0; | |
696 | 645 | out: |
697 | 646 | unlock_page_cgroup(pc); |
698 | 647 | return ret; |
... | ... | @@ -706,39 +655,47 @@ |
706 | 655 | struct mem_cgroup *child, |
707 | 656 | gfp_t gfp_mask) |
708 | 657 | { |
658 | + struct page *page = pc->page; | |
709 | 659 | struct cgroup *cg = child->css.cgroup; |
710 | 660 | struct cgroup *pcg = cg->parent; |
711 | 661 | struct mem_cgroup *parent; |
712 | - struct mem_cgroup_per_zone *mz; | |
713 | - unsigned long flags; | |
714 | 662 | int ret; |
715 | 663 | |
716 | 664 | /* Is ROOT ? */ |
717 | 665 | if (!pcg) |
718 | 666 | return -EINVAL; |
719 | 667 | |
668 | + | |
720 | 669 | parent = mem_cgroup_from_cont(pcg); |
721 | 670 | |
671 | + | |
722 | 672 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); |
723 | 673 | if (ret) |
724 | 674 | return ret; |
725 | 675 | |
726 | - mz = mem_cgroup_zoneinfo(child, | |
727 | - page_cgroup_nid(pc), page_cgroup_zid(pc)); | |
676 | + if (!get_page_unless_zero(page)) | |
677 | + return -EBUSY; | |
728 | 678 | |
729 | - spin_lock_irqsave(&mz->lru_lock, flags); | |
679 | + ret = isolate_lru_page(page); | |
680 | + | |
681 | + if (ret) | |
682 | + goto cancel; | |
683 | + | |
730 | 684 | ret = mem_cgroup_move_account(pc, child, parent); |
731 | - spin_unlock_irqrestore(&mz->lru_lock, flags); | |
732 | 685 | |
733 | - /* drop extra refcnt */ | |
686 | + /* drop extra refcnt by try_charge() (move_account increment one) */ | |
734 | 687 | css_put(&parent->css); |
735 | - /* uncharge if move fails */ | |
736 | - if (ret) { | |
737 | - res_counter_uncharge(&parent->res, PAGE_SIZE); | |
738 | - if (do_swap_account) | |
739 | - res_counter_uncharge(&parent->memsw, PAGE_SIZE); | |
688 | + putback_lru_page(page); | |
689 | + if (!ret) { | |
690 | + put_page(page); | |
691 | + return 0; | |
740 | 692 | } |
741 | - | |
693 | + /* uncharge if move fails */ | |
694 | +cancel: | |
695 | + res_counter_uncharge(&parent->res, PAGE_SIZE); | |
696 | + if (do_swap_account) | |
697 | + res_counter_uncharge(&parent->memsw, PAGE_SIZE); | |
698 | + put_page(page); | |
742 | 699 | return ret; |
743 | 700 | } |
744 | 701 | |
... | ... | @@ -912,6 +869,8 @@ |
912 | 869 | } |
913 | 870 | if (!locked) |
914 | 871 | unlock_page(page); |
872 | + /* add this page(page_cgroup) to the LRU we want. */ | |
873 | + mem_cgroup_lru_fixup(page); | |
915 | 874 | |
916 | 875 | return ret; |
917 | 876 | } |
... | ... | @@ -944,6 +903,8 @@ |
944 | 903 | } |
945 | 904 | |
946 | 905 | } |
906 | + /* add this page(page_cgroup) to the LRU we want. */ | |
907 | + mem_cgroup_lru_fixup(page); | |
947 | 908 | } |
948 | 909 | |
949 | 910 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) |
... | ... | @@ -968,7 +929,6 @@ |
968 | 929 | struct page_cgroup *pc; |
969 | 930 | struct mem_cgroup *mem = NULL; |
970 | 931 | struct mem_cgroup_per_zone *mz; |
971 | - unsigned long flags; | |
972 | 932 | |
973 | 933 | if (mem_cgroup_subsys.disabled) |
974 | 934 | return NULL; |
975 | 935 | |
... | ... | @@ -1010,12 +970,10 @@ |
1010 | 970 | if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) |
1011 | 971 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); |
1012 | 972 | |
973 | + mem_cgroup_charge_statistics(mem, pc, false); | |
1013 | 974 | ClearPageCgroupUsed(pc); |
1014 | 975 | |
1015 | 976 | mz = page_cgroup_zoneinfo(pc); |
1016 | - spin_lock_irqsave(&mz->lru_lock, flags); | |
1017 | - __mem_cgroup_remove_list(mz, pc); | |
1018 | - spin_unlock_irqrestore(&mz->lru_lock, flags); | |
1019 | 977 | unlock_page_cgroup(pc); |
1020 | 978 | |
1021 | 979 | css_put(&mem->css); |
... | ... | @@ -1281,21 +1239,22 @@ |
1281 | 1239 | return ret; |
1282 | 1240 | } |
1283 | 1241 | |
1284 | - | |
1285 | 1242 | /* |
1286 | 1243 | * This routine traverse page_cgroup in given list and drop them all. |
1287 | 1244 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
1288 | 1245 | */ |
1289 | 1246 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, |
1290 | - struct mem_cgroup_per_zone *mz, | |
1291 | - enum lru_list lru) | |
1247 | + int node, int zid, enum lru_list lru) | |
1292 | 1248 | { |
1249 | + struct zone *zone; | |
1250 | + struct mem_cgroup_per_zone *mz; | |
1293 | 1251 | struct page_cgroup *pc, *busy; |
1294 | - unsigned long flags; | |
1295 | - unsigned long loop; | |
1252 | + unsigned long flags, loop; | |
1296 | 1253 | struct list_head *list; |
1297 | 1254 | int ret = 0; |
1298 | 1255 | |
1256 | + zone = &NODE_DATA(node)->node_zones[zid]; | |
1257 | + mz = mem_cgroup_zoneinfo(mem, node, zid); | |
1299 | 1258 | list = &mz->lists[lru]; |
1300 | 1259 | |
1301 | 1260 | loop = MEM_CGROUP_ZSTAT(mz, lru); |
1302 | 1261 | |
1303 | 1262 | |
1304 | 1263 | |
... | ... | @@ -1304,19 +1263,19 @@ |
1304 | 1263 | busy = NULL; |
1305 | 1264 | while (loop--) { |
1306 | 1265 | ret = 0; |
1307 | - spin_lock_irqsave(&mz->lru_lock, flags); | |
1266 | + spin_lock_irqsave(&zone->lru_lock, flags); | |
1308 | 1267 | if (list_empty(list)) { |
1309 | - spin_unlock_irqrestore(&mz->lru_lock, flags); | |
1268 | + spin_unlock_irqrestore(&zone->lru_lock, flags); | |
1310 | 1269 | break; |
1311 | 1270 | } |
1312 | 1271 | pc = list_entry(list->prev, struct page_cgroup, lru); |
1313 | 1272 | if (busy == pc) { |
1314 | 1273 | list_move(&pc->lru, list); |
1315 | 1274 | busy = 0; |
1316 | - spin_unlock_irqrestore(&mz->lru_lock, flags); | |
1275 | + spin_unlock_irqrestore(&zone->lru_lock, flags); | |
1317 | 1276 | continue; |
1318 | 1277 | } |
1319 | - spin_unlock_irqrestore(&mz->lru_lock, flags); | |
1278 | + spin_unlock_irqrestore(&zone->lru_lock, flags); | |
1320 | 1279 | |
1321 | 1280 | ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE); |
1322 | 1281 | if (ret == -ENOMEM) |
... | ... | @@ -1329,6 +1288,7 @@ |
1329 | 1288 | } else |
1330 | 1289 | busy = NULL; |
1331 | 1290 | } |
1291 | + | |
1332 | 1292 | if (!ret && !list_empty(list)) |
1333 | 1293 | return -EBUSY; |
1334 | 1294 | return ret; |
1335 | 1295 | |
1336 | 1296 | |
... | ... | @@ -1364,12 +1324,10 @@ |
1364 | 1324 | ret = 0; |
1365 | 1325 | for_each_node_state(node, N_POSSIBLE) { |
1366 | 1326 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
1367 | - struct mem_cgroup_per_zone *mz; | |
1368 | 1327 | enum lru_list l; |
1369 | - mz = mem_cgroup_zoneinfo(mem, node, zid); | |
1370 | 1328 | for_each_lru(l) { |
1371 | 1329 | ret = mem_cgroup_force_empty_list(mem, |
1372 | - mz, l); | |
1330 | + node, zid, l); | |
1373 | 1331 | if (ret) |
1374 | 1332 | break; |
1375 | 1333 | } |
... | ... | @@ -1413,6 +1371,7 @@ |
1413 | 1371 | } |
1414 | 1372 | |
1415 | 1373 | } |
1374 | + lru_add_drain(); | |
1416 | 1375 | /* try move_account...there may be some *locked* pages. */ |
1417 | 1376 | if (mem->res.usage) |
1418 | 1377 | goto move_account; |
... | ... | @@ -1657,7 +1616,6 @@ |
1657 | 1616 | |
1658 | 1617 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
1659 | 1618 | mz = &pn->zoneinfo[zone]; |
1660 | - spin_lock_init(&mz->lru_lock); | |
1661 | 1619 | for_each_lru(l) |
1662 | 1620 | INIT_LIST_HEAD(&mz->lists[l]); |
1663 | 1621 | } |
1664 | 1622 | |
... | ... | @@ -1706,8 +1664,15 @@ |
1706 | 1664 | |
1707 | 1665 | static void mem_cgroup_free(struct mem_cgroup *mem) |
1708 | 1666 | { |
1667 | + int node; | |
1668 | + | |
1709 | 1669 | if (atomic_read(&mem->refcnt) > 0) |
1710 | 1670 | return; |
1671 | + | |
1672 | + | |
1673 | + for_each_node_state(node, N_POSSIBLE) | |
1674 | + free_mem_cgroup_per_zone_info(mem, node); | |
1675 | + | |
1711 | 1676 | if (mem_cgroup_size() < PAGE_SIZE) |
1712 | 1677 | kfree(mem); |
1713 | 1678 | else |
... | ... | @@ -1780,12 +1745,6 @@ |
1780 | 1745 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, |
1781 | 1746 | struct cgroup *cont) |
1782 | 1747 | { |
1783 | - int node; | |
1784 | - struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | |
1785 | - | |
1786 | - for_each_node_state(node, N_POSSIBLE) | |
1787 | - free_mem_cgroup_per_zone_info(mem, node); | |
1788 | - | |
1789 | 1748 | mem_cgroup_free(mem_cgroup_from_cont(cont)); |
1790 | 1749 | } |
1791 | 1750 |
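[Editor's note] With mz->lru_lock gone, the account-moving path gets its exclusion from page isolation instead: mem_cgroup_move_parent() now takes a page reference, isolates the page from the LRU, moves the charge with no LRU lock held (mem_cgroup_move_account() insists the page is off-LRU), and puts the page back, which re-links it on the destination cgroup's LRU. A condensed runnable sketch of that control flow, with userspace stand-ins for the kernel helpers and the refcounting and error paths simplified:

	#include <stdbool.h>
	#include <stdio.h>

	struct page { bool on_lru; int owner; };

	static int isolate_lru_page(struct page *p)	/* 0 on success, as in the kernel */
	{
		if (!p->on_lru)
			return -1;	/* not on LRU: can't isolate */
		p->on_lru = false;	/* ClearPageLRU + mem_cgroup_del_lru */
		return 0;
	}

	static void putback_lru_page(struct page *p)
	{
		p->on_lru = true;	/* re-adds to both global and memcg LRU */
	}

	static int mem_cgroup_move_account(struct page *p, int from, int to)
	{
		if (p->on_lru)
			return -1;	/* the patch adds VM_BUG_ON(PageLRU(pc->page)) */
		if (p->owner != from)
			return -1;	/* pc->mem_cgroup changed under us */
		p->owner = to;		/* safe: no LRU walk can reach the page now */
		return 0;
	}

	static int move_parent(struct page *p, int child, int parent)
	{
		int ret = isolate_lru_page(p);

		if (ret)
			return ret;	/* couldn't take the page off the LRU */
		ret = mem_cgroup_move_account(p, child, parent);
		putback_lru_page(p);	/* lands on the right memcg LRU either way */
		return ret;
	}

	int main(void)
	{
		struct page p = { .on_lru = true, .owner = 1 };

		printf("move: %d, owner now %d\n", move_parent(&p, 1, 0), p.owner);
		return 0;
	}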
mm/page_cgroup.c
mm/swap.c
mm/vmscan.c
... | ... | @@ -512,7 +512,6 @@ |
512 | 512 | lru = LRU_UNEVICTABLE; |
513 | 513 | add_page_to_unevictable_list(page); |
514 | 514 | } |
515 | - mem_cgroup_move_lists(page, lru); | |
516 | 515 | |
517 | 516 | /* |
518 | 517 | * page's status can change while we move it among lru. If an evictable |
... | ... | @@ -547,7 +546,6 @@ |
547 | 546 | |
548 | 547 | lru = !!TestClearPageActive(page) + page_is_file_cache(page); |
549 | 548 | lru_cache_add_lru(page, lru); |
550 | - mem_cgroup_move_lists(page, lru); | |
551 | 549 | put_page(page); |
552 | 550 | } |
553 | 551 | #endif /* CONFIG_UNEVICTABLE_LRU */ |
... | ... | @@ -813,6 +811,7 @@ |
813 | 811 | return ret; |
814 | 812 | |
815 | 813 | ret = -EBUSY; |
814 | + | |
816 | 815 | if (likely(get_page_unless_zero(page))) { |
817 | 816 | /* |
818 | 817 | * Be careful not to clear PageLRU until after we're |
... | ... | @@ -821,6 +820,7 @@ |
821 | 820 | */ |
822 | 821 | ClearPageLRU(page); |
823 | 822 | ret = 0; |
823 | + mem_cgroup_del_lru(page); | |
824 | 824 | } |
825 | 825 | |
826 | 826 | return ret; |
... | ... | @@ -1134,7 +1134,6 @@ |
1134 | 1134 | SetPageLRU(page); |
1135 | 1135 | lru = page_lru(page); |
1136 | 1136 | add_page_to_lru_list(zone, page, lru); |
1137 | - mem_cgroup_move_lists(page, lru); | |
1138 | 1137 | if (PageActive(page) && scan_global_lru(sc)) { |
1139 | 1138 | int file = !!page_is_file_cache(page); |
1140 | 1139 | zone->recent_rotated[file]++; |
... | ... | @@ -1263,7 +1262,7 @@ |
1263 | 1262 | ClearPageActive(page); |
1264 | 1263 | |
1265 | 1264 | list_move(&page->lru, &zone->lru[lru].list); |
1266 | - mem_cgroup_move_lists(page, lru); | |
1265 | + mem_cgroup_add_lru_list(page, lru); | |
1267 | 1266 | pgmoved++; |
1268 | 1267 | if (!pagevec_add(&pvec, page)) { |
1269 | 1268 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); |
... | ... | @@ -2408,6 +2407,7 @@ |
2408 | 2407 | |
2409 | 2408 | __dec_zone_state(zone, NR_UNEVICTABLE); |
2410 | 2409 | list_move(&page->lru, &zone->lru[l].list); |
2410 | + mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); | |
2411 | 2411 | __inc_zone_state(zone, NR_INACTIVE_ANON + l); |
2412 | 2412 | __count_vm_event(UNEVICTABLE_PGRESCUED); |
2413 | 2413 | } else { |
... | ... | @@ -2416,6 +2416,7 @@ |
2416 | 2416 | */ |
2417 | 2417 | SetPageUnevictable(page); |
2418 | 2418 | list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); |
2419 | + mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); | |
2419 | 2420 | if (page_evictable(page, NULL)) |
2420 | 2421 | goto retry; |
2421 | 2422 | } |