Commit 08e552c69c6930d64722de3ec18c51844d06ee28

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent 8c7c6e34a1

memcg: synchronized LRU

A big patch for changing memcg's LRU semantics.

Now,
  - page_cgroup is linked to its mem_cgroup's own LRU (per zone).

  - the page_cgroup LRU is not synchronized with the global LRU.

  - page and page_cgroup map one-to-one, and page_cgroup is statically allocated.

  - To find which LRU a page_cgroup is on, you have to check pc->mem_cgroup, as in
    - lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
      (see the sketch after this list)

  - SwapCache is handled.
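
A minimal sketch of the per-zone lookup mentioned above, condensed from the
memcontrol.c hunks below; the nid_of_pc/zid_of_pc in the bullet are derived
inside the helper, so treat the exact body as illustrative, not verbatim:

        static struct mem_cgroup_per_zone *
        page_cgroup_zoneinfo(struct page_cgroup *pc)
        {
                struct mem_cgroup *mem = pc->mem_cgroup;   /* owning memcg */
                int nid = page_cgroup_nid(pc);
                int zid = page_cgroup_zid(pc);

                /* mz->lists[lru] is the per-zone LRU this page_cgroup lives on */
                return mem_cgroup_zoneinfo(mem, nid, zid);
        }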

Currently, when we handle the LRU list of a page_cgroup, we do the following:

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc); .....................(1)
	mz = page_cgroup_zoneinfo(pc);
	spin_lock(&mz->lru_lock);
	.....add to LRU
	spin_unlock(&mz->lru_lock);
	unlock_page_cgroup(pc);

But (1) is a spin lock, and we have to worry about deadlock against zone->lru_lock.
So trylock() is used at (1) for now. Without (1), we can't trust that "mz" is correct.

This patch is an attempt to remove this ugly nesting of locks.
It replaces mz->lru_lock with zone->lru_lock.
Then the sequence above can be written as:

        spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
	mem_cgroup_add/remove/etc_lru() {
		pc = lookup_page_cgroup(page);
		mz = page_cgroup_zoneinfo(pc);
		if (PageCgroupUsed(pc)) {
			....add to LRU
		}
	}
        spin_unlock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU

This is much simpler.
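
Concretely, the hooks live in the global LRU helpers (see the mm_inline.h hunks
below), so both LRUs are updated in one place while the caller already holds
zone->lru_lock. A condensed view of that pattern:

        static inline void
        add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
        {
                list_add(&page->lru, &zone->lru[l].list);   /* global LRU */
                __inc_zone_state(zone, NR_LRU_BASE + l);
                mem_cgroup_add_lru_list(page, l);           /* memcg LRU mirrors it */
        }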
(*) We're safe even if we don't take lock_page_cgroup(pc), because:
    1. pc->mem_cgroup can only be modified
       - at charge, and
       - at account_move().
    2. At charge, the PCG_USED bit is not set before pc->mem_cgroup is fixed
       (see the barrier sketch after this list).
    3. At account_move(), the page is isolated and not on any LRU.
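
The ordering in (2) is what the smp_wmb()/smp_rmb() pair in the memcontrol.c
hunks below enforces; condensed (not verbatim), it looks like:

        /* charge side (commit_charge) */
        pc->mem_cgroup = mem;
        smp_wmb();                              /* publish mem_cgroup before USED */
        pc->flags = pcg_default_flags[ctype];   /* sets PCG_USED */

        /* LRU side (mem_cgroup_add_lru_list etc.) */
        pc = lookup_page_cgroup(page);
        smp_rmb();                              /* pairs with the smp_wmb() above */
        if (!PageCgroupUsed(pc))
                return;                         /* not charged yet */
        mz = page_cgroup_zoneinfo(pc);          /* pc->mem_cgroup is stable now */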

Pros.
  - easier to maintain.
  - memcg can make use of the laziness of pagevec.
  - we don't have to duplicate the LRU/Active/Unevictable bits in page_cgroup.
  - the memcg LRU state stays synchronized with the global LRU.
  - the number of locks is reduced.
  - account_move() is simplified very much (see the sketch below).
Cons.
  - may increase the cost of LRU rotation.
    (no impact if memcg is not configured.)
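
With the lock nesting gone, moving accounting between groups only requires that
the page be off the LRU. A condensed view of the new calling convention, taken
from the mem_cgroup_move_parent() hunk below (uncharge/css error handling trimmed):

        if (!get_page_unless_zero(page))        /* hold the page across the move */
                return -EBUSY;
        ret = isolate_lru_page(page);           /* move_account requires !PageLRU */
        if (!ret)
                ret = mem_cgroup_move_account(pc, child, parent);
        putback_lru_page(page);                 /* re-added under zone->lru_lock */
        put_page(page);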

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 8 changed files with 178 additions and 206 deletions

... ... @@ -21,6 +21,7 @@
21 21 #include <linux/file.h>
22 22 #include <linux/pagemap.h>
23 23 #include <linux/splice.h>
  24 +#include <linux/memcontrol.h>
24 25 #include <linux/mm_inline.h>
25 26 #include <linux/swap.h>
26 27 #include <linux/writeback.h>
include/linux/memcontrol.h
... ... @@ -40,7 +40,12 @@
40 40  
41 41 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
42 42 gfp_t gfp_mask);
43   -extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru);
  43 +extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru);
  44 +extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru);
  45 +extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
  46 +extern void mem_cgroup_del_lru(struct page *page);
  47 +extern void mem_cgroup_move_lists(struct page *page,
  48 + enum lru_list from, enum lru_list to);
44 49 extern void mem_cgroup_uncharge_page(struct page *page);
45 50 extern void mem_cgroup_uncharge_cache_page(struct page *page);
46 51 extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask);
... ... @@ -131,7 +136,27 @@
131 136 return 0;
132 137 }
133 138  
134   -static inline void mem_cgroup_move_lists(struct page *page, bool active)
  139 +static inline void mem_cgroup_add_lru_list(struct page *page, int lru)
  140 +{
  141 +}
  142 +
  143 +static inline void mem_cgroup_del_lru_list(struct page *page, int lru)
  144 +{
  145 + return ;
  146 +}
  147 +
  148 +static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru)
  149 +{
  150 + return ;
  151 +}
  152 +
  153 +static inline void mem_cgroup_del_lru(struct page *page)
  154 +{
  155 + return ;
  156 +}
  157 +
  158 +static inline void
  159 +mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to)
135 160 {
136 161 }
137 162  
include/linux/mm_inline.h
... ... @@ -28,6 +28,7 @@
28 28 {
29 29 list_add(&page->lru, &zone->lru[l].list);
30 30 __inc_zone_state(zone, NR_LRU_BASE + l);
  31 + mem_cgroup_add_lru_list(page, l);
31 32 }
32 33  
33 34 static inline void
... ... @@ -35,6 +36,7 @@
35 36 {
36 37 list_del(&page->lru);
37 38 __dec_zone_state(zone, NR_LRU_BASE + l);
  39 + mem_cgroup_del_lru_list(page, l);
38 40 }
39 41  
40 42 static inline void
... ... @@ -54,6 +56,7 @@
54 56 l += page_is_file_cache(page);
55 57 }
56 58 __dec_zone_state(zone, NR_LRU_BASE + l);
  59 + mem_cgroup_del_lru_list(page, l);
57 60 }
58 61  
59 62 /**
include/linux/page_cgroup.h
... ... @@ -26,10 +26,6 @@
26 26 PCG_LOCK, /* page cgroup is locked */
27 27 PCG_CACHE, /* charged as cache */
28 28 PCG_USED, /* this object is in use. */
29   - /* flags for LRU placement */
30   - PCG_ACTIVE, /* page is active in this cgroup */
31   - PCG_FILE, /* page is file system backed */
32   - PCG_UNEVICTABLE, /* page is unevictableable */
33 29 };
34 30  
35 31 #define TESTPCGFLAG(uname, lname) \
... ... @@ -49,19 +45,6 @@
49 45  
50 46 TESTPCGFLAG(Used, USED)
51 47 CLEARPCGFLAG(Used, USED)
52   -
53   -/* LRU management flags (from global-lru definition) */
54   -TESTPCGFLAG(File, FILE)
55   -SETPCGFLAG(File, FILE)
56   -CLEARPCGFLAG(File, FILE)
57   -
58   -TESTPCGFLAG(Active, ACTIVE)
59   -SETPCGFLAG(Active, ACTIVE)
60   -CLEARPCGFLAG(Active, ACTIVE)
61   -
62   -TESTPCGFLAG(Unevictable, UNEVICTABLE)
63   -SETPCGFLAG(Unevictable, UNEVICTABLE)
64   -CLEARPCGFLAG(Unevictable, UNEVICTABLE)
65 48  
66 49 static inline int page_cgroup_nid(struct page_cgroup *pc)
67 50 {
... ... @@ -36,6 +36,7 @@
36 36 #include <linux/vmalloc.h>
37 37 #include <linux/mm_inline.h>
38 38 #include <linux/page_cgroup.h>
  39 +#include "internal.h"
39 40  
40 41 #include <asm/uaccess.h>
41 42  
... ... @@ -100,7 +101,6 @@
100 101 /*
101 102 * spin_lock to protect the per cgroup LRU
102 103 */
103   - spinlock_t lru_lock;
104 104 struct list_head lists[NR_LRU_LISTS];
105 105 unsigned long count[NR_LRU_LISTS];
106 106 };
107 107  
108 108  
... ... @@ -163,14 +163,12 @@
163 163 /* only for here (for easy reading.) */
164 164 #define PCGF_CACHE (1UL << PCG_CACHE)
165 165 #define PCGF_USED (1UL << PCG_USED)
166   -#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
167 166 #define PCGF_LOCK (1UL << PCG_LOCK)
168   -#define PCGF_FILE (1UL << PCG_FILE)
169 167 static const unsigned long
170 168 pcg_default_flags[NR_CHARGE_TYPE] = {
171   - PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
172   - PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
173   - PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
  169 + PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
  170 + PCGF_USED | PCGF_LOCK, /* Anon */
  171 + PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
174 172 0, /* FORCE */
175 173 };
176 174  
... ... @@ -185,9 +183,6 @@
185 183 static void mem_cgroup_get(struct mem_cgroup *mem);
186 184 static void mem_cgroup_put(struct mem_cgroup *mem);
187 185  
188   -/*
189   - * Always modified under lru lock. Then, not necessary to preempt_disable()
190   - */
191 186 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
192 187 struct page_cgroup *pc,
193 188 bool charge)
194 189  
... ... @@ -195,10 +190,9 @@
195 190 int val = (charge)? 1 : -1;
196 191 struct mem_cgroup_stat *stat = &mem->stat;
197 192 struct mem_cgroup_stat_cpu *cpustat;
  193 + int cpu = get_cpu();
198 194  
199   - VM_BUG_ON(!irqs_disabled());
200   -
201   - cpustat = &stat->cpustat[smp_processor_id()];
  195 + cpustat = &stat->cpustat[cpu];
202 196 if (PageCgroupCache(pc))
203 197 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
204 198 else
... ... @@ -210,6 +204,7 @@
210 204 else
211 205 __mem_cgroup_stat_add_safe(cpustat,
212 206 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
  207 + put_cpu();
213 208 }
214 209  
215 210 static struct mem_cgroup_per_zone *
... ... @@ -264,82 +259,97 @@
264 259 struct mem_cgroup, css);
265 260 }
266 261  
267   -static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
268   - struct page_cgroup *pc)
  262 +/*
  263 + * Following LRU functions are allowed to be used without PCG_LOCK.
  264 + * Operations are called by routine of global LRU independently from memcg.
  265 + * What we have to take care of here is validness of pc->mem_cgroup.
  266 + *
  267 + * Changes to pc->mem_cgroup happens when
  268 + * 1. charge
  269 + * 2. moving account
  270 + * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
  271 + * It is added to LRU before charge.
  272 + * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
  273 + * When moving account, the page is not on LRU. It's isolated.
  274 + */
  275 +
  276 +void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
269 277 {
270   - int lru = LRU_BASE;
  278 + struct page_cgroup *pc;
  279 + struct mem_cgroup *mem;
  280 + struct mem_cgroup_per_zone *mz;
271 281  
272   - if (PageCgroupUnevictable(pc))
273   - lru = LRU_UNEVICTABLE;
274   - else {
275   - if (PageCgroupActive(pc))
276   - lru += LRU_ACTIVE;
277   - if (PageCgroupFile(pc))
278   - lru += LRU_FILE;
279   - }
280   -
  282 + if (mem_cgroup_subsys.disabled)
  283 + return;
  284 + pc = lookup_page_cgroup(page);
  285 + /* can happen while we handle swapcache. */
  286 + if (list_empty(&pc->lru))
  287 + return;
  288 + mz = page_cgroup_zoneinfo(pc);
  289 + mem = pc->mem_cgroup;
281 290 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
  291 + list_del_init(&pc->lru);
  292 + return;
  293 +}
282 294  
283   - mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
284   - list_del(&pc->lru);
  295 +void mem_cgroup_del_lru(struct page *page)
  296 +{
  297 + mem_cgroup_del_lru_list(page, page_lru(page));
285 298 }
286 299  
287   -static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
288   - struct page_cgroup *pc, bool hot)
  300 +void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
289 301 {
290   - int lru = LRU_BASE;
  302 + struct mem_cgroup_per_zone *mz;
  303 + struct page_cgroup *pc;
291 304  
292   - if (PageCgroupUnevictable(pc))
293   - lru = LRU_UNEVICTABLE;
294   - else {
295   - if (PageCgroupActive(pc))
296   - lru += LRU_ACTIVE;
297   - if (PageCgroupFile(pc))
298   - lru += LRU_FILE;
299   - }
  305 + if (mem_cgroup_subsys.disabled)
  306 + return;
300 307  
301   - MEM_CGROUP_ZSTAT(mz, lru) += 1;
302   - if (hot)
303   - list_add(&pc->lru, &mz->lists[lru]);
304   - else
305   - list_add_tail(&pc->lru, &mz->lists[lru]);
306   -
307   - mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
  308 + pc = lookup_page_cgroup(page);
  309 + smp_rmb();
  310 + /* unused page is not rotated. */
  311 + if (!PageCgroupUsed(pc))
  312 + return;
  313 + mz = page_cgroup_zoneinfo(pc);
  314 + list_move(&pc->lru, &mz->lists[lru]);
308 315 }
309 316  
310   -static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
  317 +void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
311 318 {
312   - struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
313   - int active = PageCgroupActive(pc);
314   - int file = PageCgroupFile(pc);
315   - int unevictable = PageCgroupUnevictable(pc);
316   - enum lru_list from = unevictable ? LRU_UNEVICTABLE :
317   - (LRU_FILE * !!file + !!active);
  319 + struct page_cgroup *pc;
  320 + struct mem_cgroup_per_zone *mz;
318 321  
319   - if (lru == from)
  322 + if (mem_cgroup_subsys.disabled)
320 323 return;
  324 + pc = lookup_page_cgroup(page);
  325 + /* barrier to sync with "charge" */
  326 + smp_rmb();
  327 + if (!PageCgroupUsed(pc))
  328 + return;
321 329  
322   - MEM_CGROUP_ZSTAT(mz, from) -= 1;
323   - /*
324   - * However this is done under mz->lru_lock, another flags, which
325   - * are not related to LRU, will be modified from out-of-lock.
326   - * We have to use atomic set/clear flags.
327   - */
328   - if (is_unevictable_lru(lru)) {
329   - ClearPageCgroupActive(pc);
330   - SetPageCgroupUnevictable(pc);
331   - } else {
332   - if (is_active_lru(lru))
333   - SetPageCgroupActive(pc);
334   - else
335   - ClearPageCgroupActive(pc);
336   - ClearPageCgroupUnevictable(pc);
337   - }
338   -
  330 + mz = page_cgroup_zoneinfo(pc);
339 331 MEM_CGROUP_ZSTAT(mz, lru) += 1;
340   - list_move(&pc->lru, &mz->lists[lru]);
  332 + list_add(&pc->lru, &mz->lists[lru]);
341 333 }
  334 +/*
  335 + * To add swapcache into LRU. Be careful to all this function.
  336 + * zone->lru_lock shouldn't be held and irq must not be disabled.
  337 + */
  338 +static void mem_cgroup_lru_fixup(struct page *page)
  339 +{
  340 + if (!isolate_lru_page(page))
  341 + putback_lru_page(page);
  342 +}
342 343  
  344 +void mem_cgroup_move_lists(struct page *page,
  345 + enum lru_list from, enum lru_list to)
  346 +{
  347 + if (mem_cgroup_subsys.disabled)
  348 + return;
  349 + mem_cgroup_del_lru_list(page, from);
  350 + mem_cgroup_add_lru_list(page, to);
  351 +}
  352 +
343 353 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
344 354 {
345 355 int ret;
... ... @@ -351,37 +361,6 @@
351 361 }
352 362  
353 363 /*
354   - * This routine assumes that the appropriate zone's lru lock is already held
355   - */
356   -void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
357   -{
358   - struct page_cgroup *pc;
359   - struct mem_cgroup_per_zone *mz;
360   - unsigned long flags;
361   -
362   - if (mem_cgroup_subsys.disabled)
363   - return;
364   -
365   - /*
366   - * We cannot lock_page_cgroup while holding zone's lru_lock,
367   - * because other holders of lock_page_cgroup can be interrupted
368   - * with an attempt to rotate_reclaimable_page. But we cannot
369   - * safely get to page_cgroup without it, so just try_lock it:
370   - * mem_cgroup_isolate_pages allows for page left on wrong list.
371   - */
372   - pc = lookup_page_cgroup(page);
373   - if (!trylock_page_cgroup(pc))
374   - return;
375   - if (pc && PageCgroupUsed(pc)) {
376   - mz = page_cgroup_zoneinfo(pc);
377   - spin_lock_irqsave(&mz->lru_lock, flags);
378   - __mem_cgroup_move_lists(pc, lru);
379   - spin_unlock_irqrestore(&mz->lru_lock, flags);
380   - }
381   - unlock_page_cgroup(pc);
382   -}
383   -
384   -/*
385 364 * Calculate mapped_ratio under memory controller. This will be used in
386 365 * vmscan.c for deteremining we have to reclaim mapped pages.
387 366 */
... ... @@ -460,40 +439,24 @@
460 439 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
461 440 src = &mz->lists[lru];
462 441  
463   - spin_lock(&mz->lru_lock);
464 442 scan = 0;
465 443 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
466 444 if (scan >= nr_to_scan)
467 445 break;
  446 +
  447 + page = pc->page;
468 448 if (unlikely(!PageCgroupUsed(pc)))
469 449 continue;
470   - page = pc->page;
471   -
472 450 if (unlikely(!PageLRU(page)))
473 451 continue;
474 452  
475   - /*
476   - * TODO: play better with lumpy reclaim, grabbing anything.
477   - */
478   - if (PageUnevictable(page) ||
479   - (PageActive(page) && !active) ||
480   - (!PageActive(page) && active)) {
481   - __mem_cgroup_move_lists(pc, page_lru(page));
482   - continue;
483   - }
484   -
485 453 scan++;
486   - list_move(&pc->lru, &pc_list);
487   -
488 454 if (__isolate_lru_page(page, mode, file) == 0) {
489 455 list_move(&page->lru, dst);
490 456 nr_taken++;
491 457 }
492 458 }
493 459  
494   - list_splice(&pc_list, src);
495   - spin_unlock(&mz->lru_lock);
496   -
497 460 *scanned = scan;
498 461 return nr_taken;
499 462 }
... ... @@ -608,9 +571,6 @@
608 571 struct page_cgroup *pc,
609 572 enum charge_type ctype)
610 573 {
611   - struct mem_cgroup_per_zone *mz;
612   - unsigned long flags;
613   -
614 574 /* try_charge() can return NULL to *memcg, taking care of it. */
615 575 if (!mem)
616 576 return;
617 577  
618 578  
... ... @@ -625,17 +585,11 @@
625 585 return;
626 586 }
627 587 pc->mem_cgroup = mem;
628   - /*
629   - * If a page is accounted as a page cache, insert to inactive list.
630   - * If anon, insert to active list.
631   - */
  588 + smp_wmb();
632 589 pc->flags = pcg_default_flags[ctype];
633 590  
634   - mz = page_cgroup_zoneinfo(pc);
  591 + mem_cgroup_charge_statistics(mem, pc, true);
635 592  
636   - spin_lock_irqsave(&mz->lru_lock, flags);
637   - __mem_cgroup_add_list(mz, pc, true);
638   - spin_unlock_irqrestore(&mz->lru_lock, flags);
639 593 unlock_page_cgroup(pc);
640 594 }
641 595  
... ... @@ -646,8 +600,7 @@
646 600 * @to: mem_cgroup which the page is moved to. @from != @to.
647 601 *
648 602 * The caller must confirm following.
649   - * 1. disable irq.
650   - * 2. lru_lock of old mem_cgroup(@from) should be held.
  603 + * - page is not on LRU (isolate_page() is useful.)
651 604 *
652 605 * returns 0 at success,
653 606 * returns -EBUSY when lock is busy or "pc" is unstable.
654 607  
655 608  
... ... @@ -663,15 +616,14 @@
663 616 int nid, zid;
664 617 int ret = -EBUSY;
665 618  
666   - VM_BUG_ON(!irqs_disabled());
667 619 VM_BUG_ON(from == to);
  620 + VM_BUG_ON(PageLRU(pc->page));
668 621  
669 622 nid = page_cgroup_nid(pc);
670 623 zid = page_cgroup_zid(pc);
671 624 from_mz = mem_cgroup_zoneinfo(from, nid, zid);
672 625 to_mz = mem_cgroup_zoneinfo(to, nid, zid);
673 626  
674   -
675 627 if (!trylock_page_cgroup(pc))
676 628 return ret;
677 629  
... ... @@ -681,18 +633,15 @@
681 633 if (pc->mem_cgroup != from)
682 634 goto out;
683 635  
684   - if (spin_trylock(&to_mz->lru_lock)) {
685   - __mem_cgroup_remove_list(from_mz, pc);
686   - css_put(&from->css);
687   - res_counter_uncharge(&from->res, PAGE_SIZE);
688   - if (do_swap_account)
689   - res_counter_uncharge(&from->memsw, PAGE_SIZE);
690   - pc->mem_cgroup = to;
691   - css_get(&to->css);
692   - __mem_cgroup_add_list(to_mz, pc, false);
693   - ret = 0;
694   - spin_unlock(&to_mz->lru_lock);
695   - }
  636 + css_put(&from->css);
  637 + res_counter_uncharge(&from->res, PAGE_SIZE);
  638 + mem_cgroup_charge_statistics(from, pc, false);
  639 + if (do_swap_account)
  640 + res_counter_uncharge(&from->memsw, PAGE_SIZE);
  641 + pc->mem_cgroup = to;
  642 + mem_cgroup_charge_statistics(to, pc, true);
  643 + css_get(&to->css);
  644 + ret = 0;
696 645 out:
697 646 unlock_page_cgroup(pc);
698 647 return ret;
... ... @@ -706,39 +655,47 @@
706 655 struct mem_cgroup *child,
707 656 gfp_t gfp_mask)
708 657 {
  658 + struct page *page = pc->page;
709 659 struct cgroup *cg = child->css.cgroup;
710 660 struct cgroup *pcg = cg->parent;
711 661 struct mem_cgroup *parent;
712   - struct mem_cgroup_per_zone *mz;
713   - unsigned long flags;
714 662 int ret;
715 663  
716 664 /* Is ROOT ? */
717 665 if (!pcg)
718 666 return -EINVAL;
719 667  
  668 +
720 669 parent = mem_cgroup_from_cont(pcg);
721 670  
  671 +
722 672 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
723 673 if (ret)
724 674 return ret;
725 675  
726   - mz = mem_cgroup_zoneinfo(child,
727   - page_cgroup_nid(pc), page_cgroup_zid(pc));
  676 + if (!get_page_unless_zero(page))
  677 + return -EBUSY;
728 678  
729   - spin_lock_irqsave(&mz->lru_lock, flags);
  679 + ret = isolate_lru_page(page);
  680 +
  681 + if (ret)
  682 + goto cancel;
  683 +
730 684 ret = mem_cgroup_move_account(pc, child, parent);
731   - spin_unlock_irqrestore(&mz->lru_lock, flags);
732 685  
733   - /* drop extra refcnt */
  686 + /* drop extra refcnt by try_charge() (move_account increment one) */
734 687 css_put(&parent->css);
735   - /* uncharge if move fails */
736   - if (ret) {
737   - res_counter_uncharge(&parent->res, PAGE_SIZE);
738   - if (do_swap_account)
739   - res_counter_uncharge(&parent->memsw, PAGE_SIZE);
  688 + putback_lru_page(page);
  689 + if (!ret) {
  690 + put_page(page);
  691 + return 0;
740 692 }
741   -
  693 + /* uncharge if move fails */
  694 +cancel:
  695 + res_counter_uncharge(&parent->res, PAGE_SIZE);
  696 + if (do_swap_account)
  697 + res_counter_uncharge(&parent->memsw, PAGE_SIZE);
  698 + put_page(page);
742 699 return ret;
743 700 }
744 701  
... ... @@ -912,6 +869,8 @@
912 869 }
913 870 if (!locked)
914 871 unlock_page(page);
  872 + /* add this page(page_cgroup) to the LRU we want. */
  873 + mem_cgroup_lru_fixup(page);
915 874  
916 875 return ret;
917 876 }
... ... @@ -944,6 +903,8 @@
944 903 }
945 904  
946 905 }
  906 + /* add this page(page_cgroup) to the LRU we want. */
  907 + mem_cgroup_lru_fixup(page);
947 908 }
948 909  
949 910 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
... ... @@ -968,7 +929,6 @@
968 929 struct page_cgroup *pc;
969 930 struct mem_cgroup *mem = NULL;
970 931 struct mem_cgroup_per_zone *mz;
971   - unsigned long flags;
972 932  
973 933 if (mem_cgroup_subsys.disabled)
974 934 return NULL;
975 935  
... ... @@ -1010,12 +970,10 @@
1010 970 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1011 971 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1012 972  
  973 + mem_cgroup_charge_statistics(mem, pc, false);
1013 974 ClearPageCgroupUsed(pc);
1014 975  
1015 976 mz = page_cgroup_zoneinfo(pc);
1016   - spin_lock_irqsave(&mz->lru_lock, flags);
1017   - __mem_cgroup_remove_list(mz, pc);
1018   - spin_unlock_irqrestore(&mz->lru_lock, flags);
1019 977 unlock_page_cgroup(pc);
1020 978  
1021 979 css_put(&mem->css);
... ... @@ -1281,21 +1239,22 @@
1281 1239 return ret;
1282 1240 }
1283 1241  
1284   -
1285 1242 /*
1286 1243 * This routine traverse page_cgroup in given list and drop them all.
1287 1244 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
1288 1245 */
1289 1246 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
1290   - struct mem_cgroup_per_zone *mz,
1291   - enum lru_list lru)
  1247 + int node, int zid, enum lru_list lru)
1292 1248 {
  1249 + struct zone *zone;
  1250 + struct mem_cgroup_per_zone *mz;
1293 1251 struct page_cgroup *pc, *busy;
1294   - unsigned long flags;
1295   - unsigned long loop;
  1252 + unsigned long flags, loop;
1296 1253 struct list_head *list;
1297 1254 int ret = 0;
1298 1255  
  1256 + zone = &NODE_DATA(node)->node_zones[zid];
  1257 + mz = mem_cgroup_zoneinfo(mem, node, zid);
1299 1258 list = &mz->lists[lru];
1300 1259  
1301 1260 loop = MEM_CGROUP_ZSTAT(mz, lru);
... ... @@ -1304,19 +1263,19 @@
1304 1263 busy = NULL;
1305 1264 while (loop--) {
1306 1265 ret = 0;
1307   - spin_lock_irqsave(&mz->lru_lock, flags);
  1266 + spin_lock_irqsave(&zone->lru_lock, flags);
1308 1267 if (list_empty(list)) {
1309   - spin_unlock_irqrestore(&mz->lru_lock, flags);
  1268 + spin_unlock_irqrestore(&zone->lru_lock, flags);
1310 1269 break;
1311 1270 }
1312 1271 pc = list_entry(list->prev, struct page_cgroup, lru);
1313 1272 if (busy == pc) {
1314 1273 list_move(&pc->lru, list);
1315 1274 busy = 0;
1316   - spin_unlock_irqrestore(&mz->lru_lock, flags);
  1275 + spin_unlock_irqrestore(&zone->lru_lock, flags);
1317 1276 continue;
1318 1277 }
1319   - spin_unlock_irqrestore(&mz->lru_lock, flags);
  1278 + spin_unlock_irqrestore(&zone->lru_lock, flags);
1320 1279  
1321 1280 ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
1322 1281 if (ret == -ENOMEM)
... ... @@ -1329,6 +1288,7 @@
1329 1288 } else
1330 1289 busy = NULL;
1331 1290 }
  1291 +
1332 1292 if (!ret && !list_empty(list))
1333 1293 return -EBUSY;
1334 1294 return ret;
1335 1295  
1336 1296  
... ... @@ -1364,12 +1324,10 @@
1364 1324 ret = 0;
1365 1325 for_each_node_state(node, N_POSSIBLE) {
1366 1326 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
1367   - struct mem_cgroup_per_zone *mz;
1368 1327 enum lru_list l;
1369   - mz = mem_cgroup_zoneinfo(mem, node, zid);
1370 1328 for_each_lru(l) {
1371 1329 ret = mem_cgroup_force_empty_list(mem,
1372   - mz, l);
  1330 + node, zid, l);
1373 1331 if (ret)
1374 1332 break;
1375 1333 }
... ... @@ -1413,6 +1371,7 @@
1413 1371 }
1414 1372  
1415 1373 }
  1374 + lru_add_drain();
1416 1375 /* try move_account...there may be some *locked* pages. */
1417 1376 if (mem->res.usage)
1418 1377 goto move_account;
... ... @@ -1657,7 +1616,6 @@
1657 1616  
1658 1617 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1659 1618 mz = &pn->zoneinfo[zone];
1660   - spin_lock_init(&mz->lru_lock);
1661 1619 for_each_lru(l)
1662 1620 INIT_LIST_HEAD(&mz->lists[l]);
1663 1621 }
1664 1622  
... ... @@ -1706,8 +1664,15 @@
1706 1664  
1707 1665 static void mem_cgroup_free(struct mem_cgroup *mem)
1708 1666 {
  1667 + int node;
  1668 +
1709 1669 if (atomic_read(&mem->refcnt) > 0)
1710 1670 return;
  1671 +
  1672 +
  1673 + for_each_node_state(node, N_POSSIBLE)
  1674 + free_mem_cgroup_per_zone_info(mem, node);
  1675 +
1711 1676 if (mem_cgroup_size() < PAGE_SIZE)
1712 1677 kfree(mem);
1713 1678 else
... ... @@ -1780,12 +1745,6 @@
1780 1745 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1781 1746 struct cgroup *cont)
1782 1747 {
1783   - int node;
1784   - struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1785   -
1786   - for_each_node_state(node, N_POSSIBLE)
1787   - free_mem_cgroup_per_zone_info(mem, node);
1788   -
1789 1748 mem_cgroup_free(mem_cgroup_from_cont(cont));
1790 1749 }
1791 1750  
... ... @@ -16,6 +16,7 @@
16 16 pc->flags = 0;
17 17 pc->mem_cgroup = NULL;
18 18 pc->page = pfn_to_page(pfn);
  19 + INIT_LIST_HEAD(&pc->lru);
19 20 }
20 21 static unsigned long total_usage;
21 22  
... ... @@ -168,7 +168,6 @@
168 168 lru += LRU_ACTIVE;
169 169 add_page_to_lru_list(zone, page, lru);
170 170 __count_vm_event(PGACTIVATE);
171   - mem_cgroup_move_lists(page, lru);
172 171  
173 172 zone->recent_rotated[!!file]++;
174 173 zone->recent_scanned[!!file]++;
... ... @@ -512,7 +512,6 @@
512 512 lru = LRU_UNEVICTABLE;
513 513 add_page_to_unevictable_list(page);
514 514 }
515   - mem_cgroup_move_lists(page, lru);
516 515  
517 516 /*
518 517 * page's status can change while we move it among lru. If an evictable
... ... @@ -547,7 +546,6 @@
547 546  
548 547 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
549 548 lru_cache_add_lru(page, lru);
550   - mem_cgroup_move_lists(page, lru);
551 549 put_page(page);
552 550 }
553 551 #endif /* CONFIG_UNEVICTABLE_LRU */
... ... @@ -813,6 +811,7 @@
813 811 return ret;
814 812  
815 813 ret = -EBUSY;
  814 +
816 815 if (likely(get_page_unless_zero(page))) {
817 816 /*
818 817 * Be careful not to clear PageLRU until after we're
... ... @@ -821,6 +820,7 @@
821 820 */
822 821 ClearPageLRU(page);
823 822 ret = 0;
  823 + mem_cgroup_del_lru(page);
824 824 }
825 825  
826 826 return ret;
... ... @@ -1134,7 +1134,6 @@
1134 1134 SetPageLRU(page);
1135 1135 lru = page_lru(page);
1136 1136 add_page_to_lru_list(zone, page, lru);
1137   - mem_cgroup_move_lists(page, lru);
1138 1137 if (PageActive(page) && scan_global_lru(sc)) {
1139 1138 int file = !!page_is_file_cache(page);
1140 1139 zone->recent_rotated[file]++;
... ... @@ -1263,7 +1262,7 @@
1263 1262 ClearPageActive(page);
1264 1263  
1265 1264 list_move(&page->lru, &zone->lru[lru].list);
1266   - mem_cgroup_move_lists(page, lru);
  1265 + mem_cgroup_add_lru_list(page, lru);
1267 1266 pgmoved++;
1268 1267 if (!pagevec_add(&pvec, page)) {
1269 1268 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
... ... @@ -2408,6 +2407,7 @@
2408 2407  
2409 2408 __dec_zone_state(zone, NR_UNEVICTABLE);
2410 2409 list_move(&page->lru, &zone->lru[l].list);
  2410 + mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
2411 2411 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
2412 2412 __count_vm_event(UNEVICTABLE_PGRESCUED);
2413 2413 } else {
... ... @@ -2416,6 +2416,7 @@
2416 2416 */
2417 2417 SetPageUnevictable(page);
2418 2418 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
  2419 + mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
2419 2420 if (page_evictable(page, NULL))
2420 2421 goto retry;
2421 2422 }