Commit 66e1707bc34609f626e2e7b4fe7e454c9748bad5
Committed by Linus Torvalds
1 parent 67e465a77b
Exists in master and in 20 other branches
Memory controller: add per cgroup LRU and reclaim
Add the page_cgroup to the per cgroup LRU. The reclaim algorithm has been modified to make isolate_lru_pages() a pluggable component. The scan_control data structure now accepts the cgroup on behalf of which reclaims are carried out. try_to_free_pages() has been extended to become cgroup aware.

[akpm@linux-foundation.org: fix warning]
[Lee.Schermerhorn@hp.com: initialize all scan_control's isolate_pages member]
[bunk@kernel.org: make do_try_to_free_pages() static]
[hugh@veritas.com: memcgroup: fix try_to_free order]
[kamezawa.hiroyu@jp.fujitsu.com: this unlock_page_cgroup() is unnecessary]

Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 6 changed files with 286 additions and 30 deletions
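The central mechanism of this commit is that scan_control now carries both the target mem_cgroup and an isolate_pages callback, so the same reclaim core can pull pages either from a zone's global LRU (mem_cgroup == NULL, isolate_pages_global) or from a cgroup's private LRU (mem_cgroup_isolate_pages). Below is a minimal userspace sketch of that pattern; the names mirror the diff, but the types are simplified stand-ins rather than the real kernel API.

```c
#include <stdio.h>

/* Simplified stand-ins for the kernel types used in the diff. */
struct zone { const char *name; };
struct mem_cgroup { const char *name; };

/*
 * Mirrors the new scan_control fields: which cgroup to reclaim from
 * (NULL means global reclaim) and which isolator the core should call.
 */
struct scan_control {
	struct mem_cgroup *mem_cgroup;
	unsigned long (*isolate_pages)(unsigned long nr, struct zone *z,
				       struct mem_cgroup *mem, int active);
};

/* Global reclaim path: scan the zone's own active/inactive lists. */
static unsigned long isolate_pages_global(unsigned long nr, struct zone *z,
					  struct mem_cgroup *mem, int active)
{
	(void)mem;
	printf("global: %lu pages from %s %s list\n",
	       nr, z->name, active ? "active" : "inactive");
	return nr;
}

/* Cgroup reclaim path: scan the cgroup's private LRU instead. */
static unsigned long mem_cgroup_isolate_pages(unsigned long nr, struct zone *z,
					      struct mem_cgroup *mem, int active)
{
	printf("cgroup %s: %lu pages (zone %s, %s list)\n",
	       mem->name, nr, z->name, active ? "active" : "inactive");
	return nr;
}

/* The shared reclaim core only ever goes through sc->isolate_pages(). */
static void shrink_zone(struct scan_control *sc, struct zone *z)
{
	sc->isolate_pages(32, z, sc->mem_cgroup, 0);
}

int main(void)
{
	struct zone z = { "Normal" };
	struct mem_cgroup grp = { "container-A" };

	struct scan_control global = {
		.mem_cgroup = NULL,
		.isolate_pages = isolate_pages_global,
	};
	struct scan_control cgroup = {
		.mem_cgroup = &grp,
		.isolate_pages = mem_cgroup_isolate_pages,
	};

	shrink_zone(&global, &z);
	shrink_zone(&cgroup, &z);
	return 0;
}
```

This is why do_try_to_free_pages() can stay a single worker in the diff below: the global and cgroup entry points only differ in how they fill in scan_control.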
include/linux/memcontrol.h
... | ... | @@ -32,6 +32,13 @@ |
32 | 32 | extern struct page_cgroup *page_get_page_cgroup(struct page *page); |
33 | 33 | extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm); |
34 | 34 | extern void mem_cgroup_uncharge(struct page_cgroup *pc); |
35 | +extern void mem_cgroup_move_lists(struct page_cgroup *pc, bool active); | |
36 | +extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |
37 | + struct list_head *dst, | |
38 | + unsigned long *scanned, int order, | |
39 | + int mode, struct zone *z, | |
40 | + struct mem_cgroup *mem_cont, | |
41 | + int active); | |
35 | 42 | |
36 | 43 | static inline void mem_cgroup_uncharge_page(struct page *page) |
37 | 44 | { |
... | ... | @@ -68,6 +75,11 @@ |
68 | 75 | } |
69 | 76 | |
70 | 77 | static inline void mem_cgroup_uncharge_page(struct page *page) |
78 | +{ | |
79 | +} | |
80 | + | |
81 | +static inline void mem_cgroup_move_lists(struct page_cgroup *pc, | |
82 | + bool active) | |
71 | 83 | { |
72 | 84 | } |
73 | 85 |
include/linux/res_counter.h
... | ... | @@ -99,5 +99,28 @@ |
99 | 99 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); |
100 | 100 | void res_counter_uncharge(struct res_counter *counter, unsigned long val); |
101 | 101 | |
102 | +static inline bool res_counter_limit_check_locked(struct res_counter *cnt) | |
103 | +{ | |
104 | + if (cnt->usage < cnt->limit) | |
105 | + return true; | |
106 | + | |
107 | + return false; | |
108 | +} | |
109 | + | |
110 | +/* | |
111 | + * Helper function to detect if the cgroup is within it's limit or | |
112 | + * not. It's currently called from cgroup_rss_prepare() | |
113 | + */ | |
114 | +static inline bool res_counter_check_under_limit(struct res_counter *cnt) | |
115 | +{ | |
116 | + bool ret; | |
117 | + unsigned long flags; | |
118 | + | |
119 | + spin_lock_irqsave(&cnt->lock, flags); | |
120 | + ret = res_counter_limit_check_locked(cnt); | |
121 | + spin_unlock_irqrestore(&cnt->lock, flags); | |
122 | + return ret; | |
123 | +} | |
124 | + | |
102 | 125 | #endif |
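The res_counter.h hunk adds a lock-free check for callers that already hold cnt->lock, plus a wrapper that takes the irq-safe spinlock itself before checking. A rough userspace model of that locked/unlocked split follows, with a pthread mutex standing in for the kernel spinlock; the names mirror the diff but this is an illustration, not the kernel implementation.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Userspace stand-in for struct res_counter: usage/limit guarded by a lock. */
struct res_counter {
	unsigned long usage;
	unsigned long limit;
	pthread_mutex_t lock;	/* plays the role of the kernel spinlock */
};

/* Caller must already hold cnt->lock, like res_counter_limit_check_locked(). */
static bool res_counter_limit_check_locked(struct res_counter *cnt)
{
	return cnt->usage < cnt->limit;
}

/* Self-locking wrapper, like res_counter_check_under_limit(). */
static bool res_counter_check_under_limit(struct res_counter *cnt)
{
	bool ret;

	pthread_mutex_lock(&cnt->lock);
	ret = res_counter_limit_check_locked(cnt);
	pthread_mutex_unlock(&cnt->lock);
	return ret;
}

int main(void)
{
	struct res_counter rc = {
		.usage = 90, .limit = 100,
		.lock = PTHREAD_MUTEX_INITIALIZER,
	};

	printf("under limit: %s\n",
	       res_counter_check_under_limit(&rc) ? "yes" : "no");
	rc.usage = 100;
	printf("under limit: %s\n",
	       res_counter_check_under_limit(&rc) ? "yes" : "no");
	return 0;
}
```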
include/linux/swap.h
... | ... | @@ -5,6 +5,7 @@ |
5 | 5 | #include <linux/linkage.h> |
6 | 6 | #include <linux/mmzone.h> |
7 | 7 | #include <linux/list.h> |
8 | +#include <linux/memcontrol.h> | |
8 | 9 | #include <linux/sched.h> |
9 | 10 | |
10 | 11 | #include <asm/atomic.h> |
... | ... | @@ -182,6 +183,8 @@ |
182 | 183 | /* linux/mm/vmscan.c */ |
183 | 184 | extern unsigned long try_to_free_pages(struct zone **zones, int order, |
184 | 185 | gfp_t gfp_mask); |
186 | +extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem); | |
187 | +extern int __isolate_lru_page(struct page *page, int mode); | |
185 | 188 | extern unsigned long shrink_all_memory(unsigned long nr_pages); |
186 | 189 | extern int vm_swappiness; |
187 | 190 | extern int remove_mapping(struct address_space *mapping, struct page *page); |
mm/memcontrol.c
... | ... | @@ -22,10 +22,15 @@ |
22 | 22 | #include <linux/cgroup.h> |
23 | 23 | #include <linux/mm.h> |
24 | 24 | #include <linux/page-flags.h> |
25 | +#include <linux/backing-dev.h> | |
25 | 26 | #include <linux/bit_spinlock.h> |
26 | 27 | #include <linux/rcupdate.h> |
28 | +#include <linux/swap.h> | |
29 | +#include <linux/spinlock.h> | |
30 | +#include <linux/fs.h> | |
27 | 31 | |
28 | 32 | struct cgroup_subsys mem_cgroup_subsys; |
33 | +static const int MEM_CGROUP_RECLAIM_RETRIES = 5; | |
29 | 34 | |
30 | 35 | /* |
31 | 36 | * The memory controller data structure. The memory controller controls both |
... | ... | @@ -51,6 +56,10 @@ |
51 | 56 | */ |
52 | 57 | struct list_head active_list; |
53 | 58 | struct list_head inactive_list; |
59 | + /* | |
60 | + * spin_lock to protect the per cgroup LRU | |
61 | + */ | |
62 | + spinlock_t lru_lock; | |
54 | 63 | }; |
55 | 64 | |
56 | 65 | /* |
57 | 66 | |
... | ... | @@ -141,7 +150,95 @@ |
141 | 150 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); |
142 | 151 | } |
143 | 152 | |
153 | +void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) | |
154 | +{ | |
155 | + if (active) | |
156 | + list_move(&pc->lru, &pc->mem_cgroup->active_list); | |
157 | + else | |
158 | + list_move(&pc->lru, &pc->mem_cgroup->inactive_list); | |
159 | +} | |
160 | + | |
144 | 161 | /* |
162 | + * This routine assumes that the appropriate zone's lru lock is already held | |
163 | + */ | |
164 | +void mem_cgroup_move_lists(struct page_cgroup *pc, bool active) | |
165 | +{ | |
166 | + struct mem_cgroup *mem; | |
167 | + if (!pc) | |
168 | + return; | |
169 | + | |
170 | + mem = pc->mem_cgroup; | |
171 | + | |
172 | + spin_lock(&mem->lru_lock); | |
173 | + __mem_cgroup_move_lists(pc, active); | |
174 | + spin_unlock(&mem->lru_lock); | |
175 | +} | |
176 | + | |
177 | +unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |
178 | + struct list_head *dst, | |
179 | + unsigned long *scanned, int order, | |
180 | + int mode, struct zone *z, | |
181 | + struct mem_cgroup *mem_cont, | |
182 | + int active) | |
183 | +{ | |
184 | + unsigned long nr_taken = 0; | |
185 | + struct page *page; | |
186 | + unsigned long scan; | |
187 | + LIST_HEAD(pc_list); | |
188 | + struct list_head *src; | |
189 | + struct page_cgroup *pc; | |
190 | + | |
191 | + if (active) | |
192 | + src = &mem_cont->active_list; | |
193 | + else | |
194 | + src = &mem_cont->inactive_list; | |
195 | + | |
196 | + spin_lock(&mem_cont->lru_lock); | |
197 | + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { | |
198 | + pc = list_entry(src->prev, struct page_cgroup, lru); | |
199 | + page = pc->page; | |
200 | + VM_BUG_ON(!pc); | |
201 | + | |
202 | + if (PageActive(page) && !active) { | |
203 | + __mem_cgroup_move_lists(pc, true); | |
204 | + scan--; | |
205 | + continue; | |
206 | + } | |
207 | + if (!PageActive(page) && active) { | |
208 | + __mem_cgroup_move_lists(pc, false); | |
209 | + scan--; | |
210 | + continue; | |
211 | + } | |
212 | + | |
213 | + /* | |
214 | + * Reclaim, per zone | |
215 | + * TODO: make the active/inactive lists per zone | |
216 | + */ | |
217 | + if (page_zone(page) != z) | |
218 | + continue; | |
219 | + | |
220 | + /* | |
221 | + * Check if the meta page went away from under us | |
222 | + */ | |
223 | + if (!list_empty(&pc->lru)) | |
224 | + list_move(&pc->lru, &pc_list); | |
225 | + else | |
226 | + continue; | |
227 | + | |
228 | + if (__isolate_lru_page(page, mode) == 0) { | |
229 | + list_move(&page->lru, dst); | |
230 | + nr_taken++; | |
231 | + } | |
232 | + } | |
233 | + | |
234 | + list_splice(&pc_list, src); | |
235 | + spin_unlock(&mem_cont->lru_lock); | |
236 | + | |
237 | + *scanned = scan; | |
238 | + return nr_taken; | |
239 | +} | |
240 | + | |
241 | +/* | |
145 | 242 | * Charge the memory controller for page usage. |
146 | 243 | * Return |
147 | 244 | * 0 if the charge was successful |
... | ... | @@ -151,6 +248,8 @@ |
151 | 248 | { |
152 | 249 | struct mem_cgroup *mem; |
153 | 250 | struct page_cgroup *pc, *race_pc; |
251 | + unsigned long flags; | |
252 | + unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | |
154 | 253 | |
155 | 254 | /* |
156 | 255 | * Should page_cgroup's go to their own slab? |
157 | 256 | |
... | ... | @@ -159,14 +258,20 @@ |
159 | 258 | * to see if the cgroup page already has a page_cgroup associated |
160 | 259 | * with it |
161 | 260 | */ |
261 | +retry: | |
162 | 262 | lock_page_cgroup(page); |
163 | 263 | pc = page_get_page_cgroup(page); |
164 | 264 | /* |
165 | 265 | * The page_cgroup exists and the page has already been accounted |
166 | 266 | */ |
167 | 267 | if (pc) { |
168 | - atomic_inc(&pc->ref_cnt); | |
169 | - goto done; | |
268 | + if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) { | |
269 | + /* this page is under being uncharged ? */ | |
270 | + unlock_page_cgroup(page); | |
271 | + cpu_relax(); | |
272 | + goto retry; | |
273 | + } else | |
274 | + goto done; | |
170 | 275 | } |
171 | 276 | |
172 | 277 | unlock_page_cgroup(page); |
... | ... | @@ -197,7 +302,32 @@ |
197 | 302 | * If we created the page_cgroup, we should free it on exceeding |
198 | 303 | * the cgroup limit. |
199 | 304 | */ |
200 | - if (res_counter_charge(&mem->res, 1)) { | |
305 | + while (res_counter_charge(&mem->res, 1)) { | |
306 | + if (try_to_free_mem_cgroup_pages(mem)) | |
307 | + continue; | |
308 | + | |
309 | + /* | |
310 | + * try_to_free_mem_cgroup_pages() might not give us a full | |
311 | + * picture of reclaim. Some pages are reclaimed and might be | |
312 | + * moved to swap cache or just unmapped from the cgroup. | |
313 | + * Check the limit again to see if the reclaim reduced the | |
314 | + * current usage of the cgroup before giving up | |
315 | + */ | |
316 | + if (res_counter_check_under_limit(&mem->res)) | |
317 | + continue; | |
318 | + /* | |
319 | + * Since we control both RSS and cache, we end up with a | |
320 | + * very interesting scenario where we end up reclaiming | |
321 | + * memory (essentially RSS), since the memory is pushed | |
322 | + * to swap cache, we eventually end up adding those | |
323 | + * pages back to our list. Hence we give ourselves a | |
324 | + * few chances before we fail | |
325 | + */ | |
326 | + else if (nr_retries--) { | |
327 | + congestion_wait(WRITE, HZ/10); | |
328 | + continue; | |
329 | + } | |
330 | + | |
201 | 331 | css_put(&mem->css); |
202 | 332 | goto free_pc; |
203 | 333 | } |
204 | 334 | |
205 | 335 | |
... | ... | @@ -221,14 +351,16 @@ |
221 | 351 | pc->page = page; |
222 | 352 | page_assign_page_cgroup(page, pc); |
223 | 353 | |
354 | + spin_lock_irqsave(&mem->lru_lock, flags); | |
355 | + list_add(&pc->lru, &mem->active_list); | |
356 | + spin_unlock_irqrestore(&mem->lru_lock, flags); | |
357 | + | |
224 | 358 | done: |
225 | 359 | unlock_page_cgroup(page); |
226 | 360 | return 0; |
227 | 361 | free_pc: |
228 | 362 | kfree(pc); |
229 | - return -ENOMEM; | |
230 | 363 | err: |
231 | - unlock_page_cgroup(page); | |
232 | 364 | return -ENOMEM; |
233 | 365 | } |
234 | 366 | |
... | ... | @@ -240,6 +372,7 @@ |
240 | 372 | { |
241 | 373 | struct mem_cgroup *mem; |
242 | 374 | struct page *page; |
375 | + unsigned long flags; | |
243 | 376 | |
244 | 377 | if (!pc) |
245 | 378 | return; |
... | ... | @@ -252,6 +385,10 @@ |
252 | 385 | page_assign_page_cgroup(page, NULL); |
253 | 386 | unlock_page_cgroup(page); |
254 | 387 | res_counter_uncharge(&mem->res, 1); |
388 | + | |
389 | + spin_lock_irqsave(&mem->lru_lock, flags); | |
390 | + list_del_init(&pc->lru); | |
391 | + spin_unlock_irqrestore(&mem->lru_lock, flags); | |
255 | 392 | kfree(pc); |
256 | 393 | } |
257 | 394 | } |
... | ... | @@ -310,6 +447,7 @@ |
310 | 447 | res_counter_init(&mem->res); |
311 | 448 | INIT_LIST_HEAD(&mem->active_list); |
312 | 449 | INIT_LIST_HEAD(&mem->inactive_list); |
450 | + spin_lock_init(&mem->lru_lock); | |
313 | 451 | return &mem->css; |
314 | 452 | } |
315 | 453 |
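The charge path above loops while res_counter_charge() fails: it reclaims from the cgroup, re-checks the limit (reclaimed pages may only have moved to swap cache, lowering usage without try_to_free_mem_cgroup_pages() reporting success), and allows itself MEM_CGROUP_RECLAIM_RETRIES attempts with a short congestion wait before giving up. A condensed, compilable model of that retry policy is sketched below; the stub charge/reclaim functions are placeholders for the kernel ones.

```c
#include <stdbool.h>
#include <stdio.h>

#define MEM_CGROUP_RECLAIM_RETRIES 5

static unsigned long usage, limit = 4;

/* Stand-in for res_counter_charge(): 0 on success, nonzero if over limit. */
static int res_counter_charge(void)
{
	if (usage + 1 > limit)
		return 1;
	usage++;
	return 0;
}

/* Stand-in for try_to_free_mem_cgroup_pages(): pretend reclaim fails here. */
static bool try_to_free_mem_cgroup_pages(void)
{
	return false;
}

static bool res_counter_check_under_limit(void)
{
	return usage < limit;
}

/* Mirrors the retry policy in mem_cgroup_charge() above. */
static int mem_cgroup_charge(void)
{
	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

	while (res_counter_charge()) {
		if (try_to_free_mem_cgroup_pages())
			continue;	/* reclaim made progress, retry */
		if (res_counter_check_under_limit())
			continue;	/* pages freed behind our back, retry */
		if (nr_retries--) {
			/* the kernel calls congestion_wait(WRITE, HZ/10) here */
			continue;
		}
		return -1;		/* -ENOMEM in the kernel */
	}
	return 0;
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		printf("charge %d -> %d (usage %lu/%lu)\n",
		       i, mem_cgroup_charge(), usage, limit);
	return 0;
}
```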
mm/swap.c
... | ... | @@ -29,6 +29,7 @@ |
29 | 29 | #include <linux/cpu.h> |
30 | 30 | #include <linux/notifier.h> |
31 | 31 | #include <linux/backing-dev.h> |
32 | +#include <linux/memcontrol.h> | |
32 | 33 | |
33 | 34 | /* How many pages do we try to swap or page in/out together? */ |
34 | 35 | int page_cluster; |
... | ... | @@ -175,6 +176,7 @@ |
175 | 176 | SetPageActive(page); |
176 | 177 | add_page_to_active_list(zone, page); |
177 | 178 | __count_vm_event(PGACTIVATE); |
179 | + mem_cgroup_move_lists(page_get_page_cgroup(page), true); | |
178 | 180 | } |
179 | 181 | spin_unlock_irq(&zone->lru_lock); |
180 | 182 | } |
mm/vmscan.c
... | ... | @@ -37,6 +37,7 @@ |
37 | 37 | #include <linux/delay.h> |
38 | 38 | #include <linux/kthread.h> |
39 | 39 | #include <linux/freezer.h> |
40 | +#include <linux/memcontrol.h> | |
40 | 41 | |
41 | 42 | #include <asm/tlbflush.h> |
42 | 43 | #include <asm/div64.h> |
... | ... | @@ -68,6 +69,15 @@ |
68 | 69 | int all_unreclaimable; |
69 | 70 | |
70 | 71 | int order; |
72 | + | |
73 | + /* Which cgroup do we reclaim from */ | |
74 | + struct mem_cgroup *mem_cgroup; | |
75 | + | |
76 | + /* Pluggable isolate pages callback */ | |
77 | + unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, | |
78 | + unsigned long *scanned, int order, int mode, | |
79 | + struct zone *z, struct mem_cgroup *mem_cont, | |
80 | + int active); | |
71 | 81 | }; |
72 | 82 | |
73 | 83 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
... | ... | @@ -626,7 +636,7 @@ |
626 | 636 | * |
627 | 637 | * returns 0 on success, -ve errno on failure. |
628 | 638 | */ |
629 | -static int __isolate_lru_page(struct page *page, int mode) | |
639 | +int __isolate_lru_page(struct page *page, int mode) | |
630 | 640 | { |
631 | 641 | int ret = -EINVAL; |
632 | 642 | |
... | ... | @@ -760,6 +770,21 @@ |
760 | 770 | return nr_taken; |
761 | 771 | } |
762 | 772 | |
773 | +static unsigned long isolate_pages_global(unsigned long nr, | |
774 | + struct list_head *dst, | |
775 | + unsigned long *scanned, int order, | |
776 | + int mode, struct zone *z, | |
777 | + struct mem_cgroup *mem_cont, | |
778 | + int active) | |
779 | +{ | |
780 | + if (active) | |
781 | + return isolate_lru_pages(nr, &z->active_list, dst, | |
782 | + scanned, order, mode); | |
783 | + else | |
784 | + return isolate_lru_pages(nr, &z->inactive_list, dst, | |
785 | + scanned, order, mode); | |
786 | +} | |
787 | + | |
763 | 788 | /* |
764 | 789 | * clear_active_flags() is a helper for shrink_active_list(), clearing |
765 | 790 | * any active bits from the pages in the list. |
766 | 791 | |
... | ... | @@ -801,11 +826,11 @@ |
801 | 826 | unsigned long nr_freed; |
802 | 827 | unsigned long nr_active; |
803 | 828 | |
804 | - nr_taken = isolate_lru_pages(sc->swap_cluster_max, | |
805 | - &zone->inactive_list, | |
829 | + nr_taken = sc->isolate_pages(sc->swap_cluster_max, | |
806 | 830 | &page_list, &nr_scan, sc->order, |
807 | 831 | (sc->order > PAGE_ALLOC_COSTLY_ORDER)? |
808 | - ISOLATE_BOTH : ISOLATE_INACTIVE); | |
832 | + ISOLATE_BOTH : ISOLATE_INACTIVE, | |
833 | + zone, sc->mem_cgroup, 0); | |
809 | 834 | nr_active = clear_active_flags(&page_list); |
810 | 835 | __count_vm_events(PGDEACTIVATE, nr_active); |
811 | 836 | |
... | ... | @@ -1018,8 +1043,9 @@ |
1018 | 1043 | |
1019 | 1044 | lru_add_drain(); |
1020 | 1045 | spin_lock_irq(&zone->lru_lock); |
1021 | - pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, | |
1022 | - &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE); | |
1046 | + pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, | |
1047 | + ISOLATE_ACTIVE, zone, | |
1048 | + sc->mem_cgroup, 1); | |
1023 | 1049 | zone->pages_scanned += pgscanned; |
1024 | 1050 | __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); |
1025 | 1051 | spin_unlock_irq(&zone->lru_lock); |
... | ... | @@ -1051,6 +1077,7 @@ |
1051 | 1077 | ClearPageActive(page); |
1052 | 1078 | |
1053 | 1079 | list_move(&page->lru, &zone->inactive_list); |
1080 | + mem_cgroup_move_lists(page_get_page_cgroup(page), false); | |
1054 | 1081 | pgmoved++; |
1055 | 1082 | if (!pagevec_add(&pvec, page)) { |
1056 | 1083 | __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); |
... | ... | @@ -1079,6 +1106,7 @@ |
1079 | 1106 | SetPageLRU(page); |
1080 | 1107 | VM_BUG_ON(!PageActive(page)); |
1081 | 1108 | list_move(&page->lru, &zone->active_list); |
1109 | + mem_cgroup_move_lists(page_get_page_cgroup(page), true); | |
1082 | 1110 | pgmoved++; |
1083 | 1111 | if (!pagevec_add(&pvec, page)) { |
1084 | 1112 | __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); |
... | ... | @@ -1206,7 +1234,8 @@ |
1206 | 1234 | * holds filesystem locks which prevent writeout this might not work, and the |
1207 | 1235 | * allocation attempt will fail. |
1208 | 1236 | */ |
1209 | -unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) | |
1237 | +static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, | |
1238 | + struct scan_control *sc) | |
1210 | 1239 | { |
1211 | 1240 | int priority; |
1212 | 1241 | int ret = 0; |
... | ... | @@ -1215,14 +1244,6 @@ |
1215 | 1244 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1216 | 1245 | unsigned long lru_pages = 0; |
1217 | 1246 | int i; |
1218 | - struct scan_control sc = { | |
1219 | - .gfp_mask = gfp_mask, | |
1220 | - .may_writepage = !laptop_mode, | |
1221 | - .swap_cluster_max = SWAP_CLUSTER_MAX, | |
1222 | - .may_swap = 1, | |
1223 | - .swappiness = vm_swappiness, | |
1224 | - .order = order, | |
1225 | - }; | |
1226 | 1247 | |
1227 | 1248 | count_vm_event(ALLOCSTALL); |
1228 | 1249 | |
1229 | 1250 | |
1230 | 1251 | |
... | ... | @@ -1237,17 +1258,22 @@ |
1237 | 1258 | } |
1238 | 1259 | |
1239 | 1260 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
1240 | - sc.nr_scanned = 0; | |
1261 | + sc->nr_scanned = 0; | |
1241 | 1262 | if (!priority) |
1242 | 1263 | disable_swap_token(); |
1243 | - nr_reclaimed += shrink_zones(priority, zones, &sc); | |
1244 | - shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); | |
1264 | + nr_reclaimed += shrink_zones(priority, zones, sc); | |
1265 | + /* | |
1266 | + * Don't shrink slabs when reclaiming memory from | |
1267 | + * over limit cgroups | |
1268 | + */ | |
1269 | + if (sc->mem_cgroup == NULL) | |
1270 | + shrink_slab(sc->nr_scanned, gfp_mask, lru_pages); | |
1245 | 1271 | if (reclaim_state) { |
1246 | 1272 | nr_reclaimed += reclaim_state->reclaimed_slab; |
1247 | 1273 | reclaim_state->reclaimed_slab = 0; |
1248 | 1274 | } |
1249 | - total_scanned += sc.nr_scanned; | |
1250 | - if (nr_reclaimed >= sc.swap_cluster_max) { | |
1275 | + total_scanned += sc->nr_scanned; | |
1276 | + if (nr_reclaimed >= sc->swap_cluster_max) { | |
1251 | 1277 | ret = 1; |
1252 | 1278 | goto out; |
1253 | 1279 | } |
1254 | 1280 | |
1255 | 1281 | |
1256 | 1282 | |
... | ... | @@ -1259,18 +1285,18 @@ |
1259 | 1285 | * that's undesirable in laptop mode, where we *want* lumpy |
1260 | 1286 | * writeout. So in laptop mode, write out the whole world. |
1261 | 1287 | */ |
1262 | - if (total_scanned > sc.swap_cluster_max + | |
1263 | - sc.swap_cluster_max / 2) { | |
1288 | + if (total_scanned > sc->swap_cluster_max + | |
1289 | + sc->swap_cluster_max / 2) { | |
1264 | 1290 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); |
1265 | - sc.may_writepage = 1; | |
1291 | + sc->may_writepage = 1; | |
1266 | 1292 | } |
1267 | 1293 | |
1268 | 1294 | /* Take a nap, wait for some writeback to complete */ |
1269 | - if (sc.nr_scanned && priority < DEF_PRIORITY - 2) | |
1295 | + if (sc->nr_scanned && priority < DEF_PRIORITY - 2) | |
1270 | 1296 | congestion_wait(WRITE, HZ/10); |
1271 | 1297 | } |
1272 | 1298 | /* top priority shrink_caches still had more to do? don't OOM, then */ |
1273 | - if (!sc.all_unreclaimable) | |
1299 | + if (!sc->all_unreclaimable && sc->mem_cgroup == NULL) | |
1274 | 1300 | ret = 1; |
1275 | 1301 | out: |
1276 | 1302 | /* |
... | ... | @@ -1293,6 +1319,54 @@ |
1293 | 1319 | return ret; |
1294 | 1320 | } |
1295 | 1321 | |
1322 | +unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) | |
1323 | +{ | |
1324 | + struct scan_control sc = { | |
1325 | + .gfp_mask = gfp_mask, | |
1326 | + .may_writepage = !laptop_mode, | |
1327 | + .swap_cluster_max = SWAP_CLUSTER_MAX, | |
1328 | + .may_swap = 1, | |
1329 | + .swappiness = vm_swappiness, | |
1330 | + .order = order, | |
1331 | + .mem_cgroup = NULL, | |
1332 | + .isolate_pages = isolate_pages_global, | |
1333 | + }; | |
1334 | + | |
1335 | + return do_try_to_free_pages(zones, gfp_mask, &sc); | |
1336 | +} | |
1337 | + | |
1338 | +#ifdef CONFIG_CGROUP_MEM_CONT | |
1339 | + | |
1340 | +#ifdef CONFIG_HIGHMEM | |
1341 | +#define ZONE_USERPAGES ZONE_HIGHMEM | |
1342 | +#else | |
1343 | +#define ZONE_USERPAGES ZONE_NORMAL | |
1344 | +#endif | |
1345 | + | |
1346 | +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont) | |
1347 | +{ | |
1348 | + struct scan_control sc = { | |
1349 | + .gfp_mask = GFP_KERNEL, | |
1350 | + .may_writepage = !laptop_mode, | |
1351 | + .may_swap = 1, | |
1352 | + .swap_cluster_max = SWAP_CLUSTER_MAX, | |
1353 | + .swappiness = vm_swappiness, | |
1354 | + .order = 0, | |
1355 | + .mem_cgroup = mem_cont, | |
1356 | + .isolate_pages = mem_cgroup_isolate_pages, | |
1357 | + }; | |
1358 | + int node; | |
1359 | + struct zone **zones; | |
1360 | + | |
1361 | + for_each_online_node(node) { | |
1362 | + zones = NODE_DATA(node)->node_zonelists[ZONE_USERPAGES].zones; | |
1363 | + if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) | |
1364 | + return 1; | |
1365 | + } | |
1366 | + return 0; | |
1367 | +} | |
1368 | +#endif | |
1369 | + | |
1296 | 1370 | /* |
1297 | 1371 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1298 | 1372 | * they are all at pages_high. |
... | ... | @@ -1328,6 +1402,8 @@ |
1328 | 1402 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
1329 | 1403 | .swappiness = vm_swappiness, |
1330 | 1404 | .order = order, |
1405 | + .mem_cgroup = NULL, | |
1406 | + .isolate_pages = isolate_pages_global, | |
1331 | 1407 | }; |
1332 | 1408 | /* |
1333 | 1409 | * temp_priority is used to remember the scanning priority at which |
... | ... | @@ -1649,6 +1725,7 @@ |
1649 | 1725 | .swap_cluster_max = nr_pages, |
1650 | 1726 | .may_writepage = 1, |
1651 | 1727 | .swappiness = vm_swappiness, |
1728 | + .isolate_pages = isolate_pages_global, | |
1652 | 1729 | }; |
1653 | 1730 | |
1654 | 1731 | current->reclaim_state = &reclaim_state; |
... | ... | @@ -1834,6 +1911,7 @@ |
1834 | 1911 | SWAP_CLUSTER_MAX), |
1835 | 1912 | .gfp_mask = gfp_mask, |
1836 | 1913 | .swappiness = vm_swappiness, |
1914 | + .isolate_pages = isolate_pages_global, | |
1837 | 1915 | }; |
1838 | 1916 | unsigned long slab_reclaimable; |
1839 | 1917 |