memcg: remove clear_page_cgroup and atomics

Remove clear_page_cgroup: it's an unhelpful helper, see for example how mem_cgroup_uncharge_page had to unlock_page_cgroup just in order to call it (serious races from that? I'm not sure). Once that's gone, you can see it's pointless for page_cgroup's ref_cnt to be atomic: it's always manipulated under lock_page_cgroup, except where force_empty unilaterally reset it to 0 (and how does uncharge's atomic_dec_and_test protect against that?). Simplify this page_cgroup locking: if you've got the lock and the pc is attached, then the ref_cnt must be positive: VM_BUG_ONs to check that, and to check that pc->page matches page (we're on the way to finding why sometimes it doesn't, but this patch doesn't fix that). Signed-off-by: Hugh Dickins <hugh@veritas.com> Cc: David Rientjes <rientjes@google.com> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hirokazu Takahashi <taka@valinux.co.jp> Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp> Cc: Paul Menage <menage@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

memcg: remove clear_page_cgroup and atomics
Remove clear_page_cgroup: it's an unhelpful helper, see for example how mem_cgroup_uncharge_page had to unlock_page_cgroup just in order to call it (serious races from that? I'm not sure). Once that's gone, you can see it's pointless for page_cgroup's ref_cnt to be atomic: it's always manipulated under lock_page_cgroup, except where force_empty unilaterally reset it to 0 (and how does uncharge's atomic_dec_and_test protect against that?). Simplify this page_cgroup locking: if you've got the lock and the pc is attached, then the ref_cnt must be positive: VM_BUG_ONs to check that, and to check that pc->page matches page (we're on the way to finding why sometimes it doesn't, but this patch doesn't fix that). Signed-off-by: Hugh Dickins <hugh@veritas.com> Cc: David Rientjes <rientjes@google.com> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hirokazu Takahashi <taka@valinux.co.jp> Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp> Cc: Paul Menage <menage@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Hugh Dickins · Linus Torvalds
1 parent d5b69e38f8
Showing 1 changed file with 43 additions and 63 deletions Side-by-side Diff
mm/memcontrol.c
@@ -161,8 +161,7 @@
 	struct list_head lru;		/* per cgroup LRU list */
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
-	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
-					/* mapped and cached states     */
+	int ref_cnt;			/* cached, mapped, migrating */
 	int flags;
 };
 #define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
@@ -283,27 +282,6 @@
 	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 }
  
-/*
- * Clear page->page_cgroup member under lock_page_cgroup().
- * If given "pc" value is different from one page->page_cgroup,
- * page->cgroup is not cleared.
- * Returns a value of page->page_cgroup at lock taken.
- * A can can detect failure of clearing by following
- *  clear_page_cgroup(page, pc) == pc
- */
-static struct page_cgroup *clear_page_cgroup(struct page *page,
-						struct page_cgroup *pc)
-{
-	struct page_cgroup *ret;
-	/* lock and clear */
-	lock_page_cgroup(page);
-	ret = page_get_page_cgroup(page);
-	if (likely(ret == pc))
-		page_assign_page_cgroup(page, NULL);
-	unlock_page_cgroup(page);
-	return ret;
-}
-
 static void __mem_cgroup_remove_list(struct page_cgroup *pc)
 {
 	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
@@ -555,15 +533,12 @@
 	 * the page has already been accounted.
 	 */
 	if (pc) {
-		if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
-			/* this page is under being uncharged ? */
-			unlock_page_cgroup(page);
-			cpu_relax();
-			goto retry;
-		} else {
-			unlock_page_cgroup(page);
-			goto done;
-		}
+		VM_BUG_ON(pc->page != page);
+		VM_BUG_ON(pc->ref_cnt <= 0);
+
+		pc->ref_cnt++;
+		unlock_page_cgroup(page);
+		goto done;
 	}
 	unlock_page_cgroup(page);
  
@@ -612,7 +587,7 @@
 		congestion_wait(WRITE, HZ/10);
 	}
  
-	atomic_set(&pc->ref_cnt, 1);
+	pc->ref_cnt = 1;
 	pc->mem_cgroup = mem;
 	pc->page = page;
 	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
  
@@ -683,24 +658,24 @@
 	if (!pc)
 		goto unlock;
  
-	if (atomic_dec_and_test(&pc->ref_cnt)) {
-		page = pc->page;
-		mz = page_cgroup_zoneinfo(pc);
-		/*
-		 * get page->cgroup and clear it under lock.
-		 * force_empty can drop page->cgroup without checking refcnt.
-		 */
+	VM_BUG_ON(pc->page != page);
+	VM_BUG_ON(pc->ref_cnt <= 0);
+
+	if (--(pc->ref_cnt) == 0) {
+		page_assign_page_cgroup(page, NULL);
 		unlock_page_cgroup(page);
-		if (clear_page_cgroup(page, pc) == pc) {
-			mem = pc->mem_cgroup;
-			css_put(&mem->css);
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
-			spin_lock_irqsave(&mz->lru_lock, flags);
-			__mem_cgroup_remove_list(pc);
-			spin_unlock_irqrestore(&mz->lru_lock, flags);
-			kfree(pc);
-		}
-		lock_page_cgroup(page);
+
+		mem = pc->mem_cgroup;
+		css_put(&mem->css);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
+
+		mz = page_cgroup_zoneinfo(pc);
+		spin_lock_irqsave(&mz->lru_lock, flags);
+		__mem_cgroup_remove_list(pc);
+		spin_unlock_irqrestore(&mz->lru_lock, flags);
+
+		kfree(pc);
+		return;
 	}
  
 unlock:
  
  
@@ -714,14 +689,13 @@
 int mem_cgroup_prepare_migration(struct page *page)
 {
 	struct page_cgroup *pc;
-	int ret = 0;
  
 	lock_page_cgroup(page);
 	pc = page_get_page_cgroup(page);
-	if (pc && atomic_inc_not_zero(&pc->ref_cnt))
-		ret = 1;
+	if (pc)
+		pc->ref_cnt++;
 	unlock_page_cgroup(page);
-	return ret;
+	return pc != NULL;
 }
  
 void mem_cgroup_end_migration(struct page *page)
  
  
  
  
@@ -740,15 +714,17 @@
 	struct mem_cgroup_per_zone *mz;
 	unsigned long flags;
  
-retry:
+	lock_page_cgroup(page);
 	pc = page_get_page_cgroup(page);
-	if (!pc)
+	if (!pc) {
+		unlock_page_cgroup(page);
 		return;
+	}
  
-	mz = page_cgroup_zoneinfo(pc);
-	if (clear_page_cgroup(page, pc) != pc)
-		goto retry;
+	page_assign_page_cgroup(page, NULL);
+	unlock_page_cgroup(page);
  
+	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_remove_list(pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
  
  
@@ -794,15 +770,19 @@
 	while (--count && !list_empty(list)) {
 		pc = list_entry(list->prev, struct page_cgroup, lru);
 		page = pc->page;
-		/* Avoid race with charge */
-		atomic_set(&pc->ref_cnt, 0);
-		if (clear_page_cgroup(page, pc) == pc) {
+		lock_page_cgroup(page);
+		if (page_get_page_cgroup(page) == pc) {
+			page_assign_page_cgroup(page, NULL);
+			unlock_page_cgroup(page);
 			css_put(&mem->css);
 			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			__mem_cgroup_remove_list(pc);
 			kfree(pc);
-		} else 	/* being uncharged ? ...do relax */
+		} else {
+			/* racing uncharge: let page go then retry */
+			unlock_page_cgroup(page);
 			break;
+		}
 	}
  
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
...	...	@@ -161,8 +161,7 @@
161	161	struct list_head lru; /* per cgroup LRU list */
162	162	struct page *page;
163	163	struct mem_cgroup *mem_cgroup;
164		- atomic_t ref_cnt; /* Helpful when pages move b/w */
165		- /* mapped and cached states */
	164	+ int ref_cnt; /* cached, mapped, migrating */
166	165	int flags;
167	166	};
168	167	#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
...	...	@@ -283,27 +282,6 @@
283	282	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
284	283	}
285	284
286		-/*
287		- * Clear page->page_cgroup member under lock_page_cgroup().
288		- * If given "pc" value is different from one page->page_cgroup,
289		- * page->cgroup is not cleared.
290		- * Returns a value of page->page_cgroup at lock taken.
291		- * A can can detect failure of clearing by following
292		- * clear_page_cgroup(page, pc) == pc
293		- */
294		-static struct page_cgroup clear_page_cgroup(struct page page,
295		- struct page_cgroup *pc)
296		-{
297		- struct page_cgroup *ret;
298		- /* lock and clear */
299		- lock_page_cgroup(page);
300		- ret = page_get_page_cgroup(page);
301		- if (likely(ret == pc))
302		- page_assign_page_cgroup(page, NULL);
303		- unlock_page_cgroup(page);
304		- return ret;
305		-}
306		-
307	285	static void __mem_cgroup_remove_list(struct page_cgroup *pc)
308	286	{
309	287	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
...	...	@@ -555,15 +533,12 @@
555	533	* the page has already been accounted.
556	534	*/
557	535	if (pc) {
558		- if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
559		- /* this page is under being uncharged ? */
560		- unlock_page_cgroup(page);
561		- cpu_relax();
562		- goto retry;
563		- } else {
564		- unlock_page_cgroup(page);
565		- goto done;
566		- }
	536	+ VM_BUG_ON(pc->page != page);
	537	+ VM_BUG_ON(pc->ref_cnt <= 0);
	538	+
	539	+ pc->ref_cnt++;
	540	+ unlock_page_cgroup(page);
	541	+ goto done;
567	542	}
568	543	unlock_page_cgroup(page);
569	544
...	...	@@ -612,7 +587,7 @@
612	587	congestion_wait(WRITE, HZ/10);
613	588	}
614	589
615		- atomic_set(&pc->ref_cnt, 1);
	590	+ pc->ref_cnt = 1;
616	591	pc->mem_cgroup = mem;
617	592	pc->page = page;
618	593	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
619	594
...	...	@@ -683,24 +658,24 @@
683	658	if (!pc)
684	659	goto unlock;
685	660
686		- if (atomic_dec_and_test(&pc->ref_cnt)) {
687		- page = pc->page;
688		- mz = page_cgroup_zoneinfo(pc);
689		- /*
690		- * get page->cgroup and clear it under lock.
691		- * force_empty can drop page->cgroup without checking refcnt.
692		- */
	661	+ VM_BUG_ON(pc->page != page);
	662	+ VM_BUG_ON(pc->ref_cnt <= 0);
	663	+
	664	+ if (--(pc->ref_cnt) == 0) {
	665	+ page_assign_page_cgroup(page, NULL);
693	666	unlock_page_cgroup(page);
694		- if (clear_page_cgroup(page, pc) == pc) {
695		- mem = pc->mem_cgroup;
696		- css_put(&mem->css);
697		- res_counter_uncharge(&mem->res, PAGE_SIZE);
698		- spin_lock_irqsave(&mz->lru_lock, flags);
699		- __mem_cgroup_remove_list(pc);
700		- spin_unlock_irqrestore(&mz->lru_lock, flags);
701		- kfree(pc);
702		- }
703		- lock_page_cgroup(page);
	667	+
	668	+ mem = pc->mem_cgroup;
	669	+ css_put(&mem->css);
	670	+ res_counter_uncharge(&mem->res, PAGE_SIZE);
	671	+
	672	+ mz = page_cgroup_zoneinfo(pc);
	673	+ spin_lock_irqsave(&mz->lru_lock, flags);
	674	+ __mem_cgroup_remove_list(pc);
	675	+ spin_unlock_irqrestore(&mz->lru_lock, flags);
	676	+
	677	+ kfree(pc);
	678	+ return;
704	679	}
705	680
706	681	unlock:
707	682
708	683
...	...	@@ -714,14 +689,13 @@
714	689	int mem_cgroup_prepare_migration(struct page *page)
715	690	{
716	691	struct page_cgroup *pc;
717		- int ret = 0;
718	692
719	693	lock_page_cgroup(page);
720	694	pc = page_get_page_cgroup(page);
721		- if (pc && atomic_inc_not_zero(&pc->ref_cnt))
722		- ret = 1;
	695	+ if (pc)
	696	+ pc->ref_cnt++;
723	697	unlock_page_cgroup(page);
724		- return ret;
	698	+ return pc != NULL;
725	699	}
726	700
727	701	void mem_cgroup_end_migration(struct page *page)
728	702
729	703
730	704
731	705
...	...	@@ -740,15 +714,17 @@
740	714	struct mem_cgroup_per_zone *mz;
741	715	unsigned long flags;
742	716
743		-retry:
	717	+ lock_page_cgroup(page);
744	718	pc = page_get_page_cgroup(page);
745		- if (!pc)
	719	+ if (!pc) {
	720	+ unlock_page_cgroup(page);
746	721	return;
	722	+ }
747	723
748		- mz = page_cgroup_zoneinfo(pc);
749		- if (clear_page_cgroup(page, pc) != pc)
750		- goto retry;
	724	+ page_assign_page_cgroup(page, NULL);
	725	+ unlock_page_cgroup(page);
751	726
	727	+ mz = page_cgroup_zoneinfo(pc);
752	728	spin_lock_irqsave(&mz->lru_lock, flags);
753	729	__mem_cgroup_remove_list(pc);
754	730	spin_unlock_irqrestore(&mz->lru_lock, flags);
755	731
756	732
...	...	@@ -794,15 +770,19 @@
794	770	while (--count && !list_empty(list)) {
795	771	pc = list_entry(list->prev, struct page_cgroup, lru);
796	772	page = pc->page;
797		- /* Avoid race with charge */
798		- atomic_set(&pc->ref_cnt, 0);
799		- if (clear_page_cgroup(page, pc) == pc) {
	773	+ lock_page_cgroup(page);
	774	+ if (page_get_page_cgroup(page) == pc) {
	775	+ page_assign_page_cgroup(page, NULL);
	776	+ unlock_page_cgroup(page);
800	777	css_put(&mem->css);
801	778	res_counter_uncharge(&mem->res, PAGE_SIZE);
802	779	__mem_cgroup_remove_list(pc);
803	780	kfree(pc);
804		- } else /* being uncharged ? ...do relax */
	781	+ } else {
	782	+ /* racing uncharge: let page go then retry */
	783	+ unlock_page_cgroup(page);
805	784	break;
	785	+ }
806	786	}
807	787
808	788	spin_unlock_irqrestore(&mz->lru_lock, flags);