Commit 60105e1248f571aa3b895cd63bef072ed9d90c77

Authored by Minchan Kim
Committed by Linus Torvalds
1 parent 6335b19344

mm/zswap: support multiple swap devices

Cai Liu reported that zbud pool page counting is broken when multiple
swap devices are used: only one swap device's pool is counted instead
of all of them, so zswap cannot control writeback properly.  The result
is unnecessary writeback, or no writeback when writeback is really
needed.

In other words, it made zswap behave erratically.
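
To make the accounting problem concrete, here is a minimal userspace
sketch (hypothetical stand-in types, not the kernel code): with one
pool per swap device, the global counter that the "is zswap full?"
check relies on only ever reflects the pool of the device being
touched at that moment.

    /* Toy model of the pre-patch accounting; "pool" and "tree" are
     * simplified stand-ins for zbud_pool and zswap_tree. */
    #include <stdio.h>

    #define MAX_SWAPFILES 2

    struct pool { unsigned long pages; };
    struct tree { struct pool pool; };

    static struct tree trees[MAX_SWAPFILES] = {
            { { 1900 } },   /* swap A: 19% of a 10000-page budget */
            { {  100 } },   /* swap B:  1% of the same budget     */
    };

    /* What the limit check compares against the pool-size limit. */
    static unsigned long zswap_pool_pages;

    /* Pre-patch behaviour: refreshed from the pool of the swap device
     * currently being stored to, so other devices' pages are invisible. */
    static void update_accounting(struct tree *tree)
    {
            zswap_pool_pages = tree->pool.pages;
    }

    int main(void)
    {
            update_accounting(&trees[1]);   /* VM is currently storing to B */
            printf("counted %lu pages, real usage %lu pages\n",
                   zswap_pool_pages,
                   trees[0].pool.pages + trees[1].pool.pages);
            return 0;
    }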

There is another problem in zswap:

For example, assume two swap devices A and B with different priorities.
A was charged up to 19% a long time ago and is now full, so the VM
starts using B, which has recently been charged 1%.  With the default
limit (zswap_max_pool_percent = 20), zswap is now considered full at
19% + 1%.  If the VM then wants to swap more pages out to B,
zbud_reclaim_page() evicts a page from B's pool, and this repeats
continuously.  This completely inverts the LRU order and causes swap
thrashing on B.
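
The eviction side of the same setup can be sketched the same way
(again a hypothetical userspace model, not the kernel code): because
reclaim is directed at the pool of the device currently being written,
it throws away B's freshly stored pages while A's much older pages are
never considered.

    /* Toy model of pre-patch reclaim: each device has its own pool and
     * its own LRU, and the store path reclaims from the current pool. */
    #include <stdio.h>

    struct pool {
            const char *name;
            unsigned long oldest_page_age;  /* age of this pool's LRU tail */
    };

    static void reclaim_from(struct pool *p)
    {
            printf("evicting from %s (oldest page age %lu)\n",
                   p->name, p->oldest_page_age);
    }

    int main(void)
    {
            struct pool a = { "A", 1000 };  /* filled long ago   */
            struct pool b = { "B",   10 };  /* filled just now   */

            /* The VM is swapping to B, so B's pool is the one reclaimed,
             * even though the globally oldest pages live in A. */
            reclaim_from(&b);
            return 0;
    }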

This patch makes zswap handle multiple swap devices by creating a
single zbud pool that is shared by all of them.  All zswap pages,
regardless of which swap device they came from, are then kept in one
LRU order, which prevents both problems described above.
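
The resulting layout can be summarised with the following sketch
(userspace stand-ins for the kernel types; the real definitions are in
the diff below):

    /* One zbud pool for all of zswap: pages from every swap device share
     * a single LRU, so reclaim always evicts the globally oldest page and
     * the pool size reflects total usage across devices. */
    struct zbud_pool;                       /* opaque compressed-page pool */

    static struct zbud_pool *zswap_pool;    /* shared by all zswap backends */

    /* Still one tree per swap device, but it no longer owns a pool. */
    struct rb_root { void *rb_node; };          /* stub for kernel rb_root   */
    typedef struct { int locked; } spinlock_t;  /* stub for kernel spinlock  */

    struct zswap_tree {
            struct rb_root rbroot;
            spinlock_t lock;
    };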

Signed-off-by: Minchan Kim <minchan@kernel.org>
Reported-by: Cai Liu <cai.liu@samsung.com>
Suggested-by: Weijie Yang <weijie.yang.kh@gmail.com>
Cc: Seth Jennings <sjennings@variantweb.net>
Reviewed-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

 mm/zswap.c | 64 +++++++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 33 insertions(+), 31 deletions(-)

@@ -89,6 +89,9 @@
 module_param_named(max_pool_percent,
                        zswap_max_pool_percent, uint, 0644);
 
+/* zbud_pool is shared by all of zswap backend */
+static struct zbud_pool *zswap_pool;
+
 /*********************************
 * compression functions
 **********************************/
@@ -189,7 +192,6 @@
 struct zswap_tree {
         struct rb_root rbroot;
         spinlock_t lock;
-        struct zbud_pool *pool;
 };
 
 static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
@@ -285,13 +287,12 @@
  * Carries out the common pattern of freeing and entry's zbud allocation,
  * freeing the entry itself, and decrementing the number of stored pages.
  */
-static void zswap_free_entry(struct zswap_tree *tree,
-                        struct zswap_entry *entry)
+static void zswap_free_entry(struct zswap_entry *entry)
 {
-        zbud_free(tree->pool, entry->handle);
+        zbud_free(zswap_pool, entry->handle);
         zswap_entry_cache_free(entry);
         atomic_dec(&zswap_stored_pages);
-        zswap_pool_pages = zbud_get_pool_size(tree->pool);
+        zswap_pool_pages = zbud_get_pool_size(zswap_pool);
 }
 
 /* caller must hold the tree lock */
@@ -311,7 +312,7 @@
         BUG_ON(refcount < 0);
         if (refcount == 0) {
                 zswap_rb_erase(&tree->rbroot, entry);
-                zswap_free_entry(tree, entry);
+                zswap_free_entry(entry);
         }
 }
 
@@ -545,7 +546,6 @@
         zbud_unmap(pool, handle);
         tree = zswap_trees[swp_type(swpentry)];
         offset = swp_offset(swpentry);
-        BUG_ON(pool != tree->pool);
 
         /* find and ref zswap entry */
         spin_lock(&tree->lock);
@@ -573,13 +573,13 @@
         case ZSWAP_SWAPCACHE_NEW: /* page is locked */
                 /* decompress */
                 dlen = PAGE_SIZE;
-                src = (u8 *)zbud_map(tree->pool, entry->handle) +
+                src = (u8 *)zbud_map(zswap_pool, entry->handle) +
                         sizeof(struct zswap_header);
                 dst = kmap_atomic(page);
                 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
                                 entry->length, dst, &dlen);
                 kunmap_atomic(dst);
-                zbud_unmap(tree->pool, entry->handle);
+                zbud_unmap(zswap_pool, entry->handle);
                 BUG_ON(ret);
                 BUG_ON(dlen != PAGE_SIZE);
 
@@ -652,7 +652,7 @@
         /* reclaim space if needed */
         if (zswap_is_full()) {
                 zswap_pool_limit_hit++;
-                if (zbud_reclaim_page(tree->pool, 8)) {
+                if (zbud_reclaim_page(zswap_pool, 8)) {
                         zswap_reject_reclaim_fail++;
                         ret = -ENOMEM;
                         goto reject;
@@ -679,7 +679,7 @@
 
         /* store */
         len = dlen + sizeof(struct zswap_header);
-        ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN,
+        ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
                         &handle);
         if (ret == -ENOSPC) {
                 zswap_reject_compress_poor++;
@@ -689,11 +689,11 @@
                 zswap_reject_alloc_fail++;
                 goto freepage;
         }
-        zhdr = zbud_map(tree->pool, handle);
+        zhdr = zbud_map(zswap_pool, handle);
         zhdr->swpentry = swp_entry(type, offset);
         buf = (u8 *)(zhdr + 1);
         memcpy(buf, dst, dlen);
-        zbud_unmap(tree->pool, handle);
+        zbud_unmap(zswap_pool, handle);
         put_cpu_var(zswap_dstmem);
 
         /* populate entry */
@@ -716,7 +716,7 @@
 
         /* update stats */
         atomic_inc(&zswap_stored_pages);
-        zswap_pool_pages = zbud_get_pool_size(tree->pool);
+        zswap_pool_pages = zbud_get_pool_size(zswap_pool);
 
         return 0;
 
@@ -752,13 +752,13 @@
 
         /* decompress */
         dlen = PAGE_SIZE;
-        src = (u8 *)zbud_map(tree->pool, entry->handle) +
+        src = (u8 *)zbud_map(zswap_pool, entry->handle) +
                         sizeof(struct zswap_header);
         dst = kmap_atomic(page);
         ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
                         dst, &dlen);
         kunmap_atomic(dst);
-        zbud_unmap(tree->pool, entry->handle);
+        zbud_unmap(zswap_pool, entry->handle);
         BUG_ON(ret);
 
         spin_lock(&tree->lock);
@@ -804,11 +804,9 @@
         /* walk the tree and free everything */
         spin_lock(&tree->lock);
         rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
-                zswap_free_entry(tree, entry);
+                zswap_free_entry(entry);
         tree->rbroot = RB_ROOT;
         spin_unlock(&tree->lock);
-
-        zbud_destroy_pool(tree->pool);
         kfree(tree);
         zswap_trees[type] = NULL;
 }
@@ -822,20 +820,14 @@
         struct zswap_tree *tree;
 
         tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
-        if (!tree)
-                goto err;
-        tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
-        if (!tree->pool)
-                goto freetree;
+        if (!tree) {
+                pr_err("alloc failed, zswap disabled for swap type %d\n", type);
+                return;
+        }
+
         tree->rbroot = RB_ROOT;
         spin_lock_init(&tree->lock);
         zswap_trees[type] = tree;
-        return;
-
-freetree:
-        kfree(tree);
-err:
-        pr_err("alloc failed, zswap disabled for swap type %d\n", type);
 }
 
 static struct frontswap_ops zswap_frontswap_ops = {
@@ -907,9 +899,16 @@
                 return 0;
 
         pr_info("loading zswap\n");
+
+        zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
+        if (!zswap_pool) {
+                pr_err("zbud pool creation failed\n");
+                goto error;
+        }
+
         if (zswap_entry_cache_create()) {
                 pr_err("entry cache creation failed\n");
-                goto error;
+                goto cachefail;
         }
         if (zswap_comp_init()) {
                 pr_err("compressor initialization failed\n");
@@ -919,6 +918,7 @@
                 pr_err("per-cpu initialization failed\n");
                 goto pcpufail;
         }
+
         frontswap_register_ops(&zswap_frontswap_ops);
         if (zswap_debugfs_init())
                 pr_warn("debugfs initialization failed\n");
@@ -927,6 +927,8 @@
         zswap_comp_exit();
 compfail:
         zswap_entry_cache_destory();
+cachefail:
+        zbud_destroy_pool(zswap_pool);
 error:
         return -ENOMEM;
 }