Commit 7992fde72ce06c73280a1939b7a1e903bc95ef85

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent 6a6ba83175

swapfile: swap allocation use discard

When scan_swap_map() finds a free cluster of swap pages to allocate,
discard the old contents of the cluster if the device supports discard.
But don't bother when swap is so fragmented that we allocate single pages.

Be careful about racing allocations made while we're scanning for a
cluster; and hold up allocations made while we're discarding.
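
A userspace analogue may help picture that hold-up. Below is a minimal
pthreads sketch (editor's illustration, not part of the patch; all names
are hypothetical) of the same handshake the patch builds from
SWP_DISCARDING, wait_on_bit() and wake_up_bit(): set a flag under the
lock, drop the lock to issue the discard, then clear the flag and wake
any allocator that saw it.

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t swap_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t discard_done = PTHREAD_COND_INITIALIZER;
    static bool discarding;

    static void issue_discard(void)
    {
            /* stand-in for blkdev_issue_discard() */
    }

    void discard_cluster(void)
    {
            pthread_mutex_lock(&swap_lock);
            discarding = true;              /* like setting SWP_DISCARDING */
            pthread_mutex_unlock(&swap_lock);

            issue_discard();                /* done without the lock held */

            pthread_mutex_lock(&swap_lock);
            discarding = false;
            pthread_cond_broadcast(&discard_done);  /* like wake_up_bit() */
            pthread_mutex_unlock(&swap_lock);
    }

    void allocate_page(void)
    {
            pthread_mutex_lock(&swap_lock);
            while (discarding)              /* like wait_on_bit() */
                    pthread_cond_wait(&discard_done, &swap_lock);
            /* ... proceed with the allocation ... */
            pthread_mutex_unlock(&swap_lock);
    }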

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Joern Engel <joern@logfs.org>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Donjun Shin <djshin90@gmail.com>
Cc: Tejun Heo <teheo@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 121 additions and 1 deletion
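
The heart of the patch is discard_swap_cluster() in mm/swapfile.c, which
walks the swap extents and hands each run of swap pages to
blkdev_issue_discard() as a range of 512-byte sectors. A minimal sketch
of that shift arithmetic (editor's illustration, assuming 4K pages, i.e.
PAGE_SHIFT == 12):

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assumed: 4K pages */

    int main(void)
    {
            unsigned long long start_block = 1024; /* extent offset, in pages */
            unsigned long long nr_blocks = 256;    /* one SWAPFILE_CLUSTER */

            /* each page covers 2^(PAGE_SHIFT - 9) 512-byte sectors */
            start_block <<= PAGE_SHIFT - 9;
            nr_blocks <<= PAGE_SHIFT - 9;

            /* prints "discard 2048 sectors from sector 8192" */
            printf("discard %llu sectors from sector %llu\n",
                   nr_blocks, start_block);
            return 0;
    }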

include/linux/swap.h
... ... @@ -121,6 +121,7 @@
121 121 SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
122 122 SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
123 123 SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */
  124 + SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
124 125 /* add others here before... */
125 126 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
126 127 };
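
The new flag is a power of two like its neighbours, which matters later
in the patch: wait_on_bit() and wake_up_bit() take a bit number, which
scan_swap_map() derives as ilog2(SWP_DISCARDING). A standalone check of
that arithmetic (editor's sketch; ilog2_sketch() merely reimplements the
kernel's ilog2() for illustration):

    #include <stdio.h>

    enum {
            SWP_DISCARDABLE = 1 << 2,   /* blkdev supports discard */
            SWP_DISCARDING  = 1 << 3,   /* now discarding a free cluster */
    };

    static int ilog2_sketch(unsigned long v)
    {
            int bit = -1;

            while (v) {
                    v >>= 1;
                    bit++;
            }
            return bit;
    }

    int main(void)
    {
            /* prints 3: the bit waited on and woken in scan_swap_map() */
            printf("%d\n", ilog2_sketch(SWP_DISCARDING));
            return 0;
    }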
... ... @@ -144,6 +145,8 @@
144 145 unsigned short *swap_map;
145 146 unsigned int lowest_bit;
146 147 unsigned int highest_bit;
  148 + unsigned int lowest_alloc; /* while preparing discard cluster */
  149 + unsigned int highest_alloc; /* while preparing discard cluster */
147 150 unsigned int cluster_next;
148 151 unsigned int cluster_nr;
149 152 unsigned int pages;
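
lowest_alloc and highest_alloc record the window of pages that racing
tasks allocate while the lockless scan for a free cluster is running;
scan_swap_map() below uses that window to clip the final discard range.
A worked example with hypothetical numbers (editor's sketch, mirroring
the clipping done on the found_free_cluster path):

    #include <stdio.h>

    int main(void)
    {
            unsigned long offset = 1000;          /* start of free cluster */
            unsigned long last_in_cluster = 1255; /* its last page */
            unsigned long lowest_alloc = 1200;    /* racing allocations    */
            unsigned long highest_alloc = 1250;   /* landed in 1200..1250  */

            /* mirrors the check in scan_swap_map() */
            if (offset < highest_alloc && lowest_alloc <= last_in_cluster)
                    last_in_cluster = lowest_alloc - 1;

            /* prints "discard pages 1000..1199": racers' pages spared */
            printf("discard pages %lu..%lu\n", offset, last_in_cluster);
            return 0;
    }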
mm/swapfile.c
... ... @@ -115,14 +115,62 @@
115 115 return err; /* That will often be -EOPNOTSUPP */
116 116 }
117 117  
  118 +/*
  119 + * swap allocation tells the device that a cluster of swap can now be discarded,
  120 + * to allow the swap device to optimize its wear-levelling.
  121 + */
  122 +static void discard_swap_cluster(struct swap_info_struct *si,
  123 + pgoff_t start_page, pgoff_t nr_pages)
  124 +{
  125 + struct swap_extent *se = si->curr_swap_extent;
  126 + int found_extent = 0;
  127 +
  128 + while (nr_pages) {
  129 + struct list_head *lh;
  130 +
  131 + if (se->start_page <= start_page &&
  132 + start_page < se->start_page + se->nr_pages) {
  133 + pgoff_t offset = start_page - se->start_page;
  134 + sector_t start_block = se->start_block + offset;
  135 + pgoff_t nr_blocks = se->nr_pages - offset;
  136 +
  137 + if (nr_blocks > nr_pages)
  138 + nr_blocks = nr_pages;
  139 + start_page += nr_blocks;
  140 + nr_pages -= nr_blocks;
  141 +
  142 + if (!found_extent++)
  143 + si->curr_swap_extent = se;
  144 +
  145 + start_block <<= PAGE_SHIFT - 9;
  146 + nr_blocks <<= PAGE_SHIFT - 9;
  147 + if (blkdev_issue_discard(si->bdev, start_block,
  148 + nr_blocks, GFP_NOIO))
  149 + break;
  150 + }
  151 +
  152 + lh = se->list.next;
  153 + if (lh == &si->extent_list)
  154 + lh = lh->next;
  155 + se = list_entry(lh, struct swap_extent, list);
  156 + }
  157 +}
  158 +
  159 +static int wait_for_discard(void *word)
  160 +{
  161 + schedule();
  162 + return 0;
  163 +}
  164 +
118 165 #define SWAPFILE_CLUSTER 256
119 166 #define LATENCY_LIMIT 256
120 167  
121 168 static inline unsigned long scan_swap_map(struct swap_info_struct *si)
122 169 {
123 170 unsigned long offset;
124   - unsigned long last_in_cluster;
  171 + unsigned long last_in_cluster = 0;
125 172 int latency_ration = LATENCY_LIMIT;
  173 + int found_free_cluster = 0;
126 174  
127 175 /*
128 176 * We try to cluster swap pages by allocating them sequentially
... ... @@ -142,6 +190,19 @@
142 190 si->cluster_nr = SWAPFILE_CLUSTER - 1;
143 191 goto checks;
144 192 }
  193 + if (si->flags & SWP_DISCARDABLE) {
  194 + /*
  195 + * Start range check on racing allocations, in case
  196 + * they overlap the cluster we eventually decide on
  197 + * (we scan without swap_lock to allow preemption).
  198 + * It's hardly conceivable that cluster_nr could be
  199 + * wrapped during our scan, but don't depend on it.
  200 + */
  201 + if (si->lowest_alloc)
  202 + goto checks;
  203 + si->lowest_alloc = si->max;
  204 + si->highest_alloc = 0;
  205 + }
145 206 spin_unlock(&swap_lock);
146 207  
147 208 offset = si->lowest_bit;
... ... @@ -156,6 +217,7 @@
156 217 offset -= SWAPFILE_CLUSTER - 1;
157 218 si->cluster_next = offset;
158 219 si->cluster_nr = SWAPFILE_CLUSTER - 1;
  220 + found_free_cluster = 1;
159 221 goto checks;
160 222 }
161 223 if (unlikely(--latency_ration < 0)) {
... ... @@ -167,6 +229,7 @@
167 229 offset = si->lowest_bit;
168 230 spin_lock(&swap_lock);
169 231 si->cluster_nr = SWAPFILE_CLUSTER - 1;
  232 + si->lowest_alloc = 0;
170 233 }
171 234  
172 235 checks:
... ... @@ -191,6 +254,60 @@
191 254 si->swap_map[offset] = 1;
192 255 si->cluster_next = offset + 1;
193 256 si->flags -= SWP_SCANNING;
  257 +
  258 + if (si->lowest_alloc) {
  259 + /*
  260 + * Only set when SWP_DISCARDABLE, and there's a scan
  261 + * for a free cluster in progress or just completed.
  262 + */
  263 + if (found_free_cluster) {
  264 + /*
  265 + * To optimize wear-levelling, discard the
  266 + * old data of the cluster, taking care not to
  267 + * discard any of its pages that have already
  268 + * been allocated by racing tasks (offset has
  269 + * already stepped over any at the beginning).
  270 + */
  271 + if (offset < si->highest_alloc &&
  272 + si->lowest_alloc <= last_in_cluster)
  273 + last_in_cluster = si->lowest_alloc - 1;
  274 + si->flags |= SWP_DISCARDING;
  275 + spin_unlock(&swap_lock);
  276 +
  277 + if (offset < last_in_cluster)
  278 + discard_swap_cluster(si, offset,
  279 + last_in_cluster - offset + 1);
  280 +
  281 + spin_lock(&swap_lock);
  282 + si->lowest_alloc = 0;
  283 + si->flags &= ~SWP_DISCARDING;
  284 +
  285 + smp_mb(); /* wake_up_bit advises this */
  286 + wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
  287 +
  288 + } else if (si->flags & SWP_DISCARDING) {
  289 + /*
  290 + * Delay using pages allocated by racing tasks
  291 + * until the whole discard has been issued. We
  292 + * could defer that delay until swap_writepage,
  293 + * but it's easier to keep this self-contained.
  294 + */
  295 + spin_unlock(&swap_lock);
  296 + wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
  297 + wait_for_discard, TASK_UNINTERRUPTIBLE);
  298 + spin_lock(&swap_lock);
  299 + } else {
  300 + /*
  301 + * Note pages allocated by racing tasks while
  302 + * a scan for a free cluster is in progress, so
  303 + * that its final discard can exclude them.
  304 + */
  305 + if (offset < si->lowest_alloc)
  306 + si->lowest_alloc = offset;
  307 + if (offset > si->highest_alloc)
  308 + si->highest_alloc = offset;
  309 + }
  310 + }
194 311 return offset;
195 312  
196 313 scan: