Commit 815c2c543d3aeb914a361f981440ece552778724

Authored by Shaohua Li
Committed by Linus Torvalds
1 parent 2a8f944934

swap: make swap discard async

Swap can do cluster discard for SSDs, which is good, but there are some
problems:

1. Swap does the discard just before page reclaim gets a swap entry and
   writes the disk sectors.  This is useless for a high-end SSD, because an
   overwrite of a sector implies a discard of the original sector too.  A
   discard + overwrite == overwrite.

2. The purpose of doing the discard is to improve SSD firmware garbage
   collection.  Ideally we should send the discard as early as possible, so
   the firmware can do something smart.  Sending the discard just after the
   swap entry is freed is early compared to sending it just before the
   write.  Of course, if the workload is already bound by GC speed, sending
   the discard earlier or later makes no difference.

3. Block discard is a synchronous API, which delays scan_swap_map()
   significantly.

4. Write and discard commands can be executed in parallel on a PCIe SSD.
   Making swap discard async makes the execution more efficient.

This patch makes swap discard async and moves the discard to where the swap
entry is freed.  Discard and write now have no dependency on each other, so
the above issues are avoided.  Ideally we would discard any freed sectors,
but discard is very slow on some SSDs, so this patch still only discards a
whole cluster at a time.
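
As a rough illustration of the deferred-discard pattern described above (a
minimal user-space sketch, not the patch's code: the kernel uses a
workqueue and si->lock, and cluster_free(), do_discard() and the fixed-size
queue here are made-up names):

/*
 * Toy version of deferred discard: freeing a cluster only queues it;
 * a worker thread issues the (simulated) discard and would then hand
 * the cluster back to the free list, so the free/allocate path never
 * blocks on the device.
 */
#include <pthread.h>
#include <stdio.h>

#define NR_CLUSTERS 8

static int discard_queue[NR_CLUSTERS];
static int queue_len;
static int done;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t work = PTHREAD_COND_INITIALIZER;

/* stands in for discard_swap_cluster(); here it only logs */
static void do_discard(int idx)
{
	printf("discarding cluster %d\n", idx);
}

static void *discard_worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!done || queue_len) {
		if (!queue_len) {
			pthread_cond_wait(&work, &lock);
			continue;
		}
		int idx = discard_queue[--queue_len];

		pthread_mutex_unlock(&lock);	/* drop the lock around the slow discard */
		do_discard(idx);
		pthread_mutex_lock(&lock);
		/* the real code would now move the cluster onto the free list */
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* called where a cluster's usage count drops to zero */
static void cluster_free(int idx)
{
	pthread_mutex_lock(&lock);
	discard_queue[queue_len++] = idx;	/* queue it; never discard inline */
	pthread_cond_signal(&work);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, discard_worker, NULL);
	for (int i = 0; i < NR_CLUSTERS; i++)
		cluster_free(i);

	pthread_mutex_lock(&lock);
	done = 1;
	pthread_cond_signal(&work);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return 0;
}

Dropping the lock around the slow operation mirrors how
swap_do_scheduled_discard() in the diff below releases si->lock before
calling discard_swap_cluster() and reacquires it afterwards.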

My test does several rounds of 'mmap, write, unmap', which triggers a lot
of swap discard.  On a fusionio card, with this patch the test runtime is
reduced to 18% of the time without it, so around 5.5x faster.
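
For reference, the workload can be approximated with something like the
following (an educated guess at the test, not the original program; the
region size and round count are arbitrary and should exceed free RAM so
that pages really get pushed to swap and freed again by munmap):

/*
 * Hypothetical reproduction of the 'mmap, write, unmap' rounds.
 * Each round maps a large anonymous region, dirties every page and
 * unmaps it; under memory pressure the pages are swapped out, and the
 * unmap frees their swap entries, which triggers cluster discards.
 */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	const size_t len = 1UL << 30;	/* 1 GiB per round; pick more than free RAM */
	const size_t page = 4096;	/* assume 4K pages for the touch loop */

	for (int round = 0; round < 8; round++) {
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/* dirty every page so it becomes swap-backed under pressure */
		for (size_t off = 0; off < len; off += page)
			p[off] = (char)off;
		/* unmapping frees the swap entries of swapped-out pages */
		munmap(p, len);
	}
	return 0;
}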

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Shaohua Li <shli@fusionio.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Kyungmin Park <kmpark@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rafael Aquini <aquini@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 125 additions and 87 deletions

include/linux/swap.h
... ... @@ -217,8 +217,6 @@
217 217 unsigned int inuse_pages; /* number of those currently in use */
218 218 unsigned int cluster_next; /* likely index for next allocation */
219 219 unsigned int cluster_nr; /* countdown to next cluster search */
220   - unsigned int lowest_alloc; /* while preparing discard cluster */
221   - unsigned int highest_alloc; /* while preparing discard cluster */
222 220 struct swap_extent *curr_swap_extent;
223 221 struct swap_extent first_swap_extent;
224 222 struct block_device *bdev; /* swap device or bdev of swap file */
... ... @@ -232,14 +230,18 @@
232 230 * protect map scan related fields like
233 231 * swap_map, lowest_bit, highest_bit,
234 232 * inuse_pages, cluster_next,
235   - * cluster_nr, lowest_alloc and
236   - * highest_alloc. other fields are only
237   - * changed at swapon/swapoff, so are
238   - * protected by swap_lock. changing
239   - * flags need hold this lock and
240   - * swap_lock. If both locks need hold,
241   - * hold swap_lock first.
  233 + * cluster_nr, lowest_alloc,
  234 + * highest_alloc, free/discard cluster
  235 + * list. other fields are only changed
  236 + * at swapon/swapoff, so are protected
  237 + * by swap_lock. changing flags need
  238 + * hold this lock and swap_lock. If
  239 + * both locks need hold, hold swap_lock
  240 + * first.
242 241 */
  242 + struct work_struct discard_work; /* discard worker */
  243 + struct swap_cluster_info discard_cluster_head; /* list head of discard clusters */
  244 + struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
243 245 };
244 246  
245 247 struct swap_list_t {
mm/swapfile.c
... ... @@ -175,12 +175,6 @@
175 175 }
176 176 }
177 177  
178   -static int wait_for_discard(void *word)
179   -{
180   - schedule();
181   - return 0;
182   -}
183   -
184 178 #define SWAPFILE_CLUSTER 256
185 179 #define LATENCY_LIMIT 256
186 180  
... ... @@ -242,7 +236,91 @@
242 236 info->data = 0;
243 237 }
244 238  
  239 +/* Add a cluster to discard list and schedule it to do discard */
  240 +static void swap_cluster_schedule_discard(struct swap_info_struct *si,
  241 + unsigned int idx)
  242 +{
  243 + /*
  244 + * If scan_swap_map() can't find a free cluster, it will check
  245 + * si->swap_map directly. To make sure the discarding cluster isn't
  246 + * taken by scan_swap_map(), mark the swap entries bad (occupied). It
  247 + * will be cleared after discard
  248 + */
  249 + memset(si->swap_map + idx * SWAPFILE_CLUSTER,
  250 + SWAP_MAP_BAD, SWAPFILE_CLUSTER);
  251 +
  252 + if (cluster_is_null(&si->discard_cluster_head)) {
  253 + cluster_set_next_flag(&si->discard_cluster_head,
  254 + idx, 0);
  255 + cluster_set_next_flag(&si->discard_cluster_tail,
  256 + idx, 0);
  257 + } else {
  258 + unsigned int tail = cluster_next(&si->discard_cluster_tail);
  259 + cluster_set_next(&si->cluster_info[tail], idx);
  260 + cluster_set_next_flag(&si->discard_cluster_tail,
  261 + idx, 0);
  262 + }
  263 +
  264 + schedule_work(&si->discard_work);
  265 +}
  266 +
245 267 /*
  268 + * Doing discard actually. After a cluster discard is finished, the cluster
  269 + * will be added to free cluster list. caller should hold si->lock.
  270 +*/
  271 +static void swap_do_scheduled_discard(struct swap_info_struct *si)
  272 +{
  273 + struct swap_cluster_info *info;
  274 + unsigned int idx;
  275 +
  276 + info = si->cluster_info;
  277 +
  278 + while (!cluster_is_null(&si->discard_cluster_head)) {
  279 + idx = cluster_next(&si->discard_cluster_head);
  280 +
  281 + cluster_set_next_flag(&si->discard_cluster_head,
  282 + cluster_next(&info[idx]), 0);
  283 + if (cluster_next(&si->discard_cluster_tail) == idx) {
  284 + cluster_set_null(&si->discard_cluster_head);
  285 + cluster_set_null(&si->discard_cluster_tail);
  286 + }
  287 + spin_unlock(&si->lock);
  288 +
  289 + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
  290 + SWAPFILE_CLUSTER);
  291 +
  292 + spin_lock(&si->lock);
  293 + cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
  294 + if (cluster_is_null(&si->free_cluster_head)) {
  295 + cluster_set_next_flag(&si->free_cluster_head,
  296 + idx, 0);
  297 + cluster_set_next_flag(&si->free_cluster_tail,
  298 + idx, 0);
  299 + } else {
  300 + unsigned int tail;
  301 +
  302 + tail = cluster_next(&si->free_cluster_tail);
  303 + cluster_set_next(&info[tail], idx);
  304 + cluster_set_next_flag(&si->free_cluster_tail,
  305 + idx, 0);
  306 + }
  307 + memset(si->swap_map + idx * SWAPFILE_CLUSTER,
  308 + 0, SWAPFILE_CLUSTER);
  309 + }
  310 +}
  311 +
  312 +static void swap_discard_work(struct work_struct *work)
  313 +{
  314 + struct swap_info_struct *si;
  315 +
  316 + si = container_of(work, struct swap_info_struct, discard_work);
  317 +
  318 + spin_lock(&si->lock);
  319 + swap_do_scheduled_discard(si);
  320 + spin_unlock(&si->lock);
  321 +}
  322 +
  323 +/*
246 324 * The cluster corresponding to page_nr will be used. The cluster will be
247 325 * removed from free cluster list and its usage counter will be increased.
248 326 */
... ... @@ -287,6 +365,16 @@
287 365 cluster_count(&cluster_info[idx]) - 1);
288 366  
289 367 if (cluster_count(&cluster_info[idx]) == 0) {
  368 + /*
  369 + * If the swap is discardable, prepare discard the cluster
  370 + * instead of free it immediately. The cluster will be freed
  371 + * after discard.
  372 + */
  373 + if (p->flags & SWP_PAGE_DISCARD) {
  374 + swap_cluster_schedule_discard(p, idx);
  375 + return;
  376 + }
  377 +
290 378 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
291 379 if (cluster_is_null(&p->free_cluster_head)) {
292 380 cluster_set_next_flag(&p->free_cluster_head, idx, 0);
... ... @@ -319,7 +407,6 @@
319 407 unsigned long scan_base;
320 408 unsigned long last_in_cluster = 0;
321 409 int latency_ration = LATENCY_LIMIT;
322   - int found_free_cluster = 0;
323 410  
324 411 /*
325 412 * We try to cluster swap pages by allocating them sequentially
... ... @@ -340,19 +427,6 @@
340 427 si->cluster_nr = SWAPFILE_CLUSTER - 1;
341 428 goto checks;
342 429 }
343   - if (si->flags & SWP_PAGE_DISCARD) {
344   - /*
345   - * Start range check on racing allocations, in case
346   - * they overlap the cluster we eventually decide on
347   - * (we scan without swap_lock to allow preemption).
348   - * It's hardly conceivable that cluster_nr could be
349   - * wrapped during our scan, but don't depend on it.
350   - */
351   - if (si->lowest_alloc)
352   - goto checks;
353   - si->lowest_alloc = si->max;
354   - si->highest_alloc = 0;
355   - }
356 430 check_cluster:
357 431 if (!cluster_is_null(&si->free_cluster_head)) {
358 432 offset = cluster_next(&si->free_cluster_head) *
359 433 SWAPFILE_CLUSTER;
... ... @@ -360,15 +434,27 @@
360 434 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
361 435 si->cluster_next = offset;
362 436 si->cluster_nr = SWAPFILE_CLUSTER - 1;
363   - found_free_cluster = 1;
364 437 goto checks;
365 438 } else if (si->cluster_info) {
366 439 /*
  440 + * we don't have free cluster but have some clusters in
  441 + * discarding, do discard now and reclaim them
  442 + */
  443 + if (!cluster_is_null(&si->discard_cluster_head)) {
  444 + si->cluster_nr = 0;
  445 + swap_do_scheduled_discard(si);
  446 + scan_base = offset = si->cluster_next;
  447 + if (!si->cluster_nr)
  448 + goto check_cluster;
  449 + si->cluster_nr--;
  450 + goto checks;
  451 + }
  452 +
  453 + /*
367 454 * Checking free cluster is fast enough, we can do the
368 455 * check every time
369 456 */
370 457 si->cluster_nr = 0;
371   - si->lowest_alloc = 0;
372 458 goto checks;
373 459 }
374 460  
... ... @@ -395,7 +481,6 @@
395 481 offset -= SWAPFILE_CLUSTER - 1;
396 482 si->cluster_next = offset;
397 483 si->cluster_nr = SWAPFILE_CLUSTER - 1;
398   - found_free_cluster = 1;
399 484 goto checks;
400 485 }
401 486 if (unlikely(--latency_ration < 0)) {
... ... @@ -416,7 +501,6 @@
416 501 offset -= SWAPFILE_CLUSTER - 1;
417 502 si->cluster_next = offset;
418 503 si->cluster_nr = SWAPFILE_CLUSTER - 1;
419   - found_free_cluster = 1;
420 504 goto checks;
421 505 }
422 506 if (unlikely(--latency_ration < 0)) {
... ... @@ -428,7 +512,6 @@
428 512 offset = scan_base;
429 513 spin_lock(&si->lock);
430 514 si->cluster_nr = SWAPFILE_CLUSTER - 1;
431   - si->lowest_alloc = 0;
432 515 }
433 516  
434 517 checks:
... ... @@ -470,59 +553,6 @@
470 553 si->cluster_next = offset + 1;
471 554 si->flags -= SWP_SCANNING;
472 555  
473   - if (si->lowest_alloc) {
474   - /*
475   - * Only set when SWP_PAGE_DISCARD, and there's a scan
476   - * for a free cluster in progress or just completed.
477   - */
478   - if (found_free_cluster) {
479   - /*
480   - * To optimize wear-levelling, discard the
481   - * old data of the cluster, taking care not to
482   - * discard any of its pages that have already
483   - * been allocated by racing tasks (offset has
484   - * already stepped over any at the beginning).
485   - */
486   - if (offset < si->highest_alloc &&
487   - si->lowest_alloc <= last_in_cluster)
488   - last_in_cluster = si->lowest_alloc - 1;
489   - si->flags |= SWP_DISCARDING;
490   - spin_unlock(&si->lock);
491   -
492   - if (offset < last_in_cluster)
493   - discard_swap_cluster(si, offset,
494   - last_in_cluster - offset + 1);
495   -
496   - spin_lock(&si->lock);
497   - si->lowest_alloc = 0;
498   - si->flags &= ~SWP_DISCARDING;
499   -
500   - smp_mb(); /* wake_up_bit advises this */
501   - wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
502   -
503   - } else if (si->flags & SWP_DISCARDING) {
504   - /*
505   - * Delay using pages allocated by racing tasks
506   - * until the whole discard has been issued. We
507   - * could defer that delay until swap_writepage,
508   - * but it's easier to keep this self-contained.
509   - */
510   - spin_unlock(&si->lock);
511   - wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
512   - wait_for_discard, TASK_UNINTERRUPTIBLE);
513   - spin_lock(&si->lock);
514   - } else {
515   - /*
516   - * Note pages allocated by racing tasks while
517   - * scan for a free cluster is in progress, so
518   - * that its final discard can exclude them.
519   - */
520   - if (offset < si->lowest_alloc)
521   - si->lowest_alloc = offset;
522   - if (offset > si->highest_alloc)
523   - si->highest_alloc = offset;
524   - }
525   - }
526 556 return offset;
527 557  
528 558 scan:
... ... @@ -1806,6 +1836,8 @@
1806 1836 goto out_dput;
1807 1837 }
1808 1838  
  1839 + flush_work(&p->discard_work);
  1840 +
1809 1841 destroy_swap_extents(p);
1810 1842 if (p->flags & SWP_CONTINUED)
1811 1843 free_swap_count_continuations(p);
... ... @@ -2172,6 +2204,8 @@
2172 2204  
2173 2205 cluster_set_null(&p->free_cluster_head);
2174 2206 cluster_set_null(&p->free_cluster_tail);
  2207 + cluster_set_null(&p->discard_cluster_head);
  2208 + cluster_set_null(&p->discard_cluster_tail);
2175 2209  
2176 2210 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2177 2211 unsigned int page_nr = swap_header->info.badpages[i];
... ... @@ -2280,6 +2314,8 @@
2280 2314 p = alloc_swap_info();
2281 2315 if (IS_ERR(p))
2282 2316 return PTR_ERR(p);
  2317 +
  2318 + INIT_WORK(&p->discard_work, swap_discard_work);
2283 2319  
2284 2320 name = getname(specialfile);
2285 2321 if (IS_ERR(name)) {