Commit 7992fde72ce06c73280a1939b7a1e903bc95ef85

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent 6a6ba83175

swapfile: swap allocation use discard

When scan_swap_map() finds a free cluster of swap pages to allocate,
discard the old contents of the cluster if the device supports discard.
But don't bother when swap is so fragmented that we allocate single pages.

Be careful about racing allocations made while we're scanning for a
cluster; and hold up allocations made while we're discarding.
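
A userspace analogue may help picture that hold-up. Below is a minimal
pthreads sketch (editor's illustration, not part of the patch; all names
are hypothetical) of the same handshake the patch builds from
SWP_DISCARDING, wait_on_bit() and wake_up_bit(): set a flag under the
lock, drop the lock to issue the discard, then clear the flag and wake
any allocator that saw it.

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t swap_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t discard_done = PTHREAD_COND_INITIALIZER;
    static bool discarding;

    static void issue_discard(void)
    {
            /* stand-in for blkdev_issue_discard() */
    }

    void discard_cluster(void)
    {
            pthread_mutex_lock(&swap_lock);
            discarding = true;              /* like setting SWP_DISCARDING */
            pthread_mutex_unlock(&swap_lock);

            issue_discard();                /* done without the lock held */

            pthread_mutex_lock(&swap_lock);
            discarding = false;
            pthread_cond_broadcast(&discard_done);  /* like wake_up_bit() */
            pthread_mutex_unlock(&swap_lock);
    }

    void allocate_page(void)
    {
            pthread_mutex_lock(&swap_lock);
            while (discarding)              /* like wait_on_bit() */
                    pthread_cond_wait(&discard_done, &swap_lock);
            /* ... proceed with the allocation ... */
            pthread_mutex_unlock(&swap_lock);
    }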

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Joern Engel <joern@logfs.org>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Donjun Shin <djshin90@gmail.com>
Cc: Tejun Heo <teheo@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 121 additions and 1 deletion
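
The heart of the patch is discard_swap_cluster() in mm/swapfile.c, which
walks the swap extents and hands each run of swap pages to
blkdev_issue_discard() as a range of 512-byte sectors. A minimal sketch
of that shift arithmetic (editor's illustration, assuming 4K pages, i.e.
PAGE_SHIFT == 12):

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assumed: 4K pages */

    int main(void)
    {
            unsigned long long start_block = 1024; /* extent offset, in pages */
            unsigned long long nr_blocks = 256;    /* one SWAPFILE_CLUSTER */

            /* each page covers 2^(PAGE_SHIFT - 9) 512-byte sectors */
            start_block <<= PAGE_SHIFT - 9;
            nr_blocks <<= PAGE_SHIFT - 9;

            /* prints "discard 2048 sectors from sector 8192" */
            printf("discard %llu sectors from sector %llu\n",
                   nr_blocks, start_block);
            return 0;
    }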

include/linux/swap.h
... ... @@ -121,6 +121,7 @@
121 121 SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
122 122 SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
123 123 SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */
  124 + SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
124 125 /* add others here before... */
125 126 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
126 127 };
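
The new flag is a power of two like its neighbours, which matters later
in the patch: wait_on_bit() and wake_up_bit() take a bit number, which
scan_swap_map() derives as ilog2(SWP_DISCARDING). A standalone check of
that arithmetic (editor's sketch; ilog2_sketch() merely reimplements the
kernel's ilog2() for illustration):

    #include <stdio.h>

    enum {
            SWP_DISCARDABLE = 1 << 2,   /* blkdev supports discard */
            SWP_DISCARDING  = 1 << 3,   /* now discarding a free cluster */
    };

    static int ilog2_sketch(unsigned long v)
    {
            int bit = -1;

            while (v) {
                    v >>= 1;
                    bit++;
            }
            return bit;
    }

    int main(void)
    {
            /* prints 3: the bit waited on and woken in scan_swap_map() */
            printf("%d\n", ilog2_sketch(SWP_DISCARDING));
            return 0;
    }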
... ... @@ -144,6 +145,8 @@
144 145 unsigned short *swap_map;
145 146 unsigned int lowest_bit;
146 147 unsigned int highest_bit;
  148 + unsigned int lowest_alloc; /* while preparing discard cluster */
  149 + unsigned int highest_alloc; /* while preparing discard cluster */
147 150 unsigned int cluster_next;
148 151 unsigned int cluster_nr;
149 152 unsigned int pages;
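
lowest_alloc and highest_alloc record the window of pages that racing
tasks allocate while the lockless scan for a free cluster is running;
scan_swap_map() below uses that window to clip the final discard range.
A worked example with hypothetical numbers (editor's sketch, mirroring
the clipping done on the found_free_cluster path):

    #include <stdio.h>

    int main(void)
    {
            unsigned long offset = 1000;          /* start of free cluster */
            unsigned long last_in_cluster = 1255; /* its last page */
            unsigned long lowest_alloc = 1200;    /* racing allocations    */
            unsigned long highest_alloc = 1250;   /* landed in 1200..1250  */

            /* mirrors the check in scan_swap_map() */
            if (offset < highest_alloc && lowest_alloc <= last_in_cluster)
                    last_in_cluster = lowest_alloc - 1;

            /* prints "discard pages 1000..1199": racers' pages spared */
            printf("discard pages %lu..%lu\n", offset, last_in_cluster);
            return 0;
    }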
mm/swapfile.c
... ... @@ -115,14 +115,62 @@
115 115 return err; /* That will often be -EOPNOTSUPP */
116 116 }
117 117  
  118 +/*
  119 + * swap allocation tells the device that a cluster of swap can now be discarded,
  120 + * to allow the swap device to optimize its wear-levelling.
  121 + */
  122 +static void discard_swap_cluster(struct swap_info_struct *si,
  123 + pgoff_t start_page, pgoff_t nr_pages)
  124 +{
  125 + struct swap_extent *se = si->curr_swap_extent;
  126 + int found_extent = 0;
  127 +
  128 + while (nr_pages) {
  129 + struct list_head *lh;
  130 +
  131 + if (se->start_page <= start_page &&
  132 + start_page < se->start_page + se->nr_pages) {
  133 + pgoff_t offset = start_page - se->start_page;
  134 + sector_t start_block = se->start_block + offset;
  135 + pgoff_t nr_blocks = se->nr_pages - offset;
  136 +
  137 + if (nr_blocks > nr_pages)
  138 + nr_blocks = nr_pages;
  139 + start_page += nr_blocks;
  140 + nr_pages -= nr_blocks;
  141 +
  142 + if (!found_extent++)
  143 + si->curr_swap_extent = se;
  144 +
  145 + start_block <<= PAGE_SHIFT - 9;
  146 + nr_blocks <<= PAGE_SHIFT - 9;
  147 + if (blkdev_issue_discard(si->bdev, start_block,
  148 + nr_blocks, GFP_NOIO))
  149 + break;
  150 + }
  151 +
  152 + lh = se->list.next;
  153 + if (lh == &si->extent_list)
  154 + lh = lh->next;
  155 + se = list_entry(lh, struct swap_extent, list);
  156 + }
  157 +}
  158 +
  159 +static int wait_for_discard(void *word)
  160 +{
  161 + schedule();
  162 + return 0;
  163 +}
  164 +
118 165 #define SWAPFILE_CLUSTER 256
119 166 #define LATENCY_LIMIT 256
120 167  
121 168 static inline unsigned long scan_swap_map(struct swap_info_struct *si)
122 169 {
123 170 unsigned long offset;
124   - unsigned long last_in_cluster;
  171 + unsigned long last_in_cluster = 0;
125 172 int latency_ration = LATENCY_LIMIT;
  173 + int found_free_cluster = 0;
126 174  
127 175 /*
128 176 * We try to cluster swap pages by allocating them sequentially
... ... @@ -142,6 +190,19 @@
142 190 si->cluster_nr = SWAPFILE_CLUSTER - 1;
143 191 goto checks;
144 192 }
  193 + if (si->flags & SWP_DISCARDABLE) {
  194 + /*
  195 + * Start range check on racing allocations, in case
  196 + * they overlap the cluster we eventually decide on
  197 + * (we scan without swap_lock to allow preemption).
  198 + * It's hardly conceivable that cluster_nr could be
  199 + * wrapped during our scan, but don't depend on it.
  200 + */
  201 + if (si->lowest_alloc)
  202 + goto checks;
  203 + si->lowest_alloc = si->max;
  204 + si->highest_alloc = 0;
  205 + }
145 206 spin_unlock(&swap_lock);
146 207  
147 208 offset = si->lowest_bit;
... ... @@ -156,6 +217,7 @@
156 217 offset -= SWAPFILE_CLUSTER - 1;
157 218 si->cluster_next = offset;
158 219 si->cluster_nr = SWAPFILE_CLUSTER - 1;
  220 + found_free_cluster = 1;
159 221 goto checks;
160 222 }
161 223 if (unlikely(--latency_ration < 0)) {
... ... @@ -167,6 +229,7 @@
167 229 offset = si->lowest_bit;
168 230 spin_lock(&swap_lock);
169 231 si->cluster_nr = SWAPFILE_CLUSTER - 1;
  232 + si->lowest_alloc = 0;
170 233 }
171 234  
172 235 checks:
... ... @@ -191,6 +254,60 @@
191 254 si->swap_map[offset] = 1;
192 255 si->cluster_next = offset + 1;
193 256 si->flags -= SWP_SCANNING;
  257 +
  258 + if (si->lowest_alloc) {
  259 + /*
  260 + * Only set when SWP_DISCARDABLE, and there's a scan
  261 + * for a free cluster in progress or just completed.
  262 + */
  263 + if (found_free_cluster) {
  264 + /*
  265 + * To optimize wear-levelling, discard the
  266 + * old data of the cluster, taking care not to
  267 + * discard any of its pages that have already
  268 + * been allocated by racing tasks (offset has
  269 + * already stepped over any at the beginning).
  270 + */
  271 + if (offset < si->highest_alloc &&
  272 + si->lowest_alloc <= last_in_cluster)
  273 + last_in_cluster = si->lowest_alloc - 1;
  274 + si->flags |= SWP_DISCARDING;
  275 + spin_unlock(&swap_lock);
  276 +
  277 + if (offset < last_in_cluster)
  278 + discard_swap_cluster(si, offset,
  279 + last_in_cluster - offset + 1);
  280 +
  281 + spin_lock(&swap_lock);
  282 + si->lowest_alloc = 0;
  283 + si->flags &= ~SWP_DISCARDING;
  284 +
  285 + smp_mb(); /* wake_up_bit advises this */
  286 + wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
  287 +
  288 + } else if (si->flags & SWP_DISCARDING) {
  289 + /*
  290 + * Delay using pages allocated by racing tasks
  291 + * until the whole discard has been issued. We
  292 + * could defer that delay until swap_writepage,
  293 + * but it's easier to keep this self-contained.
  294 + */
  295 + spin_unlock(&swap_lock);
  296 + wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
  297 + wait_for_discard, TASK_UNINTERRUPTIBLE);
  298 + spin_lock(&swap_lock);
  299 + } else {
  300 + /*
  301 + * Note pages allocated by racing tasks while
  302 + * a scan for a free cluster is in progress, so
  303 + * that its final discard can exclude them.
  304 + */
  305 + if (offset < si->lowest_alloc)
  306 + si->lowest_alloc = offset;
  307 + if (offset > si->highest_alloc)
  308 + si->highest_alloc = offset;
  309 + }
  310 + }
194 311 return offset;
195 312  
196 313 scan: