Commit 815c2c543d3aeb914a361f981440ece552778724

Authored by Shaohua Li
Committed by Linus Torvalds
1 parent 2a8f944934

swap: make swap discard async

Swap can do cluster discard for SSDs, which is good, but there are some
problems:

1. Swap does the discard just before page reclaim gets a swap entry and
   writes the disk sectors.  This is useless for a high-end SSD, because an
   overwrite of a sector implies a discard of the original sector too.  A
   discard + overwrite == overwrite.

2. The purpose of doing the discard is to improve SSD firmware garbage
   collection.  Ideally we should send the discard as early as possible, so
   the firmware can do something smart.  Sending the discard just after the
   swap entry is freed is early compared to sending it just before the
   write.  Of course, if the workload is already bound by GC speed, sending
   the discard earlier or later makes no difference.

3. Block discard is a synchronous API, which delays scan_swap_map()
   significantly.

4. Write and discard commands can be executed in parallel on a PCIe SSD.
   Making swap discard async makes the execution more efficient.

This patch makes swap discard async and moves the discard to where the swap
entry is freed.  Discard and write now have no dependency on each other, so
the above issues are avoided.  Ideally we would discard any freed sectors,
but discard is very slow on some SSDs, so this patch still only discards a
whole cluster at a time.
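
As a rough illustration of the deferred-discard pattern described above (a
minimal user-space sketch, not the patch's code: the kernel uses a
workqueue and si->lock, and cluster_free(), do_discard() and the fixed-size
queue here are made-up names):

/*
 * Toy version of deferred discard: freeing a cluster only queues it;
 * a worker thread issues the (simulated) discard and would then hand
 * the cluster back to the free list, so the free/allocate path never
 * blocks on the device.
 */
#include <pthread.h>
#include <stdio.h>

#define NR_CLUSTERS 8

static int discard_queue[NR_CLUSTERS];
static int queue_len;
static int done;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t work = PTHREAD_COND_INITIALIZER;

/* stands in for discard_swap_cluster(); here it only logs */
static void do_discard(int idx)
{
	printf("discarding cluster %d\n", idx);
}

static void *discard_worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!done || queue_len) {
		if (!queue_len) {
			pthread_cond_wait(&work, &lock);
			continue;
		}
		int idx = discard_queue[--queue_len];

		pthread_mutex_unlock(&lock);	/* drop the lock around the slow discard */
		do_discard(idx);
		pthread_mutex_lock(&lock);
		/* the real code would now move the cluster onto the free list */
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* called where a cluster's usage count drops to zero */
static void cluster_free(int idx)
{
	pthread_mutex_lock(&lock);
	discard_queue[queue_len++] = idx;	/* queue it; never discard inline */
	pthread_cond_signal(&work);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, discard_worker, NULL);
	for (int i = 0; i < NR_CLUSTERS; i++)
		cluster_free(i);

	pthread_mutex_lock(&lock);
	done = 1;
	pthread_cond_signal(&work);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return 0;
}

Dropping the lock around the slow operation mirrors how
swap_do_scheduled_discard() in the diff below releases si->lock before
calling discard_swap_cluster() and reacquires it afterwards.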

My test does several rounds of 'mmap, write, unmap', which triggers a lot
of swap discard.  On a fusionio card, with this patch the test runtime is
reduced to 18% of the time without it, so around 5.5x faster.
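
For reference, the workload can be approximated with something like the
following (an educated guess at the test, not the original program; the
region size and round count are arbitrary and should exceed free RAM so
that pages really get pushed to swap and freed again by munmap):

/*
 * Hypothetical reproduction of the 'mmap, write, unmap' rounds.
 * Each round maps a large anonymous region, dirties every page and
 * unmaps it; under memory pressure the pages are swapped out, and the
 * unmap frees their swap entries, which triggers cluster discards.
 */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	const size_t len = 1UL << 30;	/* 1 GiB per round; pick more than free RAM */
	const size_t page = 4096;	/* assume 4K pages for the touch loop */

	for (int round = 0; round < 8; round++) {
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/* dirty every page so it becomes swap-backed under pressure */
		for (size_t off = 0; off < len; off += page)
			p[off] = (char)off;
		/* unmapping frees the swap entries of swapped-out pages */
		munmap(p, len);
	}
	return 0;
}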

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Shaohua Li <shli@fusionio.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Kyungmin Park <kmpark@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rafael Aquini <aquini@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 125 additions and 87 deletions

include/linux/swap.h
... ... @@ -217,8 +217,6 @@
217 217 unsigned int inuse_pages; /* number of those currently in use */
218 218 unsigned int cluster_next; /* likely index for next allocation */
219 219 unsigned int cluster_nr; /* countdown to next cluster search */
220   - unsigned int lowest_alloc; /* while preparing discard cluster */
221   - unsigned int highest_alloc; /* while preparing discard cluster */
222 220 struct swap_extent *curr_swap_extent;
223 221 struct swap_extent first_swap_extent;
224 222 struct block_device *bdev; /* swap device or bdev of swap file */
... ... @@ -232,14 +230,18 @@
232 230 * protect map scan related fields like
233 231 * swap_map, lowest_bit, highest_bit,
234 232 * inuse_pages, cluster_next,
235   - * cluster_nr, lowest_alloc and
236   - * highest_alloc. other fields are only
237   - * changed at swapon/swapoff, so are
238   - * protected by swap_lock. changing
239   - * flags need hold this lock and
240   - * swap_lock. If both locks need hold,
241   - * hold swap_lock first.
  233 + * cluster_nr, lowest_alloc,
  234 + * highest_alloc, free/discard cluster
  235 + * list. other fields are only changed
  236 + * at swapon/swapoff, so are protected
  237 + * by swap_lock. changing flags need
  238 + * hold this lock and swap_lock. If
  239 + * both locks need hold, hold swap_lock
  240 + * first.
242 241 */
  242 + struct work_struct discard_work; /* discard worker */
  243 + struct swap_cluster_info discard_cluster_head; /* list head of discard clusters */
  244 + struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
243 245 };
244 246  
245 247 struct swap_list_t {
mm/swapfile.c
... ... @@ -175,12 +175,6 @@
175 175 }
176 176 }
177 177  
178   -static int wait_for_discard(void *word)
179   -{
180   - schedule();
181   - return 0;
182   -}
183   -
184 178 #define SWAPFILE_CLUSTER 256
185 179 #define LATENCY_LIMIT 256
186 180  
... ... @@ -242,7 +236,91 @@
242 236 info->data = 0;
243 237 }
244 238  
  239 +/* Add a cluster to discard list and schedule it to do discard */
  240 +static void swap_cluster_schedule_discard(struct swap_info_struct *si,
  241 + unsigned int idx)
  242 +{
  243 + /*
  244 + * If scan_swap_map() can't find a free cluster, it will check
  245 + * si->swap_map directly. To make sure the discarding cluster isn't
  246 + * taken by scan_swap_map(), mark the swap entries bad (occupied). It
  247 + * will be cleared after discard
  248 + */
  249 + memset(si->swap_map + idx * SWAPFILE_CLUSTER,
  250 + SWAP_MAP_BAD, SWAPFILE_CLUSTER);
  251 +
  252 + if (cluster_is_null(&si->discard_cluster_head)) {
  253 + cluster_set_next_flag(&si->discard_cluster_head,
  254 + idx, 0);
  255 + cluster_set_next_flag(&si->discard_cluster_tail,
  256 + idx, 0);
  257 + } else {
  258 + unsigned int tail = cluster_next(&si->discard_cluster_tail);
  259 + cluster_set_next(&si->cluster_info[tail], idx);
  260 + cluster_set_next_flag(&si->discard_cluster_tail,
  261 + idx, 0);
  262 + }
  263 +
  264 + schedule_work(&si->discard_work);
  265 +}
  266 +
245 267 /*
  268 + * Doing discard actually. After a cluster discard is finished, the cluster
  269 + * will be added to free cluster list. caller should hold si->lock.
  270 +*/
  271 +static void swap_do_scheduled_discard(struct swap_info_struct *si)
  272 +{
  273 + struct swap_cluster_info *info;
  274 + unsigned int idx;
  275 +
  276 + info = si->cluster_info;
  277 +
  278 + while (!cluster_is_null(&si->discard_cluster_head)) {
  279 + idx = cluster_next(&si->discard_cluster_head);
  280 +
  281 + cluster_set_next_flag(&si->discard_cluster_head,
  282 + cluster_next(&info[idx]), 0);
  283 + if (cluster_next(&si->discard_cluster_tail) == idx) {
  284 + cluster_set_null(&si->discard_cluster_head);
  285 + cluster_set_null(&si->discard_cluster_tail);
  286 + }
  287 + spin_unlock(&si->lock);
  288 +
  289 + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
  290 + SWAPFILE_CLUSTER);
  291 +
  292 + spin_lock(&si->lock);
  293 + cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
  294 + if (cluster_is_null(&si->free_cluster_head)) {
  295 + cluster_set_next_flag(&si->free_cluster_head,
  296 + idx, 0);
  297 + cluster_set_next_flag(&si->free_cluster_tail,
  298 + idx, 0);
  299 + } else {
  300 + unsigned int tail;
  301 +
  302 + tail = cluster_next(&si->free_cluster_tail);
  303 + cluster_set_next(&info[tail], idx);
  304 + cluster_set_next_flag(&si->free_cluster_tail,
  305 + idx, 0);
  306 + }
  307 + memset(si->swap_map + idx * SWAPFILE_CLUSTER,
  308 + 0, SWAPFILE_CLUSTER);
  309 + }
  310 +}
  311 +
  312 +static void swap_discard_work(struct work_struct *work)
  313 +{
  314 + struct swap_info_struct *si;
  315 +
  316 + si = container_of(work, struct swap_info_struct, discard_work);
  317 +
  318 + spin_lock(&si->lock);
  319 + swap_do_scheduled_discard(si);
  320 + spin_unlock(&si->lock);
  321 +}
  322 +
  323 +/*
246 324 * The cluster corresponding to page_nr will be used. The cluster will be
247 325 * removed from free cluster list and its usage counter will be increased.
248 326 */
... ... @@ -287,6 +365,16 @@
287 365 cluster_count(&cluster_info[idx]) - 1);
288 366  
289 367 if (cluster_count(&cluster_info[idx]) == 0) {
  368 + /*
  369 + * If the swap is discardable, prepare discard the cluster
  370 + * instead of free it immediately. The cluster will be freed
  371 + * after discard.
  372 + */
  373 + if (p->flags & SWP_PAGE_DISCARD) {
  374 + swap_cluster_schedule_discard(p, idx);
  375 + return;
  376 + }
  377 +
290 378 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
291 379 if (cluster_is_null(&p->free_cluster_head)) {
292 380 cluster_set_next_flag(&p->free_cluster_head, idx, 0);
... ... @@ -319,7 +407,6 @@
319 407 unsigned long scan_base;
320 408 unsigned long last_in_cluster = 0;
321 409 int latency_ration = LATENCY_LIMIT;
322   - int found_free_cluster = 0;
323 410  
324 411 /*
325 412 * We try to cluster swap pages by allocating them sequentially
... ... @@ -340,19 +427,6 @@
340 427 si->cluster_nr = SWAPFILE_CLUSTER - 1;
341 428 goto checks;
342 429 }
343   - if (si->flags & SWP_PAGE_DISCARD) {
344   - /*
345   - * Start range check on racing allocations, in case
346   - * they overlap the cluster we eventually decide on
347   - * (we scan without swap_lock to allow preemption).
348   - * It's hardly conceivable that cluster_nr could be
349   - * wrapped during our scan, but don't depend on it.
350   - */
351   - if (si->lowest_alloc)
352   - goto checks;
353   - si->lowest_alloc = si->max;
354   - si->highest_alloc = 0;
355   - }
356 430 check_cluster:
357 431 if (!cluster_is_null(&si->free_cluster_head)) {
358 432 offset = cluster_next(&si->free_cluster_head) *
359 433 SWAPFILE_CLUSTER;
... ... @@ -360,15 +434,27 @@
360 434 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
361 435 si->cluster_next = offset;
362 436 si->cluster_nr = SWAPFILE_CLUSTER - 1;
363   - found_free_cluster = 1;
364 437 goto checks;
365 438 } else if (si->cluster_info) {
366 439 /*
  440 + * we don't have free cluster but have some clusters in
  441 + * discarding, do discard now and reclaim them
  442 + */
  443 + if (!cluster_is_null(&si->discard_cluster_head)) {
  444 + si->cluster_nr = 0;
  445 + swap_do_scheduled_discard(si);
  446 + scan_base = offset = si->cluster_next;
  447 + if (!si->cluster_nr)
  448 + goto check_cluster;
  449 + si->cluster_nr--;
  450 + goto checks;
  451 + }
  452 +
  453 + /*
367 454 * Checking free cluster is fast enough, we can do the
368 455 * check every time
369 456 */
370 457 si->cluster_nr = 0;
371   - si->lowest_alloc = 0;
372 458 goto checks;
373 459 }
374 460  
... ... @@ -395,7 +481,6 @@
395 481 offset -= SWAPFILE_CLUSTER - 1;
396 482 si->cluster_next = offset;
397 483 si->cluster_nr = SWAPFILE_CLUSTER - 1;
398   - found_free_cluster = 1;
399 484 goto checks;
400 485 }
401 486 if (unlikely(--latency_ration < 0)) {
... ... @@ -416,7 +501,6 @@
416 501 offset -= SWAPFILE_CLUSTER - 1;
417 502 si->cluster_next = offset;
418 503 si->cluster_nr = SWAPFILE_CLUSTER - 1;
419   - found_free_cluster = 1;
420 504 goto checks;
421 505 }
422 506 if (unlikely(--latency_ration < 0)) {
... ... @@ -428,7 +512,6 @@
428 512 offset = scan_base;
429 513 spin_lock(&si->lock);
430 514 si->cluster_nr = SWAPFILE_CLUSTER - 1;
431   - si->lowest_alloc = 0;
432 515 }
433 516  
434 517 checks:
... ... @@ -470,59 +553,6 @@
470 553 si->cluster_next = offset + 1;
471 554 si->flags -= SWP_SCANNING;
472 555  
473   - if (si->lowest_alloc) {
474   - /*
475   - * Only set when SWP_PAGE_DISCARD, and there's a scan
476   - * for a free cluster in progress or just completed.
477   - */
478   - if (found_free_cluster) {
479   - /*
480   - * To optimize wear-levelling, discard the
481   - * old data of the cluster, taking care not to
482   - * discard any of its pages that have already
483   - * been allocated by racing tasks (offset has
484   - * already stepped over any at the beginning).
485   - */
486   - if (offset < si->highest_alloc &&
487   - si->lowest_alloc <= last_in_cluster)
488   - last_in_cluster = si->lowest_alloc - 1;
489   - si->flags |= SWP_DISCARDING;
490   - spin_unlock(&si->lock);
491   -
492   - if (offset < last_in_cluster)
493   - discard_swap_cluster(si, offset,
494   - last_in_cluster - offset + 1);
495   -
496   - spin_lock(&si->lock);
497   - si->lowest_alloc = 0;
498   - si->flags &= ~SWP_DISCARDING;
499   -
500   - smp_mb(); /* wake_up_bit advises this */
501   - wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
502   -
503   - } else if (si->flags & SWP_DISCARDING) {
504   - /*
505   - * Delay using pages allocated by racing tasks
506   - * until the whole discard has been issued. We
507   - * could defer that delay until swap_writepage,
508   - * but it's easier to keep this self-contained.
509   - */
510   - spin_unlock(&si->lock);
511   - wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
512   - wait_for_discard, TASK_UNINTERRUPTIBLE);
513   - spin_lock(&si->lock);
514   - } else {
515   - /*
516   - * Note pages allocated by racing tasks while
517   - * scan for a free cluster is in progress, so
518   - * that its final discard can exclude them.
519   - */
520   - if (offset < si->lowest_alloc)
521   - si->lowest_alloc = offset;
522   - if (offset > si->highest_alloc)
523   - si->highest_alloc = offset;
524   - }
525   - }
526 556 return offset;
527 557  
528 558 scan:
... ... @@ -1806,6 +1836,8 @@
1806 1836 goto out_dput;
1807 1837 }
1808 1838  
  1839 + flush_work(&p->discard_work);
  1840 +
1809 1841 destroy_swap_extents(p);
1810 1842 if (p->flags & SWP_CONTINUED)
1811 1843 free_swap_count_continuations(p);
... ... @@ -2172,6 +2204,8 @@
2172 2204  
2173 2205 cluster_set_null(&p->free_cluster_head);
2174 2206 cluster_set_null(&p->free_cluster_tail);
  2207 + cluster_set_null(&p->discard_cluster_head);
  2208 + cluster_set_null(&p->discard_cluster_tail);
2175 2209  
2176 2210 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2177 2211 unsigned int page_nr = swap_header->info.badpages[i];
... ... @@ -2280,6 +2314,8 @@
2280 2314 p = alloc_swap_info();
2281 2315 if (IS_ERR(p))
2282 2316 return PTR_ERR(p);
  2317 +
  2318 + INIT_WORK(&p->discard_work, swap_discard_work);
2283 2319  
2284 2320 name = getname(specialfile);
2285 2321 if (IS_ERR(name)) {