Commit dcf6b7ddd7df8965727746f89c59229b23180e5a

Authored by Rafael Aquini
Committed by Linus Torvalds
1 parent 917d9290af

swap: discard while swapping only if SWAP_FLAG_DISCARD_PAGES

Considering the use cases where the swap device supports discard:
a) and can do it quickly;
b) but it's slow to do in small granularities (or concurrent with other
   I/O);
c) but the implementation is so horrendous that you don't even want to
   send one down;

And assuming that the sysadmin considers it useful to send the discards down
at all, we would (probably) want the following solutions:

  i. do the fine-grained discards for freed swap pages, if device is
     capable of doing so optimally;
 ii. do single-time (batched) swap area discards, either at swapon
     or via something like fstrim (not implemented yet);
iii. allow doing both single-time and fine-grained discards; or
 iv. turn it off completely (default behavior)

As implemented today, one can only enable/disable discards for swap, but
one cannot select, for instance, solution (ii) on a swap device like (b)
even though the single-time discard is regarded to be interesting, or
necessary to the workload because it would imply (i), and the device is
not capable of performing it optimally.

This patch addresses the scenario depicted above by introducing a way to
ensure the (probably) wanted solutions (i, ii, iii and iv) can be flexibly
flagged through swapon(8) to allow a sysadmin to select the most suitable
swap discard policy according to system constraints.

This patch introduces the new SWAP_FLAG_DISCARD_PAGES and
SWAP_FLAG_DISCARD_ONCE flags to allow more flexible swap discard policies
to be flagged through swapon(8).  The default behavior is to keep both
single-time, or batched, area discards (SWAP_FLAG_DISCARD_ONCE) and
fine-grained discards for page-clusters (SWAP_FLAG_DISCARD_PAGES) enabled,
in order to keep consistency with older kernel behavior, as well as
maintain compatibility with older swapon(8).  However, through the newly
introduced flags the most suitable discard policy can be selected
according to any given swap device constraint.

[akpm@linux-foundation.org: tweak comments]
Signed-off-by: Rafael Aquini <aquini@redhat.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Karel Zak <kzak@redhat.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 59 additions and 9 deletions Side-by-side Diff

include/linux/swap.h
... ... @@ -20,10 +20,13 @@
20 20 #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */
21 21 #define SWAP_FLAG_PRIO_MASK 0x7fff
22 22 #define SWAP_FLAG_PRIO_SHIFT 0
23   -#define SWAP_FLAG_DISCARD 0x10000 /* discard swap cluster after use */
  23 +#define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */
  24 +#define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */
  25 +#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */
24 26  
25 27 #define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
26   - SWAP_FLAG_DISCARD)
  28 + SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \
  29 + SWAP_FLAG_DISCARD_PAGES)
27 30  
28 31 static inline int current_is_kswapd(void)
29 32 {
30 33  
31 34  
... ... @@ -147,14 +150,16 @@
147 150 enum {
148 151 SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
149 152 SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
150   - SWP_DISCARDABLE = (1 << 2), /* swapon+blkdev support discard */
  153 + SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */
151 154 SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
152 155 SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
153 156 SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
154 157 SWP_BLKDEV = (1 << 6), /* its a block device */
155 158 SWP_FILE = (1 << 7), /* set after swap_activate success */
  159 + SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */
  160 + SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */
156 161 /* add others here before... */
157   - SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
  162 + SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */
158 163 };
159 164  
160 165 #define SWAP_CLUSTER_MAX 32UL
... ... @@ -212,7 +212,7 @@
212 212 si->cluster_nr = SWAPFILE_CLUSTER - 1;
213 213 goto checks;
214 214 }
215   - if (si->flags & SWP_DISCARDABLE) {
  215 + if (si->flags & SWP_PAGE_DISCARD) {
216 216 /*
217 217 * Start range check on racing allocations, in case
218 218 * they overlap the cluster we eventually decide on
... ... @@ -322,7 +322,7 @@
322 322  
323 323 if (si->lowest_alloc) {
324 324 /*
325   - * Only set when SWP_DISCARDABLE, and there's a scan
  325 + * Only set when SWP_PAGE_DISCARD, and there's a scan
326 326 * for a free cluster in progress or just completed.
327 327 */
328 328 if (found_free_cluster) {
... ... @@ -2016,6 +2016,20 @@
2016 2016 return nr_extents;
2017 2017 }
2018 2018  
  2019 +/*
  2020 + * Helper to sys_swapon determining if a given swap
  2021 + * backing device queue supports DISCARD operations.
  2022 + */
  2023 +static bool swap_discardable(struct swap_info_struct *si)
  2024 +{
  2025 + struct request_queue *q = bdev_get_queue(si->bdev);
  2026 +
  2027 + if (!q || !blk_queue_discard(q))
  2028 + return false;
  2029 +
  2030 + return true;
  2031 +}
  2032 +
2019 2033 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2020 2034 {
2021 2035 struct swap_info_struct *p;
... ... @@ -2123,8 +2137,37 @@
2123 2137 p->flags |= SWP_SOLIDSTATE;
2124 2138 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2125 2139 }
2126   - if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
2127   - p->flags |= SWP_DISCARDABLE;
  2140 +
  2141 + if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
  2142 + /*
  2143 + * When discard is enabled for swap with no particular
  2144 + * policy flagged, we set all swap discard flags here in
  2145 + * order to sustain backward compatibility with older
  2146 + * swapon(8) releases.
  2147 + */
  2148 + p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
  2149 + SWP_PAGE_DISCARD);
  2150 +
  2151 + /*
  2152 + * By flagging sys_swapon, a sysadmin can tell us to
  2153 + * either do single-time area discards only, or to just
  2154 + * perform discards for released swap page-clusters.
  2155 + * Now it's time to adjust the p->flags accordingly.
  2156 + */
  2157 + if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
  2158 + p->flags &= ~SWP_PAGE_DISCARD;
  2159 + else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
  2160 + p->flags &= ~SWP_AREA_DISCARD;
  2161 +
  2162 + /* issue a swapon-time discard if it's still required */
  2163 + if (p->flags & SWP_AREA_DISCARD) {
  2164 + int err = discard_swap(p);
  2165 + if (unlikely(err))
  2166 + printk(KERN_ERR
  2167 + "swapon: discard_swap(%p): %d\n",
  2168 + p, err);
  2169 + }
  2170 + }
2128 2171 }
2129 2172  
2130 2173 mutex_lock(&swapon_mutex);
2131 2174  
... ... @@ -2135,11 +2178,13 @@
2135 2178 enable_swap_info(p, prio, swap_map, frontswap_map);
2136 2179  
2137 2180 printk(KERN_INFO "Adding %uk swap on %s. "
2138   - "Priority:%d extents:%d across:%lluk %s%s%s\n",
  2181 + "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
2139 2182 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2140 2183 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2141 2184 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2142 2185 (p->flags & SWP_DISCARDABLE) ? "D" : "",
  2186 + (p->flags & SWP_AREA_DISCARD) ? "s" : "",
  2187 + (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
2143 2188 (frontswap_map) ? "FS" : "");
2144 2189  
2145 2190 mutex_unlock(&swapon_mutex);