Commit 5515061d22f0f9976ae7815864bfd22042d36848

Authored by Mel Gorman
Committed by Linus Torvalds
1 parent 7f338fe454

mm: throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage

If swap is backed by network storage such as NBD, there is a risk that a
large number of reclaimers can hang the system by consuming all
PF_MEMALLOC reserves.  To avoid these hangs, the administrator must tune
min_free_kbytes in advance, which is fragile.

This patch throttles direct reclaimers once half of the PF_MEMALLOC
reserves are in use.  If the system is routinely getting throttled, the
administrator can increase min_free_kbytes so that degradation is smoother,
but the system will keep running.
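
For illustration only (this sketch is not part of the patch): the new check
sums the min watermarks of the zones up to ZONE_NORMAL and throttles once
free pages fall below half of that sum. A standalone userspace model of the
arithmetic, with invented numbers:

	/*
	 * Sketch only, not kernel code: models the half-of-reserve test that
	 * pfmemalloc_watermark_ok() performs in the diff below. All values
	 * are made up for illustration.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	struct zone_model {
		unsigned long min_wmark;	/* zone min watermark, in pages */
		unsigned long nr_free;		/* NR_FREE_PAGES, in pages */
	};

	static bool watermark_ok(const struct zone_model *z, int nr_zones)
	{
		unsigned long reserve = 0, free_pages = 0;
		int i;

		for (i = 0; i < nr_zones; i++) {	/* ZONE_DMA .. ZONE_NORMAL */
			reserve += z[i].min_wmark;
			free_pages += z[i].nr_free;
		}
		return free_pages > reserve / 2;	/* false => throttle */
	}

	int main(void)
	{
		struct zone_model zones[] = {
			{ .min_wmark = 128,  .nr_free = 90  },	/* a "ZONE_DMA" */
			{ .min_wmark = 2048, .nr_free = 900 },	/* a "ZONE_NORMAL" */
		};

		printf("throttle: %s\n", watermark_ok(zones, 2) ? "no" : "yes");
		return 0;
	}

If a workload trips the throttle routinely, the reserve can be enlarged
ahead of time, e.g. sysctl -w vm.min_free_kbytes=<value>; the right value is
workload-dependent.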

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Miller <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 122 additions and 8 deletions

include/linux/mmzone.h
... ... @@ -705,6 +705,7 @@
705 705 range, including holes */
706 706 int node_id;
707 707 wait_queue_head_t kswapd_wait;
  708 + wait_queue_head_t pfmemalloc_wait;
708 709 struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */
709 710 int kswapd_max_order;
710 711 enum zone_type classzone_idx;
mm/page_alloc.c
... ... @@ -4389,6 +4389,7 @@
4389 4389 pgdat_resize_init(pgdat);
4390 4390 pgdat->nr_zones = 0;
4391 4391 init_waitqueue_head(&pgdat->kswapd_wait);
  4392 + init_waitqueue_head(&pgdat->pfmemalloc_wait);
4392 4393 pgdat->kswapd_max_order = 0;
4393 4394 pgdat_page_cgroup_init(pgdat);
4394 4395  
mm/vmscan.c
... ... @@ -2112,6 +2112,80 @@
2112 2112 return 0;
2113 2113 }
2114 2114  
  2115 +static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
  2116 +{
  2117 + struct zone *zone;
  2118 + unsigned long pfmemalloc_reserve = 0;
  2119 + unsigned long free_pages = 0;
  2120 + int i;
  2121 + bool wmark_ok;
  2122 +
  2123 + for (i = 0; i <= ZONE_NORMAL; i++) {
  2124 + zone = &pgdat->node_zones[i];
  2125 + pfmemalloc_reserve += min_wmark_pages(zone);
  2126 + free_pages += zone_page_state(zone, NR_FREE_PAGES);
  2127 + }
  2128 +
  2129 + wmark_ok = free_pages > pfmemalloc_reserve / 2;
  2130 +
  2131 + /* kswapd must be awake if processes are being throttled */
  2132 + if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
  2133 + pgdat->classzone_idx = min(pgdat->classzone_idx,
  2134 + (enum zone_type)ZONE_NORMAL);
  2135 + wake_up_interruptible(&pgdat->kswapd_wait);
  2136 + }
  2137 +
  2138 + return wmark_ok;
  2139 +}
  2140 +
  2141 +/*
  2142 + * Throttle direct reclaimers if the backing storage is network-backed
  2143 + * and the PFMEMALLOC reserve for the preferred node is getting
  2144 + * dangerously depleted. kswapd will continue to make progress and wake
  2145 + * the processes when the low watermark is reached.
  2146 + */
  2147 +static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
  2148 + nodemask_t *nodemask)
  2149 +{
  2150 + struct zone *zone;
  2151 + int high_zoneidx = gfp_zone(gfp_mask);
  2152 + pg_data_t *pgdat;
  2153 +
  2154 + /*
  2155 + * Kernel threads should not be throttled as they may be indirectly
  2156 + * responsible for cleaning pages necessary for reclaim to make forward
  2157 + * progress. kjournald for example may enter direct reclaim while
  2158 + * committing a transaction, where throttling it could force other
  2159 + * processes to block on log_wait_commit().
  2160 + */
  2161 + if (current->flags & PF_KTHREAD)
  2162 + return;
  2163 +
  2164 + /* Check if the pfmemalloc reserves are ok */
  2165 + first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
  2166 + pgdat = zone->zone_pgdat;
  2167 + if (pfmemalloc_watermark_ok(pgdat))
  2168 + return;
  2169 +
  2170 + /*
  2171 + * If the caller cannot enter the filesystem, it's possible that it
  2172 + * is due to the caller holding an FS lock or performing a journal
  2173 + * transaction in the case of a filesystem like ext[3|4]. In this case,
  2174 + * it is not safe to block on pfmemalloc_wait as kswapd could be
  2175 + * blocked waiting on the same lock. Instead, throttle for up to a
  2176 + * second before continuing.
  2177 + */
  2178 + if (!(gfp_mask & __GFP_FS)) {
  2179 + wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
  2180 + pfmemalloc_watermark_ok(pgdat), HZ);
  2181 + return;
  2182 + }
  2183 +
  2184 + /* Throttle until kswapd wakes the process */
  2185 + wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
  2186 + pfmemalloc_watermark_ok(pgdat));
  2187 +}
  2188 +
2115 2189 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2116 2190 gfp_t gfp_mask, nodemask_t *nodemask)
2117 2191 {
... ... @@ -2131,6 +2205,15 @@
2131 2205 .gfp_mask = sc.gfp_mask,
2132 2206 };
2133 2207  
  2208 + throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
  2209 +
  2210 + /*
  2211 + * Do not enter reclaim if a fatal signal is pending. 1 is returned so
  2212 + * that the page allocator does not consider triggering OOM.
  2213 + */
  2214 + if (fatal_signal_pending(current))
  2215 + return 1;
  2216 +
2134 2217 trace_mm_vmscan_direct_reclaim_begin(order,
2135 2218 sc.may_writepage,
2136 2219 gfp_mask);
... ... @@ -2275,8 +2358,13 @@
2275 2358 return balanced_pages >= (present_pages >> 2);
2276 2359 }
2277 2360  
2278   -/* is kswapd sleeping prematurely? */
2279   -static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
  2361 +/*
  2362 + * Prepare kswapd for sleeping. This verifies that there are no processes
  2363 + * waiting in throttle_direct_reclaim() and that watermarks have been met.
  2364 + *
  2365 + * Returns true if kswapd is ready to sleep
  2366 + */
  2367 +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2280 2368 int classzone_idx)
2281 2369 {
2282 2370 int i;
2283 2371  
... ... @@ -2285,8 +2373,22 @@
2285 2373  
2286 2374 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2287 2375 if (remaining)
2288   - return true;
  2376 + return false;
2289 2377  
  2378 + /*
  2379 + * There is a potential race between when kswapd checks its watermarks
  2380 + * and a process gets throttled. There is also a potential race if
  2381 + * processes get throttled, kswapd wakes and a large process exits,
  2382 + * thereby balancing the zones, which causes kswapd to miss a wakeup.
  2383 + * If kswapd is going to sleep, no process should be sleeping on
  2384 + * pfmemalloc_wait, so wake them now if necessary. If need be, they
  2385 + * will wake kswapd and get throttled again.
  2386 + */
  2387 + if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
  2388 + wake_up(&pgdat->pfmemalloc_wait);
  2389 + return false;
  2390 + }
  2391 +
2290 2392 /* Check the watermark levels */
2291 2393 for (i = 0; i <= classzone_idx; i++) {
2292 2394 struct zone *zone = pgdat->node_zones + i;
2293 2395  
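
The race described in the comment above is the classic lost-wakeup pattern.
For illustration only (not part of the patch), a minimal userspace model
using a pthread condition variable in place of the waitqueue; all names are
invented:

	/*
	 * Sketch only, not kernel code: the waker (kswapd here) must check
	 * for sleepers and wake them before it sleeps itself, otherwise a
	 * waiter that queued after the last watermark check blocks forever.
	 */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t pfmemalloc_waitq = PTHREAD_COND_INITIALIZER;
	static bool watermark_ok;
	static int nr_throttled;

	static void *direct_reclaimer(void *arg)
	{
		(void)arg;
		pthread_mutex_lock(&lock);
		nr_throttled++;
		while (!watermark_ok)		/* throttled until woken */
			pthread_cond_wait(&pfmemalloc_waitq, &lock);
		nr_throttled--;
		pthread_mutex_unlock(&lock);
		puts("reclaimer released");
		return NULL;
	}

	/* The waitqueue_active() analogue: wake any sleepers before sleeping. */
	static void kswapd_prepare_sleep(void)
	{
		pthread_mutex_lock(&lock);
		if (nr_throttled) {
			watermark_ok = true;
			pthread_cond_broadcast(&pfmemalloc_waitq);
		}
		pthread_mutex_unlock(&lock);
	}

	int main(void)
	{
		pthread_t t;
		bool queued = false;

		pthread_create(&t, NULL, direct_reclaimer, NULL);

		/* Busy-wait (demo only) until the reclaimer is throttled. */
		while (!queued) {
			pthread_mutex_lock(&lock);
			queued = nr_throttled > 0;
			pthread_mutex_unlock(&lock);
		}

		kswapd_prepare_sleep();
		pthread_join(t, NULL);
		return 0;
	}

The waitqueue_active() test in prepare_kswapd_sleep() plays the same role as
the nr_throttled check here: kswapd refuses to sleep while any process is
still queued on pfmemalloc_wait.
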
... ... @@ -2318,9 +2420,9 @@
2318 2420 * must be balanced
2319 2421 */
2320 2422 if (order)
2321   - return !pgdat_balanced(pgdat, balanced, classzone_idx);
  2423 + return pgdat_balanced(pgdat, balanced, classzone_idx);
2322 2424 else
2323   - return !all_zones_ok;
  2425 + return all_zones_ok;
2324 2426 }
2325 2427  
2326 2428 /*
... ... @@ -2546,6 +2648,16 @@
2546 2648 }
2547 2649  
2548 2650 }
  2651 +
  2652 + /*
  2653 + * If the low watermark is met there is no need for processes
  2654 + * to be throttled on pfmemalloc_wait as they should now be
  2655 + * able to safely make forward progress. Wake them.
  2656 + */
  2657 + if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
  2658 + pfmemalloc_watermark_ok(pgdat))
  2659 + wake_up(&pgdat->pfmemalloc_wait);
  2660 +
2549 2661 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2550 2662 break; /* kswapd: all done */
2551 2663 /*
... ... @@ -2647,7 +2759,7 @@
2647 2759 }
2648 2760  
2649 2761 /*
2650   - * Return the order we were reclaiming at so sleeping_prematurely()
  2762 + * Return the order we were reclaiming at so prepare_kswapd_sleep()
2651 2763 * makes a decision on the order we were last reclaiming at. However,
2652 2764 * if another caller entered the allocator slow path while kswapd
2653 2765 * was awake, order will remain at the higher level
... ... @@ -2667,7 +2779,7 @@
2667 2779 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2668 2780  
2669 2781 /* Try to sleep for a short interval */
2670   - if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
  2782 + if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2671 2783 remaining = schedule_timeout(HZ/10);
2672 2784 finish_wait(&pgdat->kswapd_wait, &wait);
2673 2785 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
... ... @@ -2677,7 +2789,7 @@
2677 2789 * After a short sleep, check if it was a premature sleep. If not, then
2678 2790 * go fully to sleep until explicitly woken up.
2679 2791 */
2680   - if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
  2792 + if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2681 2793 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2682 2794  
2683 2795 /*