Commit 5515061d22f0f9976ae7815864bfd22042d36848
Committed by
Linus Torvalds
1 parent
7f338fe454
Exists in
master
and in
20 other branches
mm: throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
If swap is backed by network storage such as NBD, there is a risk that a large number of reclaimers can hang the system by consuming all PF_MEMALLOC reserves. To avoid these hangs, the administrator must tune min_free_kbytes in advance which is a bit fragile. This patch throttles direct reclaimers if half the PF_MEMALLOC reserves are in use. If the system is routinely getting throttled the system administrator can increase min_free_kbytes so degradation is smoother but the system will keep running. Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: David Miller <davem@davemloft.net> Cc: Neil Brown <neilb@suse.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Mike Christie <michaelc@cs.wisc.edu> Cc: Eric B Munson <emunson@mgebm.net> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> Cc: Mel Gorman <mgorman@suse.de> Cc: Christoph Lameter <cl@linux.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 3 changed files with 122 additions and 8 deletions Side-by-side Diff
include/linux/mmzone.h
... | ... | @@ -705,6 +705,7 @@ |
705 | 705 | range, including holes */ |
706 | 706 | int node_id; |
707 | 707 | wait_queue_head_t kswapd_wait; |
708 | + wait_queue_head_t pfmemalloc_wait; | |
708 | 709 | struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ |
709 | 710 | int kswapd_max_order; |
710 | 711 | enum zone_type classzone_idx; |
mm/page_alloc.c
mm/vmscan.c
... | ... | @@ -2112,6 +2112,80 @@ |
2112 | 2112 | return 0; |
2113 | 2113 | } |
2114 | 2114 | |
2115 | +static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |
2116 | +{ | |
2117 | + struct zone *zone; | |
2118 | + unsigned long pfmemalloc_reserve = 0; | |
2119 | + unsigned long free_pages = 0; | |
2120 | + int i; | |
2121 | + bool wmark_ok; | |
2122 | + | |
2123 | + for (i = 0; i <= ZONE_NORMAL; i++) { | |
2124 | + zone = &pgdat->node_zones[i]; | |
2125 | + pfmemalloc_reserve += min_wmark_pages(zone); | |
2126 | + free_pages += zone_page_state(zone, NR_FREE_PAGES); | |
2127 | + } | |
2128 | + | |
2129 | + wmark_ok = free_pages > pfmemalloc_reserve / 2; | |
2130 | + | |
2131 | + /* kswapd must be awake if processes are being throttled */ | |
2132 | + if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { | |
2133 | + pgdat->classzone_idx = min(pgdat->classzone_idx, | |
2134 | + (enum zone_type)ZONE_NORMAL); | |
2135 | + wake_up_interruptible(&pgdat->kswapd_wait); | |
2136 | + } | |
2137 | + | |
2138 | + return wmark_ok; | |
2139 | +} | |
2140 | + | |
2141 | +/* | |
2142 | + * Throttle direct reclaimers if backing storage is backed by the network | |
2143 | + * and the PFMEMALLOC reserve for the preferred node is getting dangerously | |
2144 | + * depleted. kswapd will continue to make progress and wake the processes | |
2145 | + * when the low watermark is reached | |
2146 | + */ | |
2147 | +static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | |
2148 | + nodemask_t *nodemask) | |
2149 | +{ | |
2150 | + struct zone *zone; | |
2151 | + int high_zoneidx = gfp_zone(gfp_mask); | |
2152 | + pg_data_t *pgdat; | |
2153 | + | |
2154 | + /* | |
2155 | + * Kernel threads should not be throttled as they may be indirectly | |
2156 | + * responsible for cleaning pages necessary for reclaim to make forward | |
2157 | + * progress. kjournald for example may enter direct reclaim while | |
2158 | + * committing a transaction where throttling it could force other | |
2159 | + * processes to block on log_wait_commit(). | |
2160 | + */ | |
2161 | + if (current->flags & PF_KTHREAD) | |
2162 | + return; | |
2163 | + | |
2164 | + /* Check if the pfmemalloc reserves are ok */ | |
2165 | + first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); | |
2166 | + pgdat = zone->zone_pgdat; | |
2167 | + if (pfmemalloc_watermark_ok(pgdat)) | |
2168 | + return; | |
2169 | + | |
2170 | + /* | |
2171 | + * If the caller cannot enter the filesystem, it's possible that it | |
2172 | + * is due to the caller holding an FS lock or performing a journal | |
2173 | + * transaction in the case of a filesystem like ext[3|4]. In this case, | |
2174 | + * it is not safe to block on pfmemalloc_wait as kswapd could be | |
2175 | + * blocked waiting on the same lock. Instead, throttle for up to a | |
2176 | + * second before continuing. | |
2177 | + */ | |
2178 | + if (!(gfp_mask & __GFP_FS)) { | |
2179 | + wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, | |
2180 | + pfmemalloc_watermark_ok(pgdat), HZ); | |
2181 | + return; | |
2182 | + } | |
2183 | + | |
2184 | + /* Throttle until kswapd wakes the process */ | |
2185 | + wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, | |
2186 | + pfmemalloc_watermark_ok(pgdat)); | |
2187 | +} | |
2188 | + | |
2115 | 2189 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
2116 | 2190 | gfp_t gfp_mask, nodemask_t *nodemask) |
2117 | 2191 | { |
... | ... | @@ -2131,6 +2205,15 @@ |
2131 | 2205 | .gfp_mask = sc.gfp_mask, |
2132 | 2206 | }; |
2133 | 2207 | |
2208 | + throttle_direct_reclaim(gfp_mask, zonelist, nodemask); | |
2209 | + | |
2210 | + /* | |
2211 | + * Do not enter reclaim if fatal signal is pending. 1 is returned so | |
2212 | + * that the page allocator does not consider triggering OOM | |
2213 | + */ | |
2214 | + if (fatal_signal_pending(current)) | |
2215 | + return 1; | |
2216 | + | |
2134 | 2217 | trace_mm_vmscan_direct_reclaim_begin(order, |
2135 | 2218 | sc.may_writepage, |
2136 | 2219 | gfp_mask); |
... | ... | @@ -2275,8 +2358,13 @@ |
2275 | 2358 | return balanced_pages >= (present_pages >> 2); |
2276 | 2359 | } |
2277 | 2360 | |
2278 | -/* is kswapd sleeping prematurely? */ | |
2279 | -static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |
2361 | +/* | |
2362 | + * Prepare kswapd for sleeping. This verifies that there are no processes | |
2363 | + * waiting in throttle_direct_reclaim() and that watermarks have been met. | |
2364 | + * | |
2365 | + * Returns true if kswapd is ready to sleep | |
2366 | + */ | |
2367 | +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |
2280 | 2368 | int classzone_idx) |
2281 | 2369 | { |
2282 | 2370 | int i; |
2283 | 2371 | |
... | ... | @@ -2285,8 +2373,22 @@ |
2285 | 2373 | |
2286 | 2374 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
2287 | 2375 | if (remaining) |
2288 | - return true; | |
2376 | + return false; | |
2289 | 2377 | |
2378 | + /* | |
2379 | + * There is a potential race between when kswapd checks its watermarks | |
2380 | + * and a process gets throttled. There is also a potential race if | |
2381 | + * processes get throttled, kswapd wakes, a large process exits thereby | |
2382 | + * balancing the zones that causes kswapd to miss a wakeup. If kswapd | |
2383 | + * is going to sleep, no process should be sleeping on pfmemalloc_wait | |
2384 | + * so wake them now if necessary. If necessary, processes will wake | |
2385 | + * kswapd and get throttled again | |
2386 | + */ | |
2387 | + if (waitqueue_active(&pgdat->pfmemalloc_wait)) { | |
2388 | + wake_up(&pgdat->pfmemalloc_wait); | |
2389 | + return false; | |
2390 | + } | |
2391 | + | |
2290 | 2392 | /* Check the watermark levels */ |
2291 | 2393 | for (i = 0; i <= classzone_idx; i++) { |
2292 | 2394 | struct zone *zone = pgdat->node_zones + i; |
2293 | 2395 | |
... | ... | @@ -2318,9 +2420,9 @@ |
2318 | 2420 | * must be balanced |
2319 | 2421 | */ |
2320 | 2422 | if (order) |
2321 | - return !pgdat_balanced(pgdat, balanced, classzone_idx); | |
2423 | + return pgdat_balanced(pgdat, balanced, classzone_idx); | |
2322 | 2424 | else |
2323 | - return !all_zones_ok; | |
2425 | + return all_zones_ok; | |
2324 | 2426 | } |
2325 | 2427 | |
2326 | 2428 | /* |
... | ... | @@ -2546,6 +2648,16 @@ |
2546 | 2648 | } |
2547 | 2649 | |
2548 | 2650 | } |
2651 | + | |
2652 | + /* | |
2653 | + * If the low watermark is met there is no need for processes | |
2654 | + * to be throttled on pfmemalloc_wait as they should now be | |
2655 | + * able to safely make forward progress. Wake them | |
2656 | + */ | |
2657 | + if (waitqueue_active(&pgdat->pfmemalloc_wait) && | |
2658 | + pfmemalloc_watermark_ok(pgdat)) | |
2659 | + wake_up(&pgdat->pfmemalloc_wait); | |
2660 | + | |
2549 | 2661 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
2550 | 2662 | break; /* kswapd: all done */ |
2551 | 2663 | /* |
... | ... | @@ -2647,7 +2759,7 @@ |
2647 | 2759 | } |
2648 | 2760 | |
2649 | 2761 | /* |
2650 | - * Return the order we were reclaiming at so sleeping_prematurely() | |
2762 | + * Return the order we were reclaiming at so prepare_kswapd_sleep() | |
2651 | 2763 | * makes a decision on the order we were last reclaiming at. However, |
2652 | 2764 | * if another caller entered the allocator slow path while kswapd |
2653 | 2765 | * was awake, order will remain at the higher level |
... | ... | @@ -2667,7 +2779,7 @@ |
2667 | 2779 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
2668 | 2780 | |
2669 | 2781 | /* Try to sleep for a short interval */ |
2670 | - if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | |
2782 | + if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { | |
2671 | 2783 | remaining = schedule_timeout(HZ/10); |
2672 | 2784 | finish_wait(&pgdat->kswapd_wait, &wait); |
2673 | 2785 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
... | ... | @@ -2677,7 +2789,7 @@ |
2677 | 2789 | * After a short sleep, check if it was a premature sleep. If not, then |
2678 | 2790 | * go fully to sleep until explicitly woken up. |
2679 | 2791 | */ |
2680 | - if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | |
2792 | + if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { | |
2681 | 2793 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); |
2682 | 2794 | |
2683 | 2795 | /* |
-
mentioned in commit 18d930
-
mentioned in commit 18d930
-
mentioned in commit 18d930
-
mentioned in commit 9e5e36
-
mentioned in commit 9e5e36
-
mentioned in commit 9e5e36
-
mentioned in commit 9e5e36
-
mentioned in commit 9e5e36
-
mentioned in commit 9e5e36
-
mentioned in commit 9e5e36
-
mentioned in commit 9e5e36
-
mentioned in commit 9e5e36
-
mentioned in commit 9e5e36
-
mentioned in commit 9e5e36
-
mentioned in commit 9e5e36
-
mentioned in commit 9e5e36