vmscan: shrinker->nr updates race and go wrong

shrink_slab() allows shrinkers to be called in parallel so the struct shrinker can be updated concurrently. It does not provide any exclusio for such updates, so we can get the shrinker->nr value increasing or decreasing incorrectly. As a result, when a shrinker repeatedly returns a value of -1 (e.g. a VFS shrinker called w/ GFP_NOFS), the shrinker->nr goes haywire, sometimes updating with the scan count that wasn't used, sometimes losing it altogether. Worse is when a shrinker does work and that update is lost due to racy updates, which means the shrinker will do the work again! Fix this by making the total_scan calculations independent of shrinker->nr, and making the shrinker->nr updates atomic w.r.t. to other updates via cmpxchg loops. Signed-off-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

vmscan: shrinker->nr updates race and go wrong
shrink_slab() allows shrinkers to be called in parallel so the struct shrinker can be updated concurrently. It does not provide any exclusio for such updates, so we can get the shrinker->nr value increasing or decreasing incorrectly. As a result, when a shrinker repeatedly returns a value of -1 (e.g. a VFS shrinker called w/ GFP_NOFS), the shrinker->nr goes haywire, sometimes updating with the scan count that wasn't used, sometimes losing it altogether. Worse is when a shrinker does work and that update is lost due to racy updates, which means the shrinker will do the work again! Fix this by making the total_scan calculations independent of shrinker->nr, and making the shrinker->nr updates atomic w.r.t. to other updates via cmpxchg loops. Signed-off-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Dave Chinner · Al Viro
1 parent 095760730c
Showing 1 changed file with 32 additions and 13 deletions Side-by-side Diff
mm/vmscan.c
@@ -251,17 +251,29 @@
 		unsigned long total_scan;
 		unsigned long max_pass;
 		int shrink_ret = 0;
+		long nr;
+		long new_nr;
  
+		/*
+		 * copy the current shrinker scan count into a local variable
+		 * and zero it so that other concurrent shrinker invocations
+		 * don't also do this scanning work.
+		 */
+		do {
+			nr = shrinker->nr;
+		} while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+
+		total_scan = nr;
 		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
 		delta = (4 * nr_pages_scanned) / shrinker->seeks;
 		delta *= max_pass;
 		do_div(delta, lru_pages + 1);
-		shrinker->nr += delta;
-		if (shrinker->nr < 0) {
+		total_scan += delta;
+		if (total_scan < 0) {
 			printk(KERN_ERR "shrink_slab: %pF negative objects to "
 			       "delete nr=%ld\n",
-			       shrinker->shrink, shrinker->nr);
-			shrinker->nr = max_pass;
+			       shrinker->shrink, total_scan);
+			total_scan = max_pass;
 		}
  
 		/*
  
@@ -269,13 +281,10 @@
 		 * never try to free more than twice the estimate number of
 		 * freeable entries.
 		 */
-		if (shrinker->nr > max_pass * 2)
-			shrinker->nr = max_pass * 2;
+		if (total_scan > max_pass * 2)
+			total_scan = max_pass * 2;
  
-		total_scan = shrinker->nr;
-		shrinker->nr = 0;
-
-		trace_mm_shrink_slab_start(shrinker, shrink, total_scan,
+		trace_mm_shrink_slab_start(shrinker, shrink, nr,
 					nr_pages_scanned, lru_pages,
 					max_pass, delta, total_scan);
  
@@ -296,9 +305,19 @@
 			cond_resched();
 		}
  
-		shrinker->nr += total_scan;
-		trace_mm_shrink_slab_end(shrinker, shrink_ret, total_scan,
-					 shrinker->nr);
+		/*
+		 * move the unused scan count back into the shrinker in a
+		 * manner that handles concurrent updates. If we exhausted the
+		 * scan, there is no need to do an update.
+		 */
+		do {
+			nr = shrinker->nr;
+			new_nr = total_scan + nr;
+			if (total_scan <= 0)
+				break;
+		} while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
 	}
 	up_read(&shrinker_rwsem);
 out:
...	...	@@ -251,17 +251,29 @@
251	251	unsigned long total_scan;
252	252	unsigned long max_pass;
253	253	int shrink_ret = 0;
	254	+ long nr;
	255	+ long new_nr;
254	256
	257	+ /*
	258	+ * copy the current shrinker scan count into a local variable
	259	+ * and zero it so that other concurrent shrinker invocations
	260	+ * don't also do this scanning work.
	261	+ */
	262	+ do {
	263	+ nr = shrinker->nr;
	264	+ } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
	265	+
	266	+ total_scan = nr;
255	267	max_pass = do_shrinker_shrink(shrinker, shrink, 0);
256	268	delta = (4 * nr_pages_scanned) / shrinker->seeks;
257	269	delta *= max_pass;
258	270	do_div(delta, lru_pages + 1);
259		- shrinker->nr += delta;
260		- if (shrinker->nr < 0) {
	271	+ total_scan += delta;
	272	+ if (total_scan < 0) {
261	273	printk(KERN_ERR "shrink_slab: %pF negative objects to "
262	274	"delete nr=%ld\n",
263		- shrinker->shrink, shrinker->nr);
264		- shrinker->nr = max_pass;
	275	+ shrinker->shrink, total_scan);
	276	+ total_scan = max_pass;
265	277	}
266	278
267	279	/*
268	280
...	...	@@ -269,13 +281,10 @@
269	281	* never try to free more than twice the estimate number of
270	282	* freeable entries.
271	283	*/
272		- if (shrinker->nr > max_pass * 2)
273		- shrinker->nr = max_pass * 2;
	284	+ if (total_scan > max_pass * 2)
	285	+ total_scan = max_pass * 2;
274	286
275		- total_scan = shrinker->nr;
276		- shrinker->nr = 0;
277		-
278		- trace_mm_shrink_slab_start(shrinker, shrink, total_scan,
	287	+ trace_mm_shrink_slab_start(shrinker, shrink, nr,
279	288	nr_pages_scanned, lru_pages,
280	289	max_pass, delta, total_scan);
281	290
...	...	@@ -296,9 +305,19 @@
296	305	cond_resched();
297	306	}
298	307
299		- shrinker->nr += total_scan;
300		- trace_mm_shrink_slab_end(shrinker, shrink_ret, total_scan,
301		- shrinker->nr);
	308	+ /*
	309	+ * move the unused scan count back into the shrinker in a
	310	+ * manner that handles concurrent updates. If we exhausted the
	311	+ * scan, there is no need to do an update.
	312	+ */
	313	+ do {
	314	+ nr = shrinker->nr;
	315	+ new_nr = total_scan + nr;
	316	+ if (total_scan <= 0)
	317	+ break;
	318	+ } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
	319	+
	320	+ trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
302	321	}
303	322	up_read(&shrinker_rwsem);
304	323	out: