Commit e27fc9641e8ddc8146f8e01f06e5eba2469698de

Authored by Tejun Heo
Committed by Paul E. McKenney
1 parent 46fdb0937f

rcu: increase synchronize_sched_expedited() batching

The fix in commit #6a0cc49 requires more than three concurrent instances
of synchronize_sched_expedited() before batching is possible.  This
patch uses a ticket-counter-like approach, similar in spirit to Lai
Jiangshan's Ring RCU, to allow sharing of expedited grace periods even
when there are only two concurrent instances of
synchronize_sched_expedited().
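
A minimal stand-alone C11 sketch of the started/done ticket pair follows.
It is illustrative only: expedited_started, expedited_done,
expedited_sync_sketch(), and force_grace_period() are made-up names,
force_grace_period() merely stands in for try_stop_cpus(), and the
explicit smp_mb() calls in the patch are subsumed here by the default
sequentially consistent C11 atomics.

#include <limits.h>
#include <stdatomic.h>

#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))

/* Stand-ins for the kernel machinery; illustrative only. */
static atomic_uint expedited_started = 0;
static atomic_uint expedited_done = 0;

/* Hypothetical stand-in for try_stop_cpus(): pretend a grace period
 * was forced successfully.  Returns 0 on success, nonzero on failure. */
static int force_grace_period(void)
{
	return 0;
}

void expedited_sync_sketch(void)
{
	unsigned int firstsnap, snap, s;

	/* Take a ticket: every caller gets a unique "started" value. */
	firstsnap = snap = atomic_fetch_add(&expedited_started, 1) + 1;

	while (force_grace_period() != 0) {
		/* Did someone finish a grace period after we took our ticket? */
		s = atomic_load(&expedited_done);
		if (UINT_CMP_GE(s, firstsnap))
			return;	/* their grace period covers us, too */

		/* Refresh snap so later arrivals can share our grace period. */
		snap = atomic_load(&expedited_started) - 1;
	}

	/* Advance "done" to our ticket unless someone already went further. */
	do {
		s = atomic_load(&expedited_done);
		if (UINT_CMP_GE(s, snap))
			break;
	} while (!atomic_compare_exchange_weak(&expedited_done, &s, snap));
}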

This commit builds on Tejun's original posting, which may be found at
http://lkml.org/lkml/2010/11/9/204, adding memory barriers, avoiding
overflow of signed integers (other than via atomic_t), and fixing the
detection of batching.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Showing 2 changed files with 64 additions and 20 deletions

include/linux/rcupdate.h
... ... @@ -47,6 +47,8 @@
47 47 extern int rcutorture_runnable; /* for sysctl */
48 48 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
49 49  
  50 +#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
  51 +#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
50 52 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
51 53 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
52 54  
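These macros compare free-running counters modulo wraparound: (a) is
treated as having reached (b) whenever the unsigned difference (a) - (b)
is no more than UINT_MAX / 2, so the comparison keeps working after a
counter wraps past UINT_MAX.  A quick stand-alone check (illustrative,
not part of the patch):

#include <assert.h>
#include <limits.h>

#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))

int main(void)
{
	/* Ordinary case: 10 is ahead of 7. */
	assert(UINT_CMP_GE(10u, 7u));
	assert(UINT_CMP_LT(7u, 10u));

	/*
	 * Wrapped case: the counter advanced from UINT_MAX - 1 past zero
	 * to 2.  Numerically 2 < UINT_MAX - 1, but 2u - (UINT_MAX - 1u)
	 * is 3, well under UINT_MAX / 2, so 2 still compares as "ahead".
	 */
	assert(UINT_CMP_GE(2u, UINT_MAX - 1u));
	assert(UINT_CMP_LT(UINT_MAX - 1u, 2u));
	return 0;
}
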
kernel/rcutree_plugin.h
... ... @@ -1025,7 +1025,8 @@
1025 1025  
1026 1026 #else /* #ifndef CONFIG_SMP */
1027 1027  
1028   -static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
  1028 +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
  1029 +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1029 1030  
1030 1031 static int synchronize_sched_expedited_cpu_stop(void *data)
1031 1032 {
... ... @@ -1041,8 +1042,6 @@
1041 1042 * robustness against future implementation changes.
1042 1043 */
1043 1044 smp_mb(); /* See above comment block. */
1044   - if (cpumask_first(cpu_online_mask) == smp_processor_id())
1045   - atomic_inc(&synchronize_sched_expedited_count);
1046 1045 return 0;
1047 1046 }
1048 1047  
1049 1048  
1050 1049  
1051 1050  
1052 1051  
1053 1052  
1054 1053  
1055 1054  
1056 1055  
... ... @@ -1056,43 +1055,86 @@
1056 1055 * lock that is acquired by a CPU-hotplug notifier. Failing to
1057 1056 * observe this restriction will result in deadlock.
1058 1057 *
1059   - * The synchronize_sched_expedited_cpu_stop() function is called
1060   - * in stop-CPU context, but in order to keep overhead down to a dull
1061   - * roar, we don't force this function to wait for its counterparts
1062   - * on other CPUs. One instance of this function will increment the
1063   - * synchronize_sched_expedited_count variable per call to
1064   - * try_stop_cpus(), but there is no guarantee what order this instance
1065   - * will occur in. The worst case is that it is last on one call
1066   - * to try_stop_cpus(), and the first on the next call. This means
1067   - * that piggybacking requires that synchronize_sched_expedited_count
1068   - * be incremented by 3: this guarantees that the piggybacking
1069   - * task has waited through an entire cycle of context switches,
1070   - * even in the worst case.
  1058 + * This implementation can be thought of as an application of ticket
  1059 + * locking to RCU, with sync_sched_expedited_started and
  1060 + * sync_sched_expedited_done taking on the roles of the halves
  1061 + * of the ticket-lock word. Each task atomically increments
  1062 + * sync_sched_expedited_started upon entry, snapshotting the old value,
  1063 + * then attempts to stop all the CPUs. If this succeeds, then each
  1064 + * CPU will have executed a context switch, resulting in an RCU-sched
  1065 + * grace period. We are then done, so we use atomic_cmpxchg() to
  1066 + * update sync_sched_expedited_done to match our snapshot -- but
  1067 + * only if someone else has not already advanced past our snapshot.
  1068 + *
  1069 + * On the other hand, if try_stop_cpus() fails, we check the value
  1070 + * of sync_sched_expedited_done. If it has advanced past our
  1071 + * initial snapshot, then someone else must have forced a grace period
  1072 + * some time after we took our snapshot. In this case, our work is
  1073 + * done for us, and we can simply return. Otherwise, we try again,
  1074 + * but keep our initial snapshot for purposes of checking for someone
  1075 + * doing our work for us.
  1076 + *
  1077 + * If we fail too many times in a row, we fall back to synchronize_sched().
1071 1078 */
1072 1079 void synchronize_sched_expedited(void)
1073 1080 {
1074   - int snap, trycount = 0;
  1081 + int firstsnap, s, snap, trycount = 0;
1075 1082  
1076   - smp_mb(); /* ensure prior mod happens before capturing snap. */
1077   - snap = atomic_read(&synchronize_sched_expedited_count) + 2;
  1083 + /* Note that atomic_inc_return() implies full memory barrier. */
  1084 + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1078 1085 get_online_cpus();
  1086 +
  1087 + /*
  1088 + * Each pass through the following loop attempts to force a
  1089 + * context switch on each CPU.
  1090 + */
1079 1091 while (try_stop_cpus(cpu_online_mask,
1080 1092 synchronize_sched_expedited_cpu_stop,
1081 1093 NULL) == -EAGAIN) {
1082 1094 put_online_cpus();
  1095 +
  1096 + /* No joy, try again later. Or just synchronize_sched(). */
1083 1097 if (trycount++ < 10)
1084 1098 udelay(trycount * num_online_cpus());
1085 1099 else {
1086 1100 synchronize_sched();
1087 1101 return;
1088 1102 }
1089   - if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
  1103 +
  1104 + /* Check to see if someone else did our work for us. */
  1105 + s = atomic_read(&sync_sched_expedited_done);
  1106 + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1090 1107 smp_mb(); /* ensure test happens before caller kfree */
1091 1108 return;
1092 1109 }
  1110 +
  1111 + /*
  1112 + * Refetching sync_sched_expedited_started allows later
  1113 + * callers to piggyback on our grace period. We subtract
  1114 + * 1 to get the same token that the last incrementer got.
  1115 + * We retry after they started, so our grace period works
  1116 + * for them, and they started after our first try, so their
  1117 + * grace period works for us.
  1118 + */
1093 1119 get_online_cpus();
  1120 + snap = atomic_read(&sync_sched_expedited_started) - 1;
  1121 + smp_mb(); /* ensure read is before try_stop_cpus(). */
1094 1122 }
1095   - smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
  1123 +
  1124 + /*
  1125 + * Everyone up to our most recent fetch is covered by our grace
  1126 + * period. Update the counter, but only if our work is still
  1127 + * relevant -- which it won't be if someone who started later
  1128 + * than we did beat us to the punch.
  1129 + */
  1130 + do {
  1131 + s = atomic_read(&sync_sched_expedited_done);
  1132 + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
  1133 + smp_mb(); /* ensure test happens before caller kfree */
  1134 + break;
  1135 + }
  1136 + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
  1137 +
1096 1138 put_online_cpus();
1097 1139 }
1098 1140 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
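
The "ensure test happens before caller kfree" barriers above exist for
the usual update-side pattern: a caller unpublishes an element, waits
for the expedited grace period, and only then frees it.  A hedged
sketch of such a caller follows; struct foo, foo_list, foo_lock, and
remove_foo() are hypothetical, and only synchronize_sched_expedited()
itself comes from the code above.

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* Hypothetical data structure protected by RCU-sched. */
struct foo {
	struct list_head list;
	int data;
};

static LIST_HEAD(foo_list);
static DEFINE_SPINLOCK(foo_lock);

static void remove_foo(struct foo *p)
{
	spin_lock(&foo_lock);
	list_del_rcu(&p->list);		/* unpublish the element */
	spin_unlock(&foo_lock);

	/* Wait, with low latency, for all pre-existing readers to finish. */
	synchronize_sched_expedited();

	kfree(p);			/* no reader can still hold a reference */
}

Readers of foo_list would run with preemption disabled (for example,
under rcu_read_lock_sched()), which is why forcing a context switch on
every CPU constitutes a full RCU-sched grace period.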