Commit e27fc9641e8ddc8146f8e01f06e5eba2469698de

Authored by Tejun Heo
Committed by Paul E. McKenney
1 parent 46fdb0937f

rcu: increase synchronize_sched_expedited() batching

The fix in commit #6a0cc49 requires more than three concurrent instances
of synchronize_sched_expedited() before batching is possible.  This
patch uses a ticket-counter-like approach, similar in spirit to Lai
Jiangshan's Ring RCU, to allow sharing of expedited grace periods even
when there are only two concurrent instances of
synchronize_sched_expedited().
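
A minimal stand-alone C11 sketch of the started/done ticket pair follows.
It is illustrative only: expedited_started, expedited_done,
expedited_sync_sketch(), and force_grace_period() are made-up names,
force_grace_period() merely stands in for try_stop_cpus(), and the
explicit smp_mb() calls in the patch are subsumed here by the default
sequentially consistent C11 atomics.

#include <limits.h>
#include <stdatomic.h>

#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))

/* Stand-ins for the kernel machinery; illustrative only. */
static atomic_uint expedited_started = 0;
static atomic_uint expedited_done = 0;

/* Hypothetical stand-in for try_stop_cpus(): pretend a grace period
 * was forced successfully.  Returns 0 on success, nonzero on failure. */
static int force_grace_period(void)
{
	return 0;
}

void expedited_sync_sketch(void)
{
	unsigned int firstsnap, snap, s;

	/* Take a ticket: every caller gets a unique "started" value. */
	firstsnap = snap = atomic_fetch_add(&expedited_started, 1) + 1;

	while (force_grace_period() != 0) {
		/* Did someone finish a grace period after we took our ticket? */
		s = atomic_load(&expedited_done);
		if (UINT_CMP_GE(s, firstsnap))
			return;	/* their grace period covers us, too */

		/* Refresh snap so later arrivals can share our grace period. */
		snap = atomic_load(&expedited_started) - 1;
	}

	/* Advance "done" to our ticket unless someone already went further. */
	do {
		s = atomic_load(&expedited_done);
		if (UINT_CMP_GE(s, snap))
			break;
	} while (!atomic_compare_exchange_weak(&expedited_done, &s, snap));
}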

This commit builds on Tejun's original posting, which may be found at
http://lkml.org/lkml/2010/11/9/204, adding memory barriers, avoiding
overflow of signed integers (other than via atomic_t), and fixing the
detection of batching.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Showing 2 changed files with 64 additions and 20 deletions

include/linux/rcupdate.h
... ... @@ -47,6 +47,8 @@
47 47 extern int rcutorture_runnable; /* for sysctl */
48 48 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
49 49  
  50 +#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
  51 +#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
50 52 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
51 53 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
52 54  
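These macros compare free-running counters modulo wraparound: (a) is
treated as having reached (b) whenever the unsigned difference (a) - (b)
is no more than UINT_MAX / 2, so the comparison keeps working after a
counter wraps past UINT_MAX.  A quick stand-alone check (illustrative,
not part of the patch):

#include <assert.h>
#include <limits.h>

#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))

int main(void)
{
	/* Ordinary case: 10 is ahead of 7. */
	assert(UINT_CMP_GE(10u, 7u));
	assert(UINT_CMP_LT(7u, 10u));

	/*
	 * Wrapped case: the counter advanced from UINT_MAX - 1 past zero
	 * to 2.  Numerically 2 < UINT_MAX - 1, but 2u - (UINT_MAX - 1u)
	 * is 3, well under UINT_MAX / 2, so 2 still compares as "ahead".
	 */
	assert(UINT_CMP_GE(2u, UINT_MAX - 1u));
	assert(UINT_CMP_LT(UINT_MAX - 1u, 2u));
	return 0;
}
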
kernel/rcutree_plugin.h
... ... @@ -1025,7 +1025,8 @@
1025 1025  
1026 1026 #else /* #ifndef CONFIG_SMP */
1027 1027  
1028   -static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
  1028 +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
  1029 +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1029 1030  
1030 1031 static int synchronize_sched_expedited_cpu_stop(void *data)
1031 1032 {
... ... @@ -1041,8 +1042,6 @@
1041 1042 * robustness against future implementation changes.
1042 1043 */
1043 1044 smp_mb(); /* See above comment block. */
1044   - if (cpumask_first(cpu_online_mask) == smp_processor_id())
1045   - atomic_inc(&synchronize_sched_expedited_count);
1046 1045 return 0;
1047 1046 }
1048 1047  
1049 1048  
1050 1049  
1051 1050  
1052 1051  
1053 1052  
1054 1053  
1055 1054  
1056 1055  
... ... @@ -1056,43 +1055,86 @@
1056 1055 * lock that is acquired by a CPU-hotplug notifier. Failing to
1057 1056 * observe this restriction will result in deadlock.
1058 1057 *
1059   - * The synchronize_sched_expedited_cpu_stop() function is called
1060   - * in stop-CPU context, but in order to keep overhead down to a dull
1061   - * roar, we don't force this function to wait for its counterparts
1062   - * on other CPUs. One instance of this function will increment the
1063   - * synchronize_sched_expedited_count variable per call to
1064   - * try_stop_cpus(), but there is no guarantee what order this instance
1065   - * will occur in. The worst case is that it is last on one call
1066   - * to try_stop_cpus(), and the first on the next call. This means
1067   - * that piggybacking requires that synchronize_sched_expedited_count
1068   - * be incremented by 3: this guarantees that the piggybacking
1069   - * task has waited through an entire cycle of context switches,
1070   - * even in the worst case.
  1058 + * This implementation can be thought of as an application of ticket
  1059 + * locking to RCU, with sync_sched_expedited_started and
  1060 + * sync_sched_expedited_done taking on the roles of the halves
  1061 + * of the ticket-lock word. Each task atomically increments
  1062 + * sync_sched_expedited_started upon entry, snapshotting the old value,
  1063 + * then attempts to stop all the CPUs. If this succeeds, then each
  1064 + * CPU will have executed a context switch, resulting in an RCU-sched
  1065 + * grace period. We are then done, so we use atomic_cmpxchg() to
  1066 + * update sync_sched_expedited_done to match our snapshot -- but
  1067 + * only if someone else has not already advanced past our snapshot.
  1068 + *
  1069 + * On the other hand, if try_stop_cpus() fails, we check the value
  1070 + * of sync_sched_expedited_done. If it has advanced past our
  1071 + * initial snapshot, then someone else must have forced a grace period
  1072 + * some time after we took our snapshot. In this case, our work is
  1073 + * done for us, and we can simply return. Otherwise, we try again,
  1074 + * but keep our initial snapshot for purposes of checking for someone
  1075 + * doing our work for us.
  1076 + *
  1077 + * If we fail too many times in a row, we fall back to synchronize_sched().
1071 1078 */
1072 1079 void synchronize_sched_expedited(void)
1073 1080 {
1074   - int snap, trycount = 0;
  1081 + int firstsnap, s, snap, trycount = 0;
1075 1082  
1076   - smp_mb(); /* ensure prior mod happens before capturing snap. */
1077   - snap = atomic_read(&synchronize_sched_expedited_count) + 2;
  1083 + /* Note that atomic_inc_return() implies full memory barrier. */
  1084 + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1078 1085 get_online_cpus();
  1086 +
  1087 + /*
  1088 + * Each pass through the following loop attempts to force a
  1089 + * context switch on each CPU.
  1090 + */
1079 1091 while (try_stop_cpus(cpu_online_mask,
1080 1092 synchronize_sched_expedited_cpu_stop,
1081 1093 NULL) == -EAGAIN) {
1082 1094 put_online_cpus();
  1095 +
  1096 + /* No joy, try again later. Or just synchronize_sched(). */
1083 1097 if (trycount++ < 10)
1084 1098 udelay(trycount * num_online_cpus());
1085 1099 else {
1086 1100 synchronize_sched();
1087 1101 return;
1088 1102 }
1089   - if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
  1103 +
  1104 + /* Check to see if someone else did our work for us. */
  1105 + s = atomic_read(&sync_sched_expedited_done);
  1106 + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1090 1107 smp_mb(); /* ensure test happens before caller kfree */
1091 1108 return;
1092 1109 }
  1110 +
  1111 + /*
  1112 + * Refetching sync_sched_expedited_started allows later
  1113 + * callers to piggyback on our grace period. We subtract
  1114 + * 1 to get the same token that the last incrementer got.
  1115 + * We retry after they started, so our grace period works
  1116 + * for them, and they started after our first try, so their
  1117 + * grace period works for us.
  1118 + */
1093 1119 get_online_cpus();
  1120 + snap = atomic_read(&sync_sched_expedited_started) - 1;
  1121 + smp_mb(); /* ensure read is before try_stop_cpus(). */
1094 1122 }
1095   - smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
  1123 +
  1124 + /*
  1125 + * Everyone up to our most recent fetch is covered by our grace
  1126 + * period. Update the counter, but only if our work is still
  1127 + * relevant -- which it won't be if someone who started later
  1128 + * than we did beat us to the punch.
  1129 + */
  1130 + do {
  1131 + s = atomic_read(&sync_sched_expedited_done);
  1132 + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
  1133 + smp_mb(); /* ensure test happens before caller kfree */
  1134 + break;
  1135 + }
  1136 + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
  1137 +
1096 1138 put_online_cpus();
1097 1139 }
1098 1140 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
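
The "ensure test happens before caller kfree" barriers above exist for
the usual update-side pattern: a caller unpublishes an element, waits
for the expedited grace period, and only then frees it.  A hedged
sketch of such a caller follows; struct foo, foo_list, foo_lock, and
remove_foo() are hypothetical, and only synchronize_sched_expedited()
itself comes from the code above.

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* Hypothetical data structure protected by RCU-sched. */
struct foo {
	struct list_head list;
	int data;
};

static LIST_HEAD(foo_list);
static DEFINE_SPINLOCK(foo_lock);

static void remove_foo(struct foo *p)
{
	spin_lock(&foo_lock);
	list_del_rcu(&p->list);		/* unpublish the element */
	spin_unlock(&foo_lock);

	/* Wait, with low latency, for all pre-existing readers to finish. */
	synchronize_sched_expedited();

	kfree(p);			/* no reader can still hold a reference */
}

Readers of foo_list would run with preemption disabled (for example,
under rcu_read_lock_sched()), which is why forcing a context switch on
every CPU constitutes a full RCU-sched grace period.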