Commit e27fc9641e8ddc8146f8e01f06e5eba2469698de
Committed by Paul E. McKenney
1 parent 46fdb0937f
Exists in master and in 4 other branches
rcu: increase synchronize_sched_expedited() batching
The fix in commit #6a0cc49 requires more than three concurrent instances of synchronize_sched_expedited() before batching is possible. This patch uses a ticket-counter-like approach that is also not unrelated to Lai Jiangshan's Ring RCU to allow sharing of expedited grace periods even when there are only two concurrent instances of synchronize_sched_expedited().

This commit builds on Tejun's original posting, which may be found at http://lkml.org/lkml/2010/11/9/204, adding memory barriers, avoiding overflow of signed integers (other than via atomic_t), and fixing the detection of batching.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
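The ticket-lock analogy the message leans on (and that the new header comment in the patch below spells out) can be grounded with a minimal sketch that is not part of this patch: a conventional ticket lock keeps a "next ticket" counter and a "now serving" counter, which play the same roles that sync_sched_expedited_started and sync_sched_expedited_done take on here. The struct and function names below are illustrative only.

```c
/* A conventional ticket lock, shown only to ground the analogy above.
 * "next" corresponds to sync_sched_expedited_started and "serving" to
 * sync_sched_expedited_done.  Not part of the patch. */
#include <stdatomic.h>

struct ticket_lock {
	atomic_uint next;	/* next ticket to hand out */
	atomic_uint serving;	/* ticket currently being served */
};

static void ticket_lock_acquire(struct ticket_lock *l)
{
	/* Take a ticket; the counterpart of atomic_inc_return(&...started). */
	unsigned int me = atomic_fetch_add(&l->next, 1);

	/* Wait for the "done" half of the word to catch up to our ticket. */
	while (atomic_load(&l->serving) != me)
		;	/* spin */
}

static void ticket_lock_release(struct ticket_lock *l)
{
	/* Advance "serving"; the counterpart of updating ...done. */
	atomic_fetch_add(&l->serving, 1);
}

int main(void)
{
	struct ticket_lock l = { 0 };

	ticket_lock_acquire(&l);	/* ticket 0 is served immediately */
	ticket_lock_release(&l);
	return 0;
}
```

The important difference from a real lock is that synchronize_sched_expedited() never spins waiting for its turn: it only checks whether the "done" side has already advanced past its ticket, in which case someone else's grace period has done its work for it.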
Showing 2 changed files with 64 additions and 20 deletions
include/linux/rcupdate.h
... | ... | @@ -47,6 +47,8 @@ |
47 | 47 | extern int rcutorture_runnable; /* for sysctl */ |
48 | 48 | #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ |
49 | 49 | |
50 | +#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) | |
51 | +#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) | |
50 | 52 | #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) |
51 | 53 | #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) |
52 | 54 |
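The two new macros make later comparisons of the started/done counters safe across unsigned wraparound. A standalone check of the behavior near the wrap point (an illustration, not part of the patch; the variable names mirror the ones used in synchronize_sched_expedited() below):

```c
#include <assert.h>
#include <limits.h>

#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned int firstsnap = UINT_MAX;	/* snapshot taken just before the counter wraps */
	unsigned int s = firstsnap + 3;		/* "done" has since advanced past the wrap, to 2 */

	/* A naive "s >= firstsnap" is false here even though s is logically
	 * later; the macros stay correct as long as the two values are
	 * within UINT_MAX/2 of each other. */
	assert(!(s >= firstsnap));
	assert(UINT_CMP_GE(s, firstsnap));
	assert(UINT_CMP_LT(firstsnap, s));
	return 0;
}
```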
kernel/rcutree_plugin.h
... | ... | @@ -1025,7 +1025,8 @@ |
1025 | 1025 | |
1026 | 1026 | #else /* #ifndef CONFIG_SMP */ |
1027 | 1027 | |
1028 | -static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); | |
1028 | +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); | |
1029 | +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); | |
1029 | 1030 | |
1030 | 1031 | static int synchronize_sched_expedited_cpu_stop(void *data) |
1031 | 1032 | { |
... | ... | @@ -1041,8 +1042,6 @@ |
1041 | 1042 | * robustness against future implementation changes. |
1042 | 1043 | */ |
1043 | 1044 | smp_mb(); /* See above comment block. */ |
1044 | - if (cpumask_first(cpu_online_mask) == smp_processor_id()) | |
1045 | - atomic_inc(&synchronize_sched_expedited_count); | |
1046 | 1045 | return 0; |
1047 | 1046 | } |
1048 | 1047 | |
... | ... | @@ -1056,43 +1055,86 @@ |
1056 | 1055 | * lock that is acquired by a CPU-hotplug notifier. Failing to |
1057 | 1056 | * observe this restriction will result in deadlock. |
1058 | 1057 | * |
1059 | - * The synchronize_sched_expedited_cpu_stop() function is called | |
1060 | - * in stop-CPU context, but in order to keep overhead down to a dull | |
1061 | - * roar, we don't force this function to wait for its counterparts | |
1062 | - * on other CPUs. One instance of this function will increment the | |
1063 | - * synchronize_sched_expedited_count variable per call to | |
1064 | - * try_stop_cpus(), but there is no guarantee what order this instance | |
1065 | - * will occur in. The worst case is that it is last on one call | |
1066 | - * to try_stop_cpus(), and the first on the next call. This means | |
1067 | - * that piggybacking requires that synchronize_sched_expedited_count | |
1068 | - * be incremented by 3: this guarantees that the piggybacking | |
1069 | - * task has waited through an entire cycle of context switches, | |
1070 | - * even in the worst case. | |
1058 | + * This implementation can be thought of as an application of ticket | |
1059 | + * locking to RCU, with sync_sched_expedited_started and | |
1060 | + * sync_sched_expedited_done taking on the roles of the halves | |
1061 | + * of the ticket-lock word. Each task atomically increments | |
1062 | + * sync_sched_expedited_started upon entry, snapshotting the old value, | |
1063 | + * then attempts to stop all the CPUs. If this succeeds, then each | |
1064 | + * CPU will have executed a context switch, resulting in an RCU-sched | |
1065 | + * grace period. We are then done, so we use atomic_cmpxchg() to | |
1066 | + * update sync_sched_expedited_done to match our snapshot -- but | |
1067 | + * only if someone else has not already advanced past our snapshot. | |
1068 | + * | |
1069 | + * On the other hand, if try_stop_cpus() fails, we check the value | |
1070 | + * of sync_sched_expedited_done. If it has advanced past our | |
1071 | + * initial snapshot, then someone else must have forced a grace period | |
1072 | + * some time after we took our snapshot. In this case, our work is | |
1073 | + * done for us, and we can simply return. Otherwise, we try again, | |
1074 | + * but keep our initial snapshot for purposes of checking for someone | |
1075 | + * doing our work for us. | |
1076 | + * | |
1077 | + * If we fail too many times in a row, we fall back to synchronize_sched(). | |
1071 | 1078 | */ |
1072 | 1079 | void synchronize_sched_expedited(void) |
1073 | 1080 | { |
1074 | - int snap, trycount = 0; | |
1081 | + int firstsnap, s, snap, trycount = 0; | |
1075 | 1082 | |
1076 | - smp_mb(); /* ensure prior mod happens before capturing snap. */ | |
1077 | - snap = atomic_read(&synchronize_sched_expedited_count) + 2; | |
1083 | + /* Note that atomic_inc_return() implies full memory barrier. */ | |
1084 | + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); | |
1078 | 1085 | get_online_cpus(); |
1086 | + | |
1087 | + /* | |
1088 | + * Each pass through the following loop attempts to force a | |
1089 | + * context switch on each CPU. | |
1090 | + */ | |
1079 | 1091 | while (try_stop_cpus(cpu_online_mask, |
1080 | 1092 | synchronize_sched_expedited_cpu_stop, |
1081 | 1093 | NULL) == -EAGAIN) { |
1082 | 1094 | put_online_cpus(); |
1095 | + | |
1096 | + /* No joy, try again later. Or just synchronize_sched(). */ | |
1083 | 1097 | if (trycount++ < 10) |
1084 | 1098 | udelay(trycount * num_online_cpus()); |
1085 | 1099 | else { |
1086 | 1100 | synchronize_sched(); |
1087 | 1101 | return; |
1088 | 1102 | } |
1089 | - if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { | |
1103 | + | |
1104 | + /* Check to see if someone else did our work for us. */ | |
1105 | + s = atomic_read(&sync_sched_expedited_done); | |
1106 | + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { | |
1090 | 1107 | smp_mb(); /* ensure test happens before caller kfree */ |
1091 | 1108 | return; |
1092 | 1109 | } |
1110 | + | |
1111 | + /* | |
1112 | + * Refetching sync_sched_expedited_started allows later | |
1113 | + * callers to piggyback on our grace period. We subtract | |
1114 | + * 1 to get the same token that the last incrementer got. | |
1115 | + * We retry after they started, so our grace period works | |
1116 | + * for them, and they started after our first try, so their | |
1117 | + * grace period works for us. | |
1118 | + */ | |
1093 | 1119 | get_online_cpus(); |
1120 | + snap = atomic_read(&sync_sched_expedited_started) - 1; | |
1121 | + smp_mb(); /* ensure read is before try_stop_cpus(). */ | |
1094 | 1122 | } |
1095 | - smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ | |
1123 | + | |
1124 | + /* | |
1125 | + * Everyone up to our most recent fetch is covered by our grace | |
1126 | + * period. Update the counter, but only if our work is still | |
1127 | + * relevant -- which it won't be if someone who started later | |
1128 | + * than we did beat us to the punch. | |
1129 | + */ | |
1130 | + do { | |
1131 | + s = atomic_read(&sync_sched_expedited_done); | |
1132 | + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { | |
1133 | + smp_mb(); /* ensure test happens before caller kfree */ | |
1134 | + break; | |
1135 | + } | |
1136 | + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); | |
1137 | + | |
1096 | 1138 | put_online_cpus(); |
1097 | 1139 | } |
1098 | 1140 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); |
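The bookkeeping in the new header comment can also be walked through in a minimal single-threaded model. This is an illustration only, not the kernel code: try_stop_cpus(), CPU hotplug, the memory barriers, and the retry/back-off path are all elided, and the helper names (take_ticket, publish_done) are made up for the sketch.

```c
#include <assert.h>
#include <limits.h>
#include <stdatomic.h>
#include <stdio.h>

#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))

static atomic_uint started;	/* tickets handed out on entry */
static atomic_uint done;	/* latest published grace-period token */

/* Take a ticket, as atomic_inc_return(&sync_sched_expedited_started) does. */
static unsigned int take_ticket(void)
{
	return atomic_fetch_add(&started, 1) + 1;
}

/* Publish a completed grace period, unless someone else has already
 * published a later one -- the role of the atomic_cmpxchg() loop. */
static void publish_done(unsigned int snap)
{
	unsigned int s = atomic_load(&done);

	while (!UINT_CMP_GE(s, snap) &&
	       !atomic_compare_exchange_weak(&done, &s, snap))
		;	/* s was reloaded by the failed cmpxchg; recheck */
}

int main(void)
{
	unsigned int a = take_ticket();	/* caller A: ticket 1, its first stop attempt fails */
	unsigned int b = take_ticket();	/* caller B: ticket 2, its stop attempt succeeds */

	publish_done(b);	/* B records a grace period covering everyone up to ticket 2 */

	/* A's retry path: the published token already covers A's first
	 * snapshot, so A returns without forcing its own round of CPU stops. */
	assert(UINT_CMP_GE(atomic_load(&done), a));
	printf("caller A piggybacks on caller B's expedited grace period\n");
	return 0;
}
```

In this scenario B entered after A did, so B's grace period necessarily began after A's call; that is why the UINT_CMP_GE check against A's first snapshot is sufficient for A to return, which is exactly the batching the commit message describes for two concurrent callers.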