Commit b668c9cf3e58739dac54a1d6f42f2b4bdd980b3e
Committed by: Ingo Molnar
1 parent: 2f51f9884f
rcu: Fix grace-period-stall bug on large systems with CPU hotplug
When the last CPU of a given leaf rcu_node structure goes offline, all of the tasks queued on that leaf rcu_node structure (due to having blocked in their current RCU read-side critical sections) are requeued onto the root rcu_node structure. This requeuing is carried out by rcu_preempt_offline_tasks(). However, it is possible that these queued tasks are the only thing preventing the leaf rcu_node structure from reporting a quiescent state up the rcu_node hierarchy. Unfortunately, the old code would fail to do this reporting, resulting in a grace-period stall given the following sequence of events:

1. Kernel built for more than 32 CPUs on 32-bit systems or for more than 64 CPUs on 64-bit systems, so that there is more than one rcu_node structure. (Or CONFIG_RCU_FANOUT is artificially set to a number smaller than CONFIG_NR_CPUS.)

2. The kernel is built with CONFIG_TREE_PREEMPT_RCU.

3. A task running on a CPU associated with a given leaf rcu_node structure blocks while in an RCU read-side critical section -and- that CPU has not yet passed through a quiescent state for the current RCU grace period. This will cause the task to be queued on the leaf rcu_node's blocked_tasks[] array, in particular, on the element of this array corresponding to the current grace period.

4. Each of the remaining CPUs corresponding to this same leaf rcu_node structure passes through a quiescent state. However, the task is still in its RCU read-side critical section, so these quiescent states cannot be reported further up the rcu_node hierarchy. Nevertheless, all bits in the leaf rcu_node structure's ->qsmask field are now zero.

5. Each of the remaining CPUs goes offline. (The events in steps #4 and #5 can happen in any order as long as each CPU passes through a quiescent state before going offline.)

6. When the last CPU goes offline, __rcu_offline_cpu() will invoke rcu_preempt_offline_tasks(), which will move the task to the root rcu_node structure, but without reporting a quiescent state up the rcu_node hierarchy (and this failure to report a quiescent state is the bug). But because this leaf rcu_node structure's ->qsmask field is already zero and its ->blocked_tasks[] entries are all empty, force_quiescent_state() will skip this rcu_node structure. Therefore, grace periods are now hung.

This patch abstracts some code out of rcu_read_unlock_special(), calling the result task_quiet() by analogy with cpu_quiet(), and invokes task_quiet() from both rcu_read_unlock_special() and __rcu_offline_cpu(). Invoking task_quiet() from __rcu_offline_cpu() reports the quiescent state up the rcu_node hierarchy, fixing the bug. This ends up requiring a separate lock_class_key per level of the rcu_node hierarchy, which this patch also provides.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12589088301770-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
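To make the failure mode above concrete, here is a minimal userspace model of the scenario. It is an illustration only: the two-level tree, the field names (qsmask, grpmask, blocked), and the propagation rule are simplified stand-ins for the kernel's rcu_node machinery, not kernel code.

#include <stdio.h>

struct node {
	struct node *parent;
	unsigned int qsmask;	/* children still owing a quiescent state */
	unsigned int grpmask;	/* this node's bit in parent->qsmask */
	int blocked;		/* tasks blocked in RCU read-side sections */
};

/* Propagate a quiescent state toward the root while nothing is awaited. */
static void node_quiet(struct node *rnp)
{
	while (rnp->parent && rnp->qsmask == 0 && rnp->blocked == 0) {
		rnp->parent->qsmask &= ~rnp->grpmask;
		rnp = rnp->parent;
	}
}

/* Last CPU of @leaf goes offline: requeue its blocked tasks on @root. */
static void offline_tasks(struct node *leaf, struct node *root, int fixed)
{
	root->blocked += leaf->blocked;
	leaf->blocked = 0;
	if (fixed)
		node_quiet(leaf);	/* models the new task_quiet() call */
	/* Buggy variant reports nothing: root keeps waiting on the leaf. */
}

int main(void)
{
	for (int fixed = 0; fixed <= 1; fixed++) {
		struct node root = { .parent = NULL, .qsmask = 0x1 };
		struct node leaf = {
			.parent = &root, .grpmask = 0x1,
			.qsmask = 0,	/* step 4: every CPU quiesced */
			.blocked = 1,	/* step 3: one reader queued */
		};

		offline_tasks(&leaf, &root, fixed);	/* steps 5-6 */
		root.blocked--;		/* the reader finally unblocks... */
		node_quiet(&root);	/* ...as in rcu_read_unlock_special() */
		printf("%s: grace period %s\n", fixed ? "fixed" : "buggy",
		       root.qsmask == 0 && root.blocked == 0 ?
		       "completes" : "stalls");
	}
	return 0;
}

Running this prints a stall for the old behavior and a completed grace period once the offline path reports the quiescent state, which is exactly what the task_quiet() call added below provides.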
Showing 3 changed files with 85 additions and 43 deletions.
kernel/rcutree.c
@@ -51,7 +51,7 @@
 
 /* Data structures. */
 
-static struct lock_class_key rcu_root_class;
+static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
 
 #define RCU_STATE_INITIALIZER(name) { \
 	.level = { &name.node[0] }, \
@@ -936,6 +936,7 @@
 {
 	unsigned long flags;
 	unsigned long mask;
+	int need_quiet = 0;
 	struct rcu_data *rdp = rsp->rda[cpu];
 	struct rcu_node *rnp;
 
@@ -949,29 +950,30 @@
 		spin_lock(&rnp->lock);		/* irqs already disabled. */
 		rnp->qsmaskinit &= ~mask;
 		if (rnp->qsmaskinit != 0) {
-			spin_unlock(&rnp->lock); /* irqs remain disabled. */
+			if (rnp != rdp->mynode)
+				spin_unlock(&rnp->lock); /* irqs remain disabled. */
 			break;
 		}
-
-		/*
-		 * If there was a task blocking the current grace period,
-		 * and if all CPUs have checked in, we need to propagate
-		 * the quiescent state up the rcu_node hierarchy.  But that
-		 * is inconvenient at the moment due to deadlock issues if
-		 * this should end the current grace period.  So set the
-		 * offlined CPU's bit in ->qsmask in order to force the
-		 * next force_quiescent_state() invocation to clean up this
-		 * mess in a deadlock-free manner.
-		 */
-		if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
-			rnp->qsmask |= mask;
-
+		if (rnp == rdp->mynode)
+			need_quiet = rcu_preempt_offline_tasks(rsp, rnp, rdp);
+		else
+			spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		mask = rnp->grpmask;
-		spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		rnp = rnp->parent;
 	} while (rnp != NULL);
 
-	spin_unlock_irqrestore(&rsp->onofflock, flags);
+	/*
+	 * We still hold the leaf rcu_node structure lock here, and
+	 * irqs are still disabled.  The reason for this subterfuge is
+	 * because invoking task_quiet() with ->onofflock held leads
+	 * to deadlock.
+	 */
+	spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
+	rnp = rdp->mynode;
+	if (need_quiet)
+		task_quiet(rnp, flags);
+	else
+		spin_unlock_irqrestore(&rnp->lock, flags);
 
 	rcu_adopt_orphan_cbs(rsp);
 }
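The "subterfuge" comment is the heart of this hunk: task_quiet() may end the grace period and thus need ->onofflock itself, so __rcu_offline_cpu() drops ->onofflock first while handing the still-held leaf lock off to task_quiet(). A minimal userspace analogy of that ordering, using hypothetical names and ordinary pthread mutexes as stand-ins for the kernel locks:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;  /* plays ->onofflock */
static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER; /* plays rnp->lock */

/* Called with node_lock held; releases it.  May retake big_lock, which is
 * why the caller must not still be holding big_lock: with a non-recursive
 * mutex that would self-deadlock. */
static void report_quiescent(void)
{
	pthread_mutex_lock(&big_lock);	/* e.g. to start the next grace period */
	puts("quiescent state reported");
	pthread_mutex_unlock(&big_lock);
	pthread_mutex_unlock(&node_lock);	/* lock handed off by the caller */
}

int main(void)
{
	pthread_mutex_lock(&big_lock);
	pthread_mutex_lock(&node_lock);
	/* ... adjust bookkeeping under both locks ... */
	pthread_mutex_unlock(&big_lock);	/* drop first, as the patch does */
	report_quiescent();			/* returns with node_lock released */
	return 0;
}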
@@ -1731,6 +1733,7 @@
 		rnp = rsp->level[i];
 		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
 			spin_lock_init(&rnp->lock);
+			lockdep_set_class(&rnp->lock, &rcu_node_class[i]);
 			rnp->gpnum = 0;
 			rnp->qsmask = 0;
 			rnp->qsmaskinit = 0;
@@ -1753,7 +1756,6 @@
 			INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
 		}
 	}
-	lockdep_set_class(&rcu_get_root(rsp)->lock, &rcu_root_class);
 }
 
 /*
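A note on the rcu_node_class[] array used above: spin_lock_init() assigns every lock initialized at a given call site the same lockdep class, so all rcu_node locks in this loop would otherwise share one class, and legitimately holding a leaf's lock while acquiring an ancestor's (as rcu_preempt_offline_tasks() does with the root) would look like recursive locking to lockdep. Keying each level separately makes the leaf-to-root nesting order checkable. A generic sketch of the pattern, with hypothetical my_* names rather than the patch's own code:

#include <linux/lockdep.h>
#include <linux/spinlock.h>

#define MY_LEVELS 3

/* One lockdep class per tree level: child-held -> parent-acquired nesting
 * crosses classes (accepted), while true recursion within a single level
 * stays detectable (same class). */
static struct lock_class_key my_level_class[MY_LEVELS];

struct my_node {
	spinlock_t lock;
	struct my_node *parent;
};

static void my_node_init(struct my_node *n, struct my_node *parent, int level)
{
	spin_lock_init(&n->lock);
	lockdep_set_class(&n->lock, &my_level_class[level]);
	n->parent = parent;
}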
kernel/rcutree.h
@@ -305,6 +305,9 @@
 long rcu_batches_completed(void);
 static void rcu_preempt_note_context_switch(int cpu);
 static int rcu_preempted_readers(struct rcu_node *rnp);
+#ifdef CONFIG_HOTPLUG_CPU
+static void task_quiet(struct rcu_node *rnp, unsigned long flags);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 static void rcu_print_task_stall(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
kernel/rcutree_plugin.h
@@ -160,11 +160,51 @@
 	return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
 }
 
+/*
+ * Record a quiescent state for all tasks that were previously queued
+ * on the specified rcu_node structure and that were blocking the current
+ * RCU grace period.  The caller must hold the specified rnp->lock with
+ * irqs disabled, and this lock is released upon return, but irqs remain
+ * disabled.
+ */
+static void task_quiet(struct rcu_node *rnp, unsigned long flags)
+	__releases(rnp->lock)
+{
+	unsigned long mask;
+	struct rcu_node *rnp_p;
+
+	if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;	/* Still need more quiescent states! */
+	}
+
+	rnp_p = rnp->parent;
+	if (rnp_p == NULL) {
+		/*
+		 * Either there is only one rcu_node in the tree,
+		 * or tasks were kicked up to root rcu_node due to
+		 * CPUs going offline.
+		 */
+		cpu_quiet_msk_finish(&rcu_preempt_state, flags);
+		return;
+	}
+
+	/* Report up the rest of the hierarchy. */
+	mask = rnp->grpmask;
+	spin_unlock(&rnp->lock);	/* irqs remain disabled. */
+	spin_lock(&rnp_p->lock);	/* irqs already disabled. */
+	cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
+}
+
+/*
+ * Handle special cases during rcu_read_unlock(), such as needing to
+ * notify RCU core processing or task having blocked during the RCU
+ * read-side critical section.
+ */
 static void rcu_read_unlock_special(struct task_struct *t)
 {
 	int empty;
 	unsigned long flags;
-	unsigned long mask;
 	struct rcu_node *rnp;
 	int special;
 
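The __releases(rnp->lock) annotation above documents task_quiet()'s unusual contract for the sparse static checker: the function is entered with the lock held and returns without it. Under sparse these annotations expand to context-tracking attributes and otherwise to nothing; roughly, paraphrasing the kernel's compiler annotations of this era:

#ifdef __CHECKER__
# define __acquires(x)	__attribute__((context(x, 0, 1)))	/* gains the lock */
# define __releases(x)	__attribute__((context(x, 1, 0)))	/* gives it up */
#else
# define __acquires(x)
# define __releases(x)
#endif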
@@ -213,30 +253,15 @@
 		/*
 		 * If this was the last task on the current list, and if
 		 * we aren't waiting on any CPUs, report the quiescent state.
-		 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
-		 * drop rnp->lock and restore irq.
+		 * Note that task_quiet() releases rnp->lock.
 		 */
-		if (!empty && rnp->qsmask == 0 &&
-		    !rcu_preempted_readers(rnp)) {
-			struct rcu_node *rnp_p;
-
-			if (rnp->parent == NULL) {
-				/* Only one rcu_node in the tree. */
-				cpu_quiet_msk_finish(&rcu_preempt_state, flags);
-				return;
-			}
-			/* Report up the rest of the hierarchy. */
-			mask = rnp->grpmask;
+		if (empty)
 			spin_unlock_irqrestore(&rnp->lock, flags);
-			rnp_p = rnp->parent;
-			spin_lock_irqsave(&rnp_p->lock, flags);
-			WARN_ON_ONCE(rnp->qsmask);
-			cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
-			return;
-		}
-		spin_unlock(&rnp->lock);
+		else
+			task_quiet(rnp, flags);
+	} else {
+		local_irq_restore(flags);
 	}
-	local_irq_restore(flags);
 }
 
 /*
@@ -303,6 +328,8 @@
  * rcu_node.  The reason for not just moving them to the immediate
  * parent is to remove the need for rcu_read_unlock_special() to
  * make more than two attempts to acquire the target rcu_node's lock.
+ * Returns true if there were tasks blocking the current RCU grace
+ * period.
  *
  * Returns 1 if there was previously a task blocking the current grace
  * period on the specified rcu_node structure.
@@ -316,7 +343,7 @@
 	int i;
 	struct list_head *lp;
 	struct list_head *lp_root;
-	int retval = rcu_preempted_readers(rnp);
+	int retval;
 	struct rcu_node *rnp_root = rcu_get_root(rsp);
 	struct task_struct *tp;
@@ -334,6 +361,7 @@
 	 * rcu_nodes in terms of gp_num value.  This fact allows us to
 	 * move the blocked_tasks[] array directly, element by element.
 	 */
+	retval = rcu_preempted_readers(rnp);
 	for (i = 0; i < 2; i++) {
 		lp = &rnp->blocked_tasks[i];
 		lp_root = &rnp_root->blocked_tasks[i];
@@ -346,7 +374,6 @@
 			spin_unlock(&rnp_root->lock); /* irqs remain disabled */
 		}
 	}
-
 	return retval;
 }
 
@@ -511,6 +538,16 @@
 {
 	return 0;
 }
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* Because preemptible RCU does not exist, no quieting of tasks. */
+static void task_quiet(struct rcu_node *rnp, unsigned long flags)
+{
+	spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 
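Even this non-preemptible stub must release rnp->lock, because callers hand the lock off to task_quiet() unconditionally and the contract has to hold in every configuration. A hypothetical caller sketch (not from the patch) showing why:

static void example_report_path(struct rcu_node *rnp)
{
	unsigned long flags;

	spin_lock_irqsave(&rnp->lock, flags);
	/* ... decide whether anything still blocks the grace period ... */
	task_quiet(rnp, flags);	/* always returns with rnp->lock released
				 * and irqs restored, preemptible or not */
}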