Commit b668c9cf3e58739dac54a1d6f42f2b4bdd980b3e

Authored by Paul E. McKenney
Committed by Ingo Molnar
1 parent 2f51f9884f

rcu: Fix grace-period-stall bug on large systems with CPU hotplug

When the last CPU of a given leaf rcu_node structure goes
offline, all of the tasks queued on that leaf rcu_node structure
(due to having blocked in their current RCU read-side critical
sections) are requeued onto the root rcu_node structure.  This
requeuing is carried out by rcu_preempt_offline_tasks().
However, it is possible that these queued tasks are the only
thing preventing the leaf rcu_node structure from reporting a
quiescent state up the rcu_node hierarchy.  Unfortunately, the
old code would fail to do this reporting, resulting in a
grace-period stall given the following sequence of events:

1.	Kernel built for more than 32 CPUs on 32-bit systems or for more
	than 64 CPUs on 64-bit systems, so that there is more than one
	rcu_node structure.  (Or CONFIG_RCU_FANOUT is artificially set
	to a number smaller than CONFIG_NR_CPUS.)

2.	The kernel is built with CONFIG_TREE_PREEMPT_RCU.

3.	A task running on a CPU associated with a given leaf rcu_node
	structure blocks while in an RCU read-side critical section
	-and- that CPU has not yet passed through a quiescent state
	for the current RCU grace period.  This will cause the task
	to be queued on the leaf rcu_node's blocked_tasks[] array, in
	particular, on the element of this array corresponding to the
	current grace period.

4.	Each of the remaining CPUs corresponding to this same leaf rcu_node
	structure passes through a quiescent state.  However, the task is
	still in its RCU read-side critical section, so these quiescent
	states cannot be reported further up the rcu_node hierarchy.
	Nevertheless, all bits in the leaf rcu_node structure's ->qsmask
	field are now zero.

5.	Each of the remaining CPUs goes offline.  (The events in steps
	#4 and #5 can happen in any order as long as each CPU passes
	through a quiescent state before going offline.)

6.	When the last CPU goes offline, __rcu_offline_cpu() will invoke
	rcu_preempt_offline_tasks(), which will move the task to the
	root rcu_node structure, but without reporting a quiescent state
	up the rcu_node hierarchy (and this failure to report a quiescent
	state is the bug).

	But because this leaf rcu_node structure's ->qsmask field is
	already zero and its ->blocked_tasks[] entries are all empty,
	force_quiescent_state() will skip this rcu_node structure.

	Therefore, grace periods are now hung.

This patch abstracts some code out of rcu_read_unlock_special(),
calling the result task_quiet() by analogy with cpu_quiet(), and
invokes task_quiet() from both rcu_read_unlock_special() and
__rcu_offline_cpu().  Invoking task_quiet() from
__rcu_offline_cpu() reports the quiescent state up the rcu_node
hierarchy, fixing the bug.  This ends up requiring a separate
lock_class_key per level of the rcu_node hierarchy, which this
patch also provides.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12589088301770-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 3 changed files with 85 additions and 43 deletions Side-by-side Diff

... ... @@ -51,7 +51,7 @@
51 51  
52 52 /* Data structures. */
53 53  
54   -static struct lock_class_key rcu_root_class;
  54 +static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
55 55  
56 56 #define RCU_STATE_INITIALIZER(name) { \
57 57 .level = { &name.node[0] }, \
... ... @@ -936,6 +936,7 @@
936 936 {
937 937 unsigned long flags;
938 938 unsigned long mask;
  939 + int need_quiet = 0;
939 940 struct rcu_data *rdp = rsp->rda[cpu];
940 941 struct rcu_node *rnp;
941 942  
942 943  
943 944  
944 945  
... ... @@ -949,29 +950,30 @@
949 950 spin_lock(&rnp->lock); /* irqs already disabled. */
950 951 rnp->qsmaskinit &= ~mask;
951 952 if (rnp->qsmaskinit != 0) {
952   - spin_unlock(&rnp->lock); /* irqs remain disabled. */
  953 + if (rnp != rdp->mynode)
  954 + spin_unlock(&rnp->lock); /* irqs remain disabled. */
953 955 break;
954 956 }
955   -
956   - /*
957   - * If there was a task blocking the current grace period,
958   - * and if all CPUs have checked in, we need to propagate
959   - * the quiescent state up the rcu_node hierarchy. But that
960   - * is inconvenient at the moment due to deadlock issues if
961   - * this should end the current grace period. So set the
962   - * offlined CPU's bit in ->qsmask in order to force the
963   - * next force_quiescent_state() invocation to clean up this
964   - * mess in a deadlock-free manner.
965   - */
966   - if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
967   - rnp->qsmask |= mask;
968   -
  957 + if (rnp == rdp->mynode)
  958 + need_quiet = rcu_preempt_offline_tasks(rsp, rnp, rdp);
  959 + else
  960 + spin_unlock(&rnp->lock); /* irqs remain disabled. */
969 961 mask = rnp->grpmask;
970   - spin_unlock(&rnp->lock); /* irqs remain disabled. */
971 962 rnp = rnp->parent;
972 963 } while (rnp != NULL);
973 964  
974   - spin_unlock_irqrestore(&rsp->onofflock, flags);
  965 + /*
  966 + * We still hold the leaf rcu_node structure lock here, and
  967 + * irqs are still disabled. The reason for this subterfuge is
  968 + * because invoking task_quiet() with ->onofflock held leads
  969 + * to deadlock.
  970 + */
  971 + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
  972 + rnp = rdp->mynode;
  973 + if (need_quiet)
  974 + task_quiet(rnp, flags);
  975 + else
  976 + spin_unlock_irqrestore(&rnp->lock, flags);
975 977  
976 978 rcu_adopt_orphan_cbs(rsp);
977 979 }
... ... @@ -1731,6 +1733,7 @@
1731 1733 rnp = rsp->level[i];
1732 1734 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1733 1735 spin_lock_init(&rnp->lock);
  1736 + lockdep_set_class(&rnp->lock, &rcu_node_class[i]);
1734 1737 rnp->gpnum = 0;
1735 1738 rnp->qsmask = 0;
1736 1739 rnp->qsmaskinit = 0;
... ... @@ -1753,7 +1756,6 @@
1753 1756 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1754 1757 }
1755 1758 }
1756   - lockdep_set_class(&rcu_get_root(rsp)->lock, &rcu_root_class);
1757 1759 }
1758 1760  
1759 1761 /*
... ... @@ -305,6 +305,9 @@
305 305 long rcu_batches_completed(void);
306 306 static void rcu_preempt_note_context_switch(int cpu);
307 307 static int rcu_preempted_readers(struct rcu_node *rnp);
  308 +#ifdef CONFIG_HOTPLUG_CPU
  309 +static void task_quiet(struct rcu_node *rnp, unsigned long flags);
  310 +#endif /* #ifdef CONFIG_HOTPLUG_CPU */
308 311 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
309 312 static void rcu_print_task_stall(struct rcu_node *rnp);
310 313 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
kernel/rcutree_plugin.h
... ... @@ -160,11 +160,51 @@
160 160 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
161 161 }
162 162  
  163 +/*
  164 + * Record a quiescent state for all tasks that were previously queued
  165 + * on the specified rcu_node structure and that were blocking the current
  166 + * RCU grace period. The caller must hold the specified rnp->lock with
  167 + * irqs disabled, and this lock is released upon return, but irqs remain
  168 + * disabled.
  169 + */
  170 +static void task_quiet(struct rcu_node *rnp, unsigned long flags)
  171 + __releases(rnp->lock)
  172 +{
  173 + unsigned long mask;
  174 + struct rcu_node *rnp_p;
  175 +
  176 + if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
  177 + spin_unlock_irqrestore(&rnp->lock, flags);
  178 + return; /* Still need more quiescent states! */
  179 + }
  180 +
  181 + rnp_p = rnp->parent;
  182 + if (rnp_p == NULL) {
  183 + /*
  184 + * Either there is only one rcu_node in the tree,
  185 + * or tasks were kicked up to root rcu_node due to
  186 + * CPUs going offline.
  187 + */
  188 + cpu_quiet_msk_finish(&rcu_preempt_state, flags);
  189 + return;
  190 + }
  191 +
  192 + /* Report up the rest of the hierarchy. */
  193 + mask = rnp->grpmask;
  194 + spin_unlock(&rnp->lock); /* irqs remain disabled. */
  195 + spin_lock(&rnp_p->lock); /* irqs already disabled. */
  196 + cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
  197 +}
  198 +
  199 +/*
  200 + * Handle special cases during rcu_read_unlock(), such as needing to
  201 + * notify RCU core processing or task having blocked during the RCU
  202 + * read-side critical section.
  203 + */
163 204 static void rcu_read_unlock_special(struct task_struct *t)
164 205 {
165 206 int empty;
166 207 unsigned long flags;
167   - unsigned long mask;
168 208 struct rcu_node *rnp;
169 209 int special;
170 210  
171 211  
172 212  
173 213  
... ... @@ -213,30 +253,15 @@
213 253 /*
214 254 * If this was the last task on the current list, and if
215 255 * we aren't waiting on any CPUs, report the quiescent state.
216   - * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
217   - * drop rnp->lock and restore irq.
  256 + * Note that task_quiet() releases rnp->lock.
218 257 */
219   - if (!empty && rnp->qsmask == 0 &&
220   - !rcu_preempted_readers(rnp)) {
221   - struct rcu_node *rnp_p;
222   -
223   - if (rnp->parent == NULL) {
224   - /* Only one rcu_node in the tree. */
225   - cpu_quiet_msk_finish(&rcu_preempt_state, flags);
226   - return;
227   - }
228   - /* Report up the rest of the hierarchy. */
229   - mask = rnp->grpmask;
  258 + if (empty)
230 259 spin_unlock_irqrestore(&rnp->lock, flags);
231   - rnp_p = rnp->parent;
232   - spin_lock_irqsave(&rnp_p->lock, flags);
233   - WARN_ON_ONCE(rnp->qsmask);
234   - cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
235   - return;
236   - }
237   - spin_unlock(&rnp->lock);
  260 + else
  261 + task_quiet(rnp, flags);
  262 + } else {
  263 + local_irq_restore(flags);
238 264 }
239   - local_irq_restore(flags);
240 265 }
241 266  
242 267 /*
... ... @@ -303,6 +328,8 @@
303 328 * rcu_node. The reason for not just moving them to the immediate
304 329 * parent is to remove the need for rcu_read_unlock_special() to
305 330 * make more than two attempts to acquire the target rcu_node's lock.
  331 + * Returns true if there were tasks blocking the current RCU grace
  332 + * period.
306 333 *
307 334 * Returns 1 if there was previously a task blocking the current grace
308 335 * period on the specified rcu_node structure.
... ... @@ -316,7 +343,7 @@
316 343 int i;
317 344 struct list_head *lp;
318 345 struct list_head *lp_root;
319   - int retval = rcu_preempted_readers(rnp);
  346 + int retval;
320 347 struct rcu_node *rnp_root = rcu_get_root(rsp);
321 348 struct task_struct *tp;
322 349  
... ... @@ -334,6 +361,7 @@
334 361 * rcu_nodes in terms of gp_num value. This fact allows us to
335 362 * move the blocked_tasks[] array directly, element by element.
336 363 */
  364 + retval = rcu_preempted_readers(rnp);
337 365 for (i = 0; i < 2; i++) {
338 366 lp = &rnp->blocked_tasks[i];
339 367 lp_root = &rnp_root->blocked_tasks[i];
... ... @@ -346,7 +374,6 @@
346 374 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
347 375 }
348 376 }
349   -
350 377 return retval;
351 378 }
352 379  
... ... @@ -511,6 +538,16 @@
511 538 {
512 539 return 0;
513 540 }
  541 +
  542 +#ifdef CONFIG_HOTPLUG_CPU
  543 +
  544 +/* Because preemptible RCU does not exist, no quieting of tasks. */
  545 +static void task_quiet(struct rcu_node *rnp, unsigned long flags)
  546 +{
  547 + spin_unlock_irqrestore(&rnp->lock, flags);
  548 +}
  549 +
  550 +#endif /* #ifdef CONFIG_HOTPLUG_CPU */
514 551  
515 552 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
516 553