Commit b8c7f1dc5ca4e0d10709182233cdab932cef593d

Authored by Linus Torvalds

Merge branch 'core-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'core-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  rcu: Fix whitespace inconsistencies
  rcu: Fix thinko, actually initialize full tree
  rcu: Apply results of code inspection of kernel/rcutree_plugin.h
  rcu: Add WARN_ON_ONCE() consistency checks covering state transitions
  rcu: Fix synchronize_rcu() for TREE_PREEMPT_RCU
  rcu: Simplify rcu_read_unlock_special() quiescent-state accounting
  rcu: Add debug checks to TREE_PREEMPT_RCU for premature grace periods
  rcu: Kconfig help needs to say that TREE_PREEMPT_RCU scales down
  rcutorture: Occasionally delay readers enough to make RCU force_quiescent_state
  rcu: Initialize multi-level RCU grace periods holding locks
  rcu: Need to update rnp->gpnum if preemptable RCU is to be reliable

Showing 11 changed files

include/linux/rculist_nulls.h
... ... @@ -102,7 +102,7 @@
102 102 */
103 103 #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
104 104 for (pos = rcu_dereference((head)->first); \
105   - (!is_a_nulls(pos)) && \
  105 + (!is_a_nulls(pos)) && \
106 106 ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
107 107 pos = rcu_dereference(pos->next))
108 108  
include/linux/rcupdate.h
1 1 /*
2   - * Read-Copy Update mechanism for mutual exclusion
  2 + * Read-Copy Update mechanism for mutual exclusion
3 3 *
4 4 * This program is free software; you can redistribute it and/or modify
5 5 * it under the terms of the GNU General Public License as published by
... ... @@ -18,7 +18,7 @@
18 18 * Copyright IBM Corporation, 2001
19 19 *
20 20 * Author: Dipankar Sarma <dipankar@in.ibm.com>
21   - *
  21 + *
22 22 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
23 23 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
24 24 * Papers:
... ... @@ -26,7 +26,7 @@
26 26 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
27 27 *
28 28 * For detailed explanation of Read-Copy Update mechanism see -
29   - * http://lse.sourceforge.net/locking/rcupdate.html
  29 + * http://lse.sourceforge.net/locking/rcupdate.html
30 30 *
31 31 */
32 32  
... ... @@ -52,8 +52,13 @@
52 52 };
53 53  
54 54 /* Exported common interfaces */
  55 +#ifdef CONFIG_TREE_PREEMPT_RCU
55 56 extern void synchronize_rcu(void);
  57 +#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
  58 +#define synchronize_rcu synchronize_sched
  59 +#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
56 60 extern void synchronize_rcu_bh(void);
  61 +extern void synchronize_sched(void);
57 62 extern void rcu_barrier(void);
58 63 extern void rcu_barrier_bh(void);
59 64 extern void rcu_barrier_sched(void);
... ... @@ -260,24 +265,6 @@
260 265 };
261 266  
262 267 extern void wakeme_after_rcu(struct rcu_head *head);
263   -
264   -/**
265   - * synchronize_sched - block until all CPUs have exited any non-preemptive
266   - * kernel code sequences.
267   - *
268   - * This means that all preempt_disable code sequences, including NMI and
269   - * hardware-interrupt handlers, in progress on entry will have completed
270   - * before this primitive returns. However, this does not guarantee that
271   - * softirq handlers will have completed, since in some kernels, these
272   - * handlers can run in process context, and can block.
273   - *
274   - * This primitive provides the guarantees made by the (now removed)
275   - * synchronize_kernel() API. In contrast, synchronize_rcu() only
276   - * guarantees that rcu_read_lock() sections will have completed.
277   - * In "classic RCU", these two guarantees happen to be one and
278   - * the same, but can differ in realtime RCU implementations.
279   - */
280   -#define synchronize_sched() __synchronize_sched()
281 268  
282 269 /**
283 270 * call_rcu - Queue an RCU callback for invocation after a grace period.
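The rcupdate.h hunk above is the heart of the series: when CONFIG_TREE_PREEMPT_RCU is not set, synchronize_rcu() is simply #defined to synchronize_sched(), whose implementation moves into kernel/rcupdate.c further down. A minimal user-space sketch of how that preprocessor mapping resolves at a call site; the stub bodies are illustrative stand-ins, not the kernel implementations:

```c
#include <stdio.h>

/* Uncomment to mimic a kernel built with CONFIG_TREE_PREEMPT_RCU=y. */
/* #define CONFIG_TREE_PREEMPT_RCU 1 */

/* Stand-in for the rcu-sched grace-period wait (illustrative stub). */
static void synchronize_sched(void)
{
	printf("wait for all preempt_disable()-style readers to finish\n");
}

#ifdef CONFIG_TREE_PREEMPT_RCU
/* Preemptible RCU needs a separate implementation (stub here). */
static void synchronize_rcu(void)
{
	printf("wait for all rcu_read_lock() readers to finish\n");
}
#else
/* Without preemptible RCU the two grace periods coincide, so the
 * header aliases one name to the other, exactly as in the hunk above. */
#define synchronize_rcu synchronize_sched
#endif

int main(void)
{
	synchronize_rcu();	/* compiles to synchronize_sched() when the option is off */
	return 0;
}
```

Callers are unaffected either way; only the grace-period machinery behind the name changes.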
include/linux/rcutree.h
... ... @@ -24,7 +24,7 @@
24 24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 25 *
26 26 * For detailed explanation of Read-Copy Update mechanism see -
27   - * Documentation/RCU
  27 + * Documentation/RCU
28 28 */
29 29  
30 30 #ifndef __LINUX_RCUTREE_H
... ... @@ -53,6 +53,8 @@
53 53 preempt_enable();
54 54 }
55 55  
  56 +#define __synchronize_sched() synchronize_rcu()
  57 +
56 58 static inline void exit_rcu(void)
57 59 {
58 60 }
... ... @@ -67,8 +69,6 @@
67 69 {
68 70 local_bh_enable();
69 71 }
70   -
71   -#define __synchronize_sched() synchronize_rcu()
72 72  
73 73 extern void call_rcu_sched(struct rcu_head *head,
74 74 void (*func)(struct rcu_head *rcu));
include/linux/sched.h
... ... @@ -1755,7 +1755,6 @@
1755 1755  
1756 1756 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
1757 1757 #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
1758   -#define RCU_READ_UNLOCK_GOT_QS (1 << 2) /* CPU has responded to RCU core. */
1759 1758  
1760 1759 static inline void rcu_copy_process(struct task_struct *p)
1761 1760 {
init/Kconfig
... ... @@ -331,7 +331,8 @@
331 331 This option selects the RCU implementation that is
332 332 designed for very large SMP systems with hundreds or
333 333 thousands of CPUs, but for which real-time response
334   - is also required.
  334 + is also required. It also scales down nicely to
  335 + smaller systems.
335 336  
336 337 endchoice
337 338  
kernel/rcupdate.c
... ... @@ -19,7 +19,7 @@
19 19 *
20 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 21 * Manfred Spraul <manfred@colorfullife.com>
22   - *
  22 + *
23 23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 25 * Papers:
... ... @@ -27,7 +27,7 @@
27 27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 28 *
29 29 * For detailed explanation of Read-Copy Update mechanism see -
30   - * http://lse.sourceforge.net/locking/rcupdate.html
  30 + * http://lse.sourceforge.net/locking/rcupdate.html
31 31 *
32 32 */
33 33 #include <linux/types.h>
... ... @@ -74,6 +74,8 @@
74 74 complete(&rcu->completion);
75 75 }
76 76  
  77 +#ifdef CONFIG_TREE_PREEMPT_RCU
  78 +
77 79 /**
78 80 * synchronize_rcu - wait until a grace period has elapsed.
79 81 *
... ... @@ -87,7 +89,7 @@
87 89 {
88 90 struct rcu_synchronize rcu;
89 91  
90   - if (rcu_blocking_is_gp())
  92 + if (!rcu_scheduler_active)
91 93 return;
92 94  
93 95 init_completion(&rcu.completion);
... ... @@ -97,6 +99,46 @@
97 99 wait_for_completion(&rcu.completion);
98 100 }
99 101 EXPORT_SYMBOL_GPL(synchronize_rcu);
  102 +
  103 +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
  104 +
  105 +/**
  106 + * synchronize_sched - wait until an rcu-sched grace period has elapsed.
  107 + *
  108 + * Control will return to the caller some time after a full rcu-sched
  109 + * grace period has elapsed, in other words after all currently executing
  110 + * rcu-sched read-side critical sections have completed. These read-side
  111 + * critical sections are delimited by rcu_read_lock_sched() and
  112 + * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
  113 + * local_irq_disable(), and so on may be used in place of
  114 + * rcu_read_lock_sched().
  115 + *
  116 + * This means that all preempt_disable code sequences, including NMI and
  117 + * hardware-interrupt handlers, in progress on entry will have completed
  118 + * before this primitive returns. However, this does not guarantee that
  119 + * softirq handlers will have completed, since in some kernels, these
  120 + * handlers can run in process context, and can block.
  121 + *
  122 + * This primitive provides the guarantees made by the (now removed)
  123 + * synchronize_kernel() API. In contrast, synchronize_rcu() only
  124 + * guarantees that rcu_read_lock() sections will have completed.
  125 + * In "classic RCU", these two guarantees happen to be one and
  126 + * the same, but can differ in realtime RCU implementations.
  127 + */
  128 +void synchronize_sched(void)
  129 +{
  130 + struct rcu_synchronize rcu;
  131 +
  132 + if (rcu_blocking_is_gp())
  133 + return;
  134 +
  135 + init_completion(&rcu.completion);
  136 + /* Will wake me after RCU finished. */
  137 + call_rcu_sched(&rcu.head, wakeme_after_rcu);
  138 + /* Wait for it. */
  139 + wait_for_completion(&rcu.completion);
  140 +}
  141 +EXPORT_SYMBOL_GPL(synchronize_sched);
100 142  
101 143 /**
102 144 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
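The synchronize_sched() added above follows the usual wait-for-callback idiom: queue a callback that completes a completion, then sleep on it. From the caller's side, the reason to wait is the classic unpublish-then-free update pattern, sketched below in stand-alone form; my_table, my_entry and the stubbed grace-period wait are hypothetical names, not kernel APIs:

```c
#include <stdio.h>
#include <stdlib.h>

struct my_entry {
	int key;
};

static struct my_entry *my_table;	/* readers dereference this pointer */

/* Stand-in for the kernel's synchronize_sched() (illustrative stub).
 * In the kernel it returns only after every CPU has left all
 * preempt_disable()/local_irq_disable() regions that were in
 * progress when it was called. */
static void synchronize_sched(void)
{
	printf("rcu-sched grace period elapsed\n");
}

static void my_remove_and_free(void)
{
	struct my_entry *old = my_table;

	my_table = NULL;	/* unpublish: new readers can no longer find it */
	synchronize_sched();	/* wait out readers that might still hold it   */
	free(old);		/* now no reader can be referencing the entry  */
}

int main(void)
{
	my_table = malloc(sizeof(*my_table));
	if (!my_table)
		return 1;
	my_table->key = 42;
	my_remove_and_free();
	return 0;
}
```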
kernel/rcutorture.c
... ... @@ -18,7 +18,7 @@
18 18 * Copyright (C) IBM Corporation, 2005, 2006
19 19 *
20 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21   - * Josh Triplett <josh@freedesktop.org>
  21 + * Josh Triplett <josh@freedesktop.org>
22 22 *
23 23 * See also: Documentation/RCU/torture.txt
24 24 */
... ... @@ -50,7 +50,7 @@
50 50  
51 51 MODULE_LICENSE("GPL");
52 52 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
53   - "Josh Triplett <josh@freedesktop.org>");
  53 + "Josh Triplett <josh@freedesktop.org>");
54 54  
55 55 static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56 56 static int nfakewriters = 4; /* # fake writer threads */
... ... @@ -110,8 +110,8 @@
110 110 };
111 111  
112 112 static LIST_HEAD(rcu_torture_freelist);
113   -static struct rcu_torture *rcu_torture_current = NULL;
114   -static long rcu_torture_current_version = 0;
  113 +static struct rcu_torture *rcu_torture_current;
  114 +static long rcu_torture_current_version;
115 115 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
116 116 static DEFINE_SPINLOCK(rcu_torture_lock);
117 117 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
118 118  
... ... @@ -124,11 +124,11 @@
124 124 static atomic_t n_rcu_torture_free;
125 125 static atomic_t n_rcu_torture_mberror;
126 126 static atomic_t n_rcu_torture_error;
127   -static long n_rcu_torture_timers = 0;
  127 +static long n_rcu_torture_timers;
128 128 static struct list_head rcu_torture_removed;
129 129 static cpumask_var_t shuffle_tmp_mask;
130 130  
131   -static int stutter_pause_test = 0;
  131 +static int stutter_pause_test;
132 132  
133 133 #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
134 134 #define RCUTORTURE_RUNNABLE_INIT 1
135 135  
... ... @@ -267,8 +267,9 @@
267 267 int irq_capable;
268 268 char *name;
269 269 };
270   -static struct rcu_torture_ops *cur_ops = NULL;
271 270  
  271 +static struct rcu_torture_ops *cur_ops;
  272 +
272 273 /*
273 274 * Definitions for rcu torture testing.
274 275 */
... ... @@ -281,14 +282,17 @@
281 282  
282 283 static void rcu_read_delay(struct rcu_random_state *rrsp)
283 284 {
284   - long delay;
285   - const long longdelay = 200;
  285 + const unsigned long shortdelay_us = 200;
  286 + const unsigned long longdelay_ms = 50;
286 287  
287   - /* We want there to be long-running readers, but not all the time. */
  288 + /* We want a short delay sometimes to make a reader delay the grace
  289 + * period, and we want a long delay occasionally to trigger
  290 + * force_quiescent_state. */
288 291  
289   - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay);
290   - if (!delay)
291   - udelay(longdelay);
  292 + if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
  293 + mdelay(longdelay_ms);
  294 + if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
  295 + udelay(shortdelay_us);
292 296 }
293 297  
294 298 static void rcu_torture_read_unlock(int idx) __releases(RCU)
... ... @@ -339,8 +343,8 @@
339 343 .sync = synchronize_rcu,
340 344 .cb_barrier = rcu_barrier,
341 345 .stats = NULL,
342   - .irq_capable = 1,
343   - .name = "rcu"
  346 + .irq_capable = 1,
  347 + .name = "rcu"
344 348 };
345 349  
346 350 static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
... ... @@ -638,7 +642,8 @@
638 642  
639 643 do {
640 644 schedule_timeout_uninterruptible(1);
641   - if ((rp = rcu_torture_alloc()) == NULL)
  645 + rp = rcu_torture_alloc();
  646 + if (rp == NULL)
642 647 continue;
643 648 rp->rtort_pipe_count = 0;
644 649 udelay(rcu_random(&rand) & 0x3ff);
... ... @@ -1110,7 +1115,7 @@
1110 1115 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
1111 1116 torture_type);
1112 1117 mutex_unlock(&fullstop_mutex);
1113   - return (-EINVAL);
  1118 + return -EINVAL;
1114 1119 }
1115 1120 if (cur_ops->init)
1116 1121 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
... ... @@ -1161,7 +1166,7 @@
1161 1166 goto unwind;
1162 1167 }
1163 1168 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
1164   - GFP_KERNEL);
  1169 + GFP_KERNEL);
1165 1170 if (fakewriter_tasks == NULL) {
1166 1171 VERBOSE_PRINTK_ERRSTRING("out of memory");
1167 1172 firsterr = -ENOMEM;
... ... @@ -1170,7 +1175,7 @@
1170 1175 for (i = 0; i < nfakewriters; i++) {
1171 1176 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
1172 1177 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
1173   - "rcu_torture_fakewriter");
  1178 + "rcu_torture_fakewriter");
1174 1179 if (IS_ERR(fakewriter_tasks[i])) {
1175 1180 firsterr = PTR_ERR(fakewriter_tasks[i]);
1176 1181 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
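The rcu_read_delay() change above replaces a single occasional long delay with two independent tests: a rare 50 ms mdelay() intended to outlast the force_quiescent_state() interval, and a more common 200 us udelay() that merely stretches readers. Assuming rcu_random() is roughly uniform, the firing odds work out as in this small sketch (the reader count is a made-up example value):

```c
#include <stdio.h>

/* Back-of-envelope odds for the two delays in the new rcu_read_delay(),
 * assuming rcu_random() is roughly uniform (an approximation). */
int main(void)
{
	const unsigned long shortdelay_us = 200;	/* from the hunk above */
	const unsigned long longdelay_ms = 50;		/* from the hunk above */
	const int nrealreaders = 4;			/* hypothetical reader count */

	double p_long  = 1.0 / (nrealreaders * 2000.0 * longdelay_ms);
	double p_short = 1.0 / (nrealreaders * 2.0 * shortdelay_us);

	printf("50ms mdelay:  about 1 read in %.0f\n", 1.0 / p_long);
	printf("200us udelay: about 1 read in %.0f\n", 1.0 / p_short);
	return 0;
}
```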
kernel/rcutree.c
... ... @@ -25,7 +25,7 @@
25 25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 26 *
27 27 * For detailed explanation of Read-Copy Update mechanism see -
28   - * Documentation/RCU
  28 + * Documentation/RCU
29 29 */
30 30 #include <linux/types.h>
31 31 #include <linux/kernel.h>
... ... @@ -107,27 +107,23 @@
107 107 */
108 108 void rcu_sched_qs(int cpu)
109 109 {
110   - unsigned long flags;
111 110 struct rcu_data *rdp;
112 111  
113   - local_irq_save(flags);
114 112 rdp = &per_cpu(rcu_sched_data, cpu);
115   - rdp->passed_quiesc = 1;
116 113 rdp->passed_quiesc_completed = rdp->completed;
117   - rcu_preempt_qs(cpu);
118   - local_irq_restore(flags);
  114 + barrier();
  115 + rdp->passed_quiesc = 1;
  116 + rcu_preempt_note_context_switch(cpu);
119 117 }
120 118  
121 119 void rcu_bh_qs(int cpu)
122 120 {
123   - unsigned long flags;
124 121 struct rcu_data *rdp;
125 122  
126   - local_irq_save(flags);
127 123 rdp = &per_cpu(rcu_bh_data, cpu);
128   - rdp->passed_quiesc = 1;
129 124 rdp->passed_quiesc_completed = rdp->completed;
130   - local_irq_restore(flags);
  125 + barrier();
  126 + rdp->passed_quiesc = 1;
131 127 }
132 128  
133 129 #ifdef CONFIG_NO_HZ
... ... @@ -605,8 +601,6 @@
605 601 {
606 602 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
607 603 struct rcu_node *rnp = rcu_get_root(rsp);
608   - struct rcu_node *rnp_cur;
609   - struct rcu_node *rnp_end;
610 604  
611 605 if (!cpu_needs_another_gp(rsp, rdp)) {
612 606 spin_unlock_irqrestore(&rnp->lock, flags);
... ... @@ -615,6 +609,7 @@
615 609  
616 610 /* Advance to a new grace period and initialize state. */
617 611 rsp->gpnum++;
  612 + WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
618 613 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
619 614 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
620 615 record_gp_stall_check_time(rsp);
621 616  
... ... @@ -631,7 +626,9 @@
631 626  
632 627 /* Special-case the common single-level case. */
633 628 if (NUM_RCU_NODES == 1) {
  629 + rcu_preempt_check_blocked_tasks(rnp);
634 630 rnp->qsmask = rnp->qsmaskinit;
  631 + rnp->gpnum = rsp->gpnum;
635 632 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
636 633 spin_unlock_irqrestore(&rnp->lock, flags);
637 634 return;
... ... @@ -644,42 +641,28 @@
644 641 spin_lock(&rsp->onofflock); /* irqs already disabled. */
645 642  
646 643 /*
647   - * Set the quiescent-state-needed bits in all the non-leaf RCU
648   - * nodes for all currently online CPUs. This operation relies
649   - * on the layout of the hierarchy within the rsp->node[] array.
650   - * Note that other CPUs will access only the leaves of the
651   - * hierarchy, which still indicate that no grace period is in
652   - * progress. In addition, we have excluded CPU-hotplug operations.
  644 + * Set the quiescent-state-needed bits in all the rcu_node
  645 + * structures for all currently online CPUs in breadth-first
  646 + * order, starting from the root rcu_node structure. This
  647 + * operation relies on the layout of the hierarchy within the
  648 + * rsp->node[] array. Note that other CPUs will access only
  649 + * the leaves of the hierarchy, which still indicate that no
  650 + * grace period is in progress, at least until the corresponding
  651 + * leaf node has been initialized. In addition, we have excluded
  652 + * CPU-hotplug operations.
653 653 *
654   - * We therefore do not need to hold any locks. Any required
655   - * memory barriers will be supplied by the locks guarding the
656   - * leaf rcu_nodes in the hierarchy.
657   - */
658   -
659   - rnp_end = rsp->level[NUM_RCU_LVLS - 1];
660   - for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
661   - rnp_cur->qsmask = rnp_cur->qsmaskinit;
662   -
663   - /*
664   - * Now set up the leaf nodes. Here we must be careful. First,
665   - * we need to hold the lock in order to exclude other CPUs, which
666   - * might be contending for the leaf nodes' locks. Second, as
667   - * soon as we initialize a given leaf node, its CPUs might run
668   - * up the rest of the hierarchy. We must therefore acquire locks
669   - * for each node that we touch during this stage. (But we still
670   - * are excluding CPU-hotplug operations.)
671   - *
672 654 * Note that the grace period cannot complete until we finish
673 655 * the initialization process, as there will be at least one
674 656 * qsmask bit set in the root node until that time, namely the
675   - * one corresponding to this CPU.
  657 + * one corresponding to this CPU, due to the fact that we have
  658 + * irqs disabled.
676 659 */
677   - rnp_end = &rsp->node[NUM_RCU_NODES];
678   - rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
679   - for (; rnp_cur < rnp_end; rnp_cur++) {
680   - spin_lock(&rnp_cur->lock); /* irqs already disabled. */
681   - rnp_cur->qsmask = rnp_cur->qsmaskinit;
682   - spin_unlock(&rnp_cur->lock); /* irqs already disabled. */
  660 + for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) {
  661 + spin_lock(&rnp->lock); /* irqs already disabled. */
  662 + rcu_preempt_check_blocked_tasks(rnp);
  663 + rnp->qsmask = rnp->qsmaskinit;
  664 + rnp->gpnum = rsp->gpnum;
  665 + spin_unlock(&rnp->lock); /* irqs already disabled. */
683 666 }
684 667  
685 668 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
... ... @@ -722,6 +705,7 @@
722 705 static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
723 706 __releases(rnp->lock)
724 707 {
  708 + WARN_ON_ONCE(rsp->completed == rsp->gpnum);
725 709 rsp->completed = rsp->gpnum;
726 710 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
727 711 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
... ... @@ -739,6 +723,8 @@
739 723 unsigned long flags)
740 724 __releases(rnp->lock)
741 725 {
  726 + struct rcu_node *rnp_c;
  727 +
742 728 /* Walk up the rcu_node hierarchy. */
743 729 for (;;) {
744 730 if (!(rnp->qsmask & mask)) {
745 731  
... ... @@ -762,8 +748,10 @@
762 748 break;
763 749 }
764 750 spin_unlock_irqrestore(&rnp->lock, flags);
  751 + rnp_c = rnp;
765 752 rnp = rnp->parent;
766 753 spin_lock_irqsave(&rnp->lock, flags);
  754 + WARN_ON_ONCE(rnp_c->qsmask);
767 755 }
768 756  
769 757 /*
... ... @@ -776,10 +764,10 @@
776 764  
777 765 /*
778 766 * Record a quiescent state for the specified CPU, which must either be
779   - * the current CPU or an offline CPU. The lastcomp argument is used to
780   - * make sure we are still in the grace period of interest. We don't want
781   - * to end the current grace period based on quiescent states detected in
782   - * an earlier grace period!
  767 + * the current CPU. The lastcomp argument is used to make sure we are
  768 + * still in the grace period of interest. We don't want to end the current
  769 + * grace period based on quiescent states detected in an earlier grace
  770 + * period!
783 771 */
784 772 static void
785 773 cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
... ... @@ -814,7 +802,6 @@
814 802 * This GP can't end until cpu checks in, so all of our
815 803 * callbacks can be processed during the next GP.
816 804 */
817   - rdp = rsp->rda[smp_processor_id()];
818 805 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
819 806  
820 807 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
... ... @@ -872,7 +859,7 @@
872 859 spin_lock_irqsave(&rsp->onofflock, flags);
873 860  
874 861 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
875   - rnp = rdp->mynode;
  862 + rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
876 863 mask = rdp->grpmask; /* rnp->grplo is constant. */
877 864 do {
878 865 spin_lock(&rnp->lock); /* irqs already disabled. */
... ... @@ -881,7 +868,7 @@
881 868 spin_unlock(&rnp->lock); /* irqs remain disabled. */
882 869 break;
883 870 }
884   - rcu_preempt_offline_tasks(rsp, rnp);
  871 + rcu_preempt_offline_tasks(rsp, rnp, rdp);
885 872 mask = rnp->grpmask;
886 873 spin_unlock(&rnp->lock); /* irqs remain disabled. */
887 874 rnp = rnp->parent;
... ... @@ -890,9 +877,6 @@
890 877  
891 878 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
892 879  
893   - /* Being offline is a quiescent state, so go record it. */
894   - cpu_quiet(cpu, rsp, rdp, lastcomp);
895   -
896 880 /*
897 881 * Move callbacks from the outgoing CPU to the running CPU.
898 882 * Note that the outgoing CPU is now quiscent, so it is now
... ... @@ -1457,20 +1441,7 @@
1457 1441 rnp = rnp->parent;
1458 1442 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1459 1443  
1460   - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1461   -
1462   - /*
1463   - * A new grace period might start here. If so, we will be part of
1464   - * it, and its gpnum will be greater than ours, so we will
1465   - * participate. It is also possible for the gpnum to have been
1466   - * incremented before this function was called, and the bitmasks
1467   - * to not be filled out until now, in which case we will also
1468   - * participate due to our gpnum being behind.
1469   - */
1470   -
1471   - /* Since it is coming online, the CPU is in a quiescent state. */
1472   - cpu_quiet(cpu, rsp, rdp, lastcomp);
1473   - local_irq_restore(flags);
  1444 + spin_unlock_irqrestore(&rsp->onofflock, flags);
1474 1445 }
1475 1446  
1476 1447 static void __cpuinit rcu_online_cpu(int cpu)
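A detail worth calling out in the rcu_sched_qs()/rcu_bh_qs() rewrite above: passed_quiesc_completed is now stored before passed_quiesc, with a barrier() in between, so the RCU core can never observe passed_quiesc == 1 paired with a stale grace-period number due to compiler reordering. A user-space sketch of that compiler-barrier idiom; the struct mirrors the fields in the diff, and the barrier macro is the usual GCC formulation:

```c
#include <stdio.h>

/* Compiler barrier, as the kernel defines it for GCC. */
#define barrier() __asm__ __volatile__("" : : : "memory")

struct rcu_data_mock {
	long completed;			/* last completed grace period   */
	long passed_quiesc_completed;	/* GP the quiescent state is for */
	int  passed_quiesc;		/* flag read by the RCU core     */
};

static void record_quiescent_state(struct rcu_data_mock *rdp)
{
	/* Record which grace period this quiescent state belongs to... */
	rdp->passed_quiesc_completed = rdp->completed;
	barrier();	/* ...and only then let the flag become visible, so
			 * the core never pairs the flag with a stale GP.  */
	rdp->passed_quiesc = 1;
}

int main(void)
{
	struct rcu_data_mock rdp = { .completed = 7 };

	record_quiescent_state(&rdp);
	printf("passed_quiesc=%d for completed=%ld\n",
	       rdp.passed_quiesc, rdp.passed_quiesc_completed);
	return 0;
}
```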
kernel/rcutree.h
... ... @@ -142,7 +142,7 @@
142 142 */
143 143 struct rcu_head *nxtlist;
144 144 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145   - long qlen; /* # of queued callbacks */
  145 + long qlen; /* # of queued callbacks */
146 146 long blimit; /* Upper limit on a processed batch */
147 147  
148 148 #ifdef CONFIG_NO_HZ
kernel/rcutree_plugin.h
... ... @@ -64,22 +64,31 @@
64 64 * not in a quiescent state. There might be any number of tasks blocked
65 65 * while in an RCU read-side critical section.
66 66 */
67   -static void rcu_preempt_qs_record(int cpu)
  67 +static void rcu_preempt_qs(int cpu)
68 68 {
69 69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70   - rdp->passed_quiesc = 1;
71 70 rdp->passed_quiesc_completed = rdp->completed;
  71 + barrier();
  72 + rdp->passed_quiesc = 1;
72 73 }
73 74  
74 75 /*
75   - * We have entered the scheduler or are between softirqs in ksoftirqd.
76   - * If we are in an RCU read-side critical section, we need to reflect
77   - * that in the state of the rcu_node structure corresponding to this CPU.
78   - * Caller must disable hardirqs.
  76 + * We have entered the scheduler, and the current task might soon be
  77 + * context-switched away from. If this task is in an RCU read-side
  78 + * critical section, we will no longer be able to rely on the CPU to
  79 + * record that fact, so we enqueue the task on the appropriate entry
  80 + * of the blocked_tasks[] array. The task will dequeue itself when
  81 + * it exits the outermost enclosing RCU read-side critical section.
  82 + * Therefore, the current grace period cannot be permitted to complete
  83 + * until the blocked_tasks[] entry indexed by the low-order bit of
  84 + * rnp->gpnum empties.
  85 + *
  86 + * Caller must disable preemption.
79 87 */
80   -static void rcu_preempt_qs(int cpu)
  88 +static void rcu_preempt_note_context_switch(int cpu)
81 89 {
82 90 struct task_struct *t = current;
  91 + unsigned long flags;
83 92 int phase;
84 93 struct rcu_data *rdp;
85 94 struct rcu_node *rnp;
... ... @@ -90,7 +99,7 @@
90 99 /* Possibly blocking in an RCU read-side critical section. */
91 100 rdp = rcu_preempt_state.rda[cpu];
92 101 rnp = rdp->mynode;
93   - spin_lock(&rnp->lock);
  102 + spin_lock_irqsave(&rnp->lock, flags);
94 103 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
95 104 t->rcu_blocked_node = rnp;
96 105  
... ... @@ -103,11 +112,15 @@
103 112 * state for the current grace period), then as long
104 113 * as that task remains queued, the current grace period
105 114 * cannot end.
  115 + *
  116 + * But first, note that the current CPU must still be
  117 + * on line!
106 118 */
107   - phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1);
  119 + WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
  120 + WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
  121 + phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
108 122 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
109   - smp_mb(); /* Ensure later ctxt swtch seen after above. */
110   - spin_unlock(&rnp->lock);
  123 + spin_unlock_irqrestore(&rnp->lock, flags);
111 124 }
112 125  
113 126 /*
... ... @@ -119,9 +132,10 @@
119 132 * grace period, then the fact that the task has been enqueued
120 133 * means that we continue to block the current grace period.
121 134 */
122   - rcu_preempt_qs_record(cpu);
123   - t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS |
124   - RCU_READ_UNLOCK_GOT_QS);
  135 + rcu_preempt_qs(cpu);
  136 + local_irq_save(flags);
  137 + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
  138 + local_irq_restore(flags);
125 139 }
126 140  
127 141 /*
... ... @@ -157,7 +171,7 @@
157 171 special = t->rcu_read_unlock_special;
158 172 if (special & RCU_READ_UNLOCK_NEED_QS) {
159 173 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
160   - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS;
  174 + rcu_preempt_qs(smp_processor_id());
161 175 }
162 176  
163 177 /* Hardware IRQ handlers cannot block. */
164 178  
... ... @@ -177,10 +191,10 @@
177 191 */
178 192 for (;;) {
179 193 rnp = t->rcu_blocked_node;
180   - spin_lock(&rnp->lock);
  194 + spin_lock(&rnp->lock); /* irqs already disabled. */
181 195 if (rnp == t->rcu_blocked_node)
182 196 break;
183   - spin_unlock(&rnp->lock);
  197 + spin_unlock(&rnp->lock); /* irqs remain disabled. */
184 198 }
185 199 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
186 200 list_del_init(&t->rcu_node_entry);
... ... @@ -194,9 +208,8 @@
194 208 */
195 209 if (!empty && rnp->qsmask == 0 &&
196 210 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
197   - t->rcu_read_unlock_special &=
198   - ~(RCU_READ_UNLOCK_NEED_QS |
199   - RCU_READ_UNLOCK_GOT_QS);
  211 + struct rcu_node *rnp_p;
  212 +
200 213 if (rnp->parent == NULL) {
201 214 /* Only one rcu_node in the tree. */
202 215 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
... ... @@ -205,9 +218,10 @@
205 218 /* Report up the rest of the hierarchy. */
206 219 mask = rnp->grpmask;
207 220 spin_unlock_irqrestore(&rnp->lock, flags);
208   - rnp = rnp->parent;
209   - spin_lock_irqsave(&rnp->lock, flags);
210   - cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags);
  221 + rnp_p = rnp->parent;
  222 + spin_lock_irqsave(&rnp_p->lock, flags);
  223 + WARN_ON_ONCE(rnp->qsmask);
  224 + cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
211 225 return;
212 226 }
213 227 spin_unlock(&rnp->lock);
... ... @@ -259,6 +273,19 @@
259 273 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
260 274  
261 275 /*
  276 + * Check that the list of blocked tasks for the newly completed grace
  277 + * period is in fact empty. It is a serious bug to complete a grace
  278 + * period that still has RCU readers blocked! This function must be
  279 + * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
  280 + * must be held by the caller.
  281 + */
  282 +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
  283 +{
  284 + WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]));
  285 + WARN_ON_ONCE(rnp->qsmask);
  286 +}
  287 +
  288 +/*
262 289 * Check for preempted RCU readers for the specified rcu_node structure.
263 290 * If the caller needs a reliable answer, it must hold the rcu_node's
264 291 * ->lock.
... ... @@ -280,7 +307,8 @@
280 307 * The caller must hold rnp->lock with irqs disabled.
281 308 */
282 309 static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
283   - struct rcu_node *rnp)
  310 + struct rcu_node *rnp,
  311 + struct rcu_data *rdp)
284 312 {
285 313 int i;
286 314 struct list_head *lp;
... ... @@ -292,6 +320,9 @@
292 320 WARN_ONCE(1, "Last CPU thought to be offlined?");
293 321 return; /* Shouldn't happen: at least one CPU online. */
294 322 }
  323 + WARN_ON_ONCE(rnp != rdp->mynode &&
  324 + (!list_empty(&rnp->blocked_tasks[0]) ||
  325 + !list_empty(&rnp->blocked_tasks[1])));
295 326  
296 327 /*
297 328 * Move tasks up to root rcu_node. Rely on the fact that the
298 329  
... ... @@ -335,20 +366,12 @@
335 366 struct task_struct *t = current;
336 367  
337 368 if (t->rcu_read_lock_nesting == 0) {
338   - t->rcu_read_unlock_special &=
339   - ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS);
340   - rcu_preempt_qs_record(cpu);
  369 + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
  370 + rcu_preempt_qs(cpu);
341 371 return;
342 372 }
343   - if (per_cpu(rcu_preempt_data, cpu).qs_pending) {
344   - if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) {
345   - rcu_preempt_qs_record(cpu);
346   - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
347   - } else if (!(t->rcu_read_unlock_special &
348   - RCU_READ_UNLOCK_NEED_QS)) {
349   - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
350   - }
351   - }
  373 + if (per_cpu(rcu_preempt_data, cpu).qs_pending)
  374 + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
352 375 }
353 376  
354 377 /*
... ... @@ -434,7 +457,7 @@
434 457 * Because preemptable RCU does not exist, we never have to check for
435 458 * CPUs being in quiescent states.
436 459 */
437   -static void rcu_preempt_qs(int cpu)
  460 +static void rcu_preempt_note_context_switch(int cpu)
438 461 {
439 462 }
440 463  
... ... @@ -451,6 +474,16 @@
451 474 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
452 475  
453 476 /*
  477 + * Because there is no preemptable RCU, there can be no readers blocked,
  478 + * so there is no need to check for blocked tasks. So check only for
  479 + * bogus qsmask values.
  480 + */
  481 +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
  482 +{
  483 + WARN_ON_ONCE(rnp->qsmask);
  484 +}
  485 +
  486 +/*
454 487 * Because preemptable RCU does not exist, there are never any preempted
455 488 * RCU readers.
456 489 */
... ... @@ -466,7 +499,8 @@
466 499 * tasks that were blocked within RCU read-side critical sections.
467 500 */
468 501 static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
469   - struct rcu_node *rnp)
  502 + struct rcu_node *rnp,
  503 + struct rcu_data *rdp)
470 504 {
471 505 }
472 506  
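The core bookkeeping in the new rcu_preempt_note_context_switch() above is the choice of blocked_tasks[] list: if this CPU's bit is still set in rnp->qsmask, the CPU owes the current grace period a quiescent state, so the preempted reader must block that grace period; otherwise it can only block the next one. A stand-alone sketch of that index computation; the field names follow the diff, and the sample masks are invented for illustration:

```c
#include <stdio.h>

/* Which blocked_tasks[] list does a preempted reader join?
 * Mirrors: phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; */
static int blocked_list_index(long gpnum, unsigned long qsmask,
			      unsigned long grpmask)
{
	/* CPU still owes a quiescent state -> block the current GP (gpnum & 1);
	 * otherwise the task can only block the next GP ((gpnum + 1) & 1). */
	return (int)((gpnum + !(qsmask & grpmask)) & 0x1);
}

int main(void)
{
	/* Hypothetical node state: grace period #5, CPUs 0 and 2 not yet done. */
	long gpnum = 5;
	unsigned long qsmask = 0x5;

	printf("CPU0 (grpmask 0x1): list %d\n",
	       blocked_list_index(gpnum, qsmask, 0x1));	/* still owes: list 1 */
	printf("CPU1 (grpmask 0x2): list %d\n",
	       blocked_list_index(gpnum, qsmask, 0x2));	/* already done: list 0 */
	return 0;
}
```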
kernel/rcutree_trace.c
... ... @@ -20,7 +20,7 @@
20 20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 21 *
22 22 * For detailed explanation of Read-Copy Update mechanism see -
23   - * Documentation/RCU
  23 + * Documentation/RCU
24 24 *
25 25 */
26 26 #include <linux/types.h>