Commit b8c7f1dc5ca4e0d10709182233cdab932cef593d
Exists in master and in 7 other branches
Merge branch 'core-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'core-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  rcu: Fix whitespace inconsistencies
  rcu: Fix thinko, actually initialize full tree
  rcu: Apply results of code inspection of kernel/rcutree_plugin.h
  rcu: Add WARN_ON_ONCE() consistency checks covering state transitions
  rcu: Fix synchronize_rcu() for TREE_PREEMPT_RCU
  rcu: Simplify rcu_read_unlock_special() quiescent-state accounting
  rcu: Add debug checks to TREE_PREEMPT_RCU for premature grace periods
  rcu: Kconfig help needs to say that TREE_PREEMPT_RCU scales down
  rcutorture: Occasionally delay readers enough to make RCU force_quiescent_state
  rcu: Initialize multi-level RCU grace periods holding locks
  rcu: Need to update rnp->gpnum if preemptable RCU is to be reliable
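The headline change in this merge is that synchronize_rcu() is now provided only under CONFIG_TREE_PREEMPT_RCU, and otherwise maps to the newly exported synchronize_sched(). For orientation, a minimal updater-side sketch of the pattern these primitives back; "struct cfg", "global_cfg", and the update-side locking assumption are illustrative, not part of this commit:

/*
 * Illustrative only: classic RCU pointer replacement.  Assumes the
 * caller holds an update-side mutex; names are hypothetical.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
	int threshold;
};

static struct cfg *global_cfg;		/* readers use rcu_dereference() */

static void cfg_replace(struct cfg *newc)
{
	struct cfg *oldc = global_cfg;	/* update-side mutex held */

	rcu_assign_pointer(global_cfg, newc);
	synchronize_rcu();		/* wait out all pre-existing readers */
	kfree(oldc);			/* no reader can still hold oldc */
}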
Showing 11 changed files
include/linux/rculist_nulls.h
... | ... | @@ -102,7 +102,7 @@ |
102 | 102 | */ |
103 | 103 | #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ |
104 | 104 | for (pos = rcu_dereference((head)->first); \ |
105 | - (!is_a_nulls(pos)) && \ | |
105 | + (!is_a_nulls(pos)) && \ | |
106 | 106 | ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ |
107 | 107 | pos = rcu_dereference(pos->next)) |
108 | 108 |
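The whitespace-only hunk above touches hlist_nulls_for_each_entry_rcu(). As a reminder of how the iterator is used, a hedged reader-side sketch follows; "struct item" and item_lookup() are hypothetical, and a real SLAB_DESTROY_BY_RCU user would also re-validate the entry (and the nulls end marker) after taking a reference:

/*
 * Hypothetical lookup; caller must be inside rcu_read_lock()/unlock().
 */
#include <linux/rculist_nulls.h>

struct item {
	int key;
	struct hlist_nulls_node node;
};

static struct item *item_lookup(struct hlist_nulls_head *head, int key)
{
	struct item *it;
	struct hlist_nulls_node *pos;

	hlist_nulls_for_each_entry_rcu(it, pos, head, node)
		if (it->key == key)
			return it;	/* valid only within the read-side section */
	return NULL;
}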
include/linux/rcupdate.h
1 | 1 | /* |
2 | - * Read-Copy Update mechanism for mutual exclusion | |
2 | + * Read-Copy Update mechanism for mutual exclusion | |
3 | 3 | * |
4 | 4 | * This program is free software; you can redistribute it and/or modify |
5 | 5 | * it under the terms of the GNU General Public License as published by |
... | ... | @@ -18,7 +18,7 @@ |
18 | 18 | * Copyright IBM Corporation, 2001 |
19 | 19 | * |
20 | 20 | * Author: Dipankar Sarma <dipankar@in.ibm.com> |
21 | - * | |
21 | + * | |
22 | 22 | * Based on the original work by Paul McKenney <paulmck@us.ibm.com> |
23 | 23 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. |
24 | 24 | * Papers: |
... | ... | @@ -26,7 +26,7 @@ |
26 | 26 | * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) |
27 | 27 | * |
28 | 28 | * For detailed explanation of Read-Copy Update mechanism see - |
29 | - * http://lse.sourceforge.net/locking/rcupdate.html | |
29 | + * http://lse.sourceforge.net/locking/rcupdate.html | |
30 | 30 | * |
31 | 31 | */ |
32 | 32 | |
33 | 33 | |
34 | 34 | |
... | ... | @@ -52,8 +52,13 @@ |
52 | 52 | }; |
53 | 53 | |
54 | 54 | /* Exported common interfaces */ |
55 | +#ifdef CONFIG_TREE_PREEMPT_RCU | |
55 | 56 | extern void synchronize_rcu(void); |
57 | +#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | |
58 | +#define synchronize_rcu synchronize_sched | |
59 | +#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | |
56 | 60 | extern void synchronize_rcu_bh(void); |
61 | +extern void synchronize_sched(void); | |
57 | 62 | extern void rcu_barrier(void); |
58 | 63 | extern void rcu_barrier_bh(void); |
59 | 64 | extern void rcu_barrier_sched(void); |
... | ... | @@ -260,24 +265,6 @@ |
260 | 265 | }; |
261 | 266 | |
262 | 267 | extern void wakeme_after_rcu(struct rcu_head *head); |
263 | - | |
264 | -/** | |
265 | - * synchronize_sched - block until all CPUs have exited any non-preemptive | |
266 | - * kernel code sequences. | |
267 | - * | |
268 | - * This means that all preempt_disable code sequences, including NMI and | |
269 | - * hardware-interrupt handlers, in progress on entry will have completed | |
270 | - * before this primitive returns. However, this does not guarantee that | |
271 | - * softirq handlers will have completed, since in some kernels, these | |
272 | - * handlers can run in process context, and can block. | |
273 | - * | |
274 | - * This primitive provides the guarantees made by the (now removed) | |
275 | - * synchronize_kernel() API. In contrast, synchronize_rcu() only | |
276 | - * guarantees that rcu_read_lock() sections will have completed. | |
277 | - * In "classic RCU", these two guarantees happen to be one and | |
278 | - * the same, but can differ in realtime RCU implementations. | |
279 | - */ | |
280 | -#define synchronize_sched() __synchronize_sched() | |
281 | 268 | |
282 | 269 | /** |
283 | 270 | * call_rcu - Queue an RCU callback for invocation after a grace period. |
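The hunk above removes the old synchronize_sched() macro documentation from rcupdate.h while leaving the call_rcu() documentation in place. For contrast with the blocking primitives, a hedged sketch of the asynchronous call_rcu() pattern (names hypothetical):

/*
 * Illustrative deferred free via call_rcu(); "struct item" is made up.
 */
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct item {
	struct rcu_head rcu;
	int key;
};

static void item_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct item, rcu));
}

static void item_retire(struct item *it)
{
	/* Returns immediately; the free runs after a grace period. */
	call_rcu(&it->rcu, item_free_rcu);
}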
include/linux/rcutree.h
... | ... | @@ -24,7 +24,7 @@ |
24 | 24 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. |
25 | 25 | * |
26 | 26 | * For detailed explanation of Read-Copy Update mechanism see - |
27 | - * Documentation/RCU | |
27 | + * Documentation/RCU | |
28 | 28 | */ |
29 | 29 | |
30 | 30 | #ifndef __LINUX_RCUTREE_H |
... | ... | @@ -53,6 +53,8 @@ |
53 | 53 | preempt_enable(); |
54 | 54 | } |
55 | 55 | |
56 | +#define __synchronize_sched() synchronize_rcu() | |
57 | + | |
56 | 58 | static inline void exit_rcu(void) |
57 | 59 | { |
58 | 60 | } |
... | ... | @@ -67,8 +69,6 @@ |
67 | 69 | { |
68 | 70 | local_bh_enable(); |
69 | 71 | } |
70 | - | |
71 | -#define __synchronize_sched() synchronize_rcu() | |
72 | 72 | |
73 | 73 | extern void call_rcu_sched(struct rcu_head *head, |
74 | 74 | void (*func)(struct rcu_head *rcu)); |
include/linux/sched.h
... | ... | @@ -1755,7 +1755,6 @@ |
1755 | 1755 | |
1756 | 1756 | #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ |
1757 | 1757 | #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ |
1758 | -#define RCU_READ_UNLOCK_GOT_QS (1 << 2) /* CPU has responded to RCU core. */ | |
1759 | 1758 | |
1760 | 1759 | static inline void rcu_copy_process(struct task_struct *p) |
1761 | 1760 | { |
init/Kconfig
... | ... | @@ -331,7 +331,8 @@ |
331 | 331 | This option selects the RCU implementation that is |
332 | 332 | designed for very large SMP systems with hundreds or |
333 | 333 | thousands of CPUs, but for which real-time response |
334 | - is also required. | |
334 | + is also required. It also scales down nicely to | |
335 | + smaller systems. | |
335 | 336 | |
336 | 337 | endchoice |
337 | 338 |
kernel/rcupdate.c
... | ... | @@ -19,7 +19,7 @@ |
19 | 19 | * |
20 | 20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> |
21 | 21 | * Manfred Spraul <manfred@colorfullife.com> |
22 | - * | |
22 | + * | |
23 | 23 | * Based on the original work by Paul McKenney <paulmck@us.ibm.com> |
24 | 24 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. |
25 | 25 | * Papers: |
... | ... | @@ -27,7 +27,7 @@ |
27 | 27 | * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) |
28 | 28 | * |
29 | 29 | * For detailed explanation of Read-Copy Update mechanism see - |
30 | - * http://lse.sourceforge.net/locking/rcupdate.html | |
30 | + * http://lse.sourceforge.net/locking/rcupdate.html | |
31 | 31 | * |
32 | 32 | */ |
33 | 33 | #include <linux/types.h> |
... | ... | @@ -74,6 +74,8 @@ |
74 | 74 | complete(&rcu->completion); |
75 | 75 | } |
76 | 76 | |
77 | +#ifdef CONFIG_TREE_PREEMPT_RCU | |
78 | + | |
77 | 79 | /** |
78 | 80 | * synchronize_rcu - wait until a grace period has elapsed. |
79 | 81 | * |
... | ... | @@ -87,7 +89,7 @@ |
87 | 89 | { |
88 | 90 | struct rcu_synchronize rcu; |
89 | 91 | |
90 | - if (rcu_blocking_is_gp()) | |
92 | + if (!rcu_scheduler_active) | |
91 | 93 | return; |
92 | 94 | |
93 | 95 | init_completion(&rcu.completion); |
... | ... | @@ -97,6 +99,46 @@ |
97 | 99 | wait_for_completion(&rcu.completion); |
98 | 100 | } |
99 | 101 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
102 | + | |
103 | +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | |
104 | + | |
105 | +/** | |
106 | + * synchronize_sched - wait until an rcu-sched grace period has elapsed. | |
107 | + * | |
108 | + * Control will return to the caller some time after a full rcu-sched | |
109 | + * grace period has elapsed, in other words after all currently executing | |
110 | + * rcu-sched read-side critical sections have completed. These read-side | |
111 | + * critical sections are delimited by rcu_read_lock_sched() and | |
112 | + * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), | |
113 | + * local_irq_disable(), and so on may be used in place of | |
114 | + * rcu_read_lock_sched(). | |
115 | + * | |
116 | + * This means that all preempt_disable code sequences, including NMI and | |
117 | + * hardware-interrupt handlers, in progress on entry will have completed | |
118 | + * before this primitive returns. However, this does not guarantee that | |
119 | + * softirq handlers will have completed, since in some kernels, these | |
120 | + * handlers can run in process context, and can block. | |
121 | + * | |
122 | + * This primitive provides the guarantees made by the (now removed) | |
123 | + * synchronize_kernel() API. In contrast, synchronize_rcu() only | |
124 | + * guarantees that rcu_read_lock() sections will have completed. | |
125 | + * In "classic RCU", these two guarantees happen to be one and | |
126 | + * the same, but can differ in realtime RCU implementations. | |
127 | + */ | |
128 | +void synchronize_sched(void) | |
129 | +{ | |
130 | + struct rcu_synchronize rcu; | |
131 | + | |
132 | + if (rcu_blocking_is_gp()) | |
133 | + return; | |
134 | + | |
135 | + init_completion(&rcu.completion); | |
136 | + /* Will wake me after RCU finished. */ | |
137 | + call_rcu_sched(&rcu.head, wakeme_after_rcu); | |
138 | + /* Wait for it. */ | |
139 | + wait_for_completion(&rcu.completion); | |
140 | +} | |
141 | +EXPORT_SYMBOL_GPL(synchronize_sched); | |
100 | 142 | |
101 | 143 | /** |
102 | 144 | * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. |
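The rcupdate.c hunks above give synchronize_sched() a real implementation built from call_rcu_sched() plus a completion, mirroring synchronize_rcu(). A hedged usage sketch, with hypothetical names, for readers that rely only on preempt_disable():

/*
 * Illustrative rcu-sched usage; assumes an update-side mutex is held
 * in stats_replace(); "struct stats" and "shared_stats" are made up.
 */
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct stats {
	int count;
};

static struct stats *shared_stats;

static int stats_read(void)
{
	int v;

	preempt_disable();		/* acts as rcu_read_lock_sched() */
	v = shared_stats->count;
	preempt_enable();
	return v;
}

static void stats_replace(struct stats *news)
{
	struct stats *olds = shared_stats;

	rcu_assign_pointer(shared_stats, news);
	synchronize_sched();		/* all preempt_disable() sections finished */
	kfree(olds);
}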
kernel/rcutorture.c
... | ... | @@ -18,7 +18,7 @@ |
18 | 18 | * Copyright (C) IBM Corporation, 2005, 2006 |
19 | 19 | * |
20 | 20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> |
21 | - * Josh Triplett <josh@freedesktop.org> | |
21 | + * Josh Triplett <josh@freedesktop.org> | |
22 | 22 | * |
23 | 23 | * See also: Documentation/RCU/torture.txt |
24 | 24 | */ |
... | ... | @@ -50,7 +50,7 @@ |
50 | 50 | |
51 | 51 | MODULE_LICENSE("GPL"); |
52 | 52 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " |
53 | - "Josh Triplett <josh@freedesktop.org>"); | |
53 | + "Josh Triplett <josh@freedesktop.org>"); | |
54 | 54 | |
55 | 55 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ |
56 | 56 | static int nfakewriters = 4; /* # fake writer threads */ |
... | ... | @@ -110,8 +110,8 @@ |
110 | 110 | }; |
111 | 111 | |
112 | 112 | static LIST_HEAD(rcu_torture_freelist); |
113 | -static struct rcu_torture *rcu_torture_current = NULL; | |
114 | -static long rcu_torture_current_version = 0; | |
113 | +static struct rcu_torture *rcu_torture_current; | |
114 | +static long rcu_torture_current_version; | |
115 | 115 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; |
116 | 116 | static DEFINE_SPINLOCK(rcu_torture_lock); |
117 | 117 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = |
118 | 118 | |
... | ... | @@ -124,11 +124,11 @@ |
124 | 124 | static atomic_t n_rcu_torture_free; |
125 | 125 | static atomic_t n_rcu_torture_mberror; |
126 | 126 | static atomic_t n_rcu_torture_error; |
127 | -static long n_rcu_torture_timers = 0; | |
127 | +static long n_rcu_torture_timers; | |
128 | 128 | static struct list_head rcu_torture_removed; |
129 | 129 | static cpumask_var_t shuffle_tmp_mask; |
130 | 130 | |
131 | -static int stutter_pause_test = 0; | |
131 | +static int stutter_pause_test; | |
132 | 132 | |
133 | 133 | #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) |
134 | 134 | #define RCUTORTURE_RUNNABLE_INIT 1 |
135 | 135 | |
... | ... | @@ -267,8 +267,9 @@ |
267 | 267 | int irq_capable; |
268 | 268 | char *name; |
269 | 269 | }; |
270 | -static struct rcu_torture_ops *cur_ops = NULL; | |
271 | 270 | |
271 | +static struct rcu_torture_ops *cur_ops; | |
272 | + | |
272 | 273 | /* |
273 | 274 | * Definitions for rcu torture testing. |
274 | 275 | */ |
275 | 276 | |
276 | 277 | |
... | ... | @@ -281,14 +282,17 @@ |
281 | 282 | |
282 | 283 | static void rcu_read_delay(struct rcu_random_state *rrsp) |
283 | 284 | { |
284 | - long delay; | |
285 | - const long longdelay = 200; | |
285 | + const unsigned long shortdelay_us = 200; | |
286 | + const unsigned long longdelay_ms = 50; | |
286 | 287 | |
287 | - /* We want there to be long-running readers, but not all the time. */ | |
288 | + /* We want a short delay sometimes to make a reader delay the grace | |
289 | + * period, and we want a long delay occasionally to trigger | |
290 | + * force_quiescent_state. */ | |
288 | 291 | |
289 | - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); | |
290 | - if (!delay) | |
291 | - udelay(longdelay); | |
292 | + if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) | |
293 | + mdelay(longdelay_ms); | |
294 | + if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) | |
295 | + udelay(shortdelay_us); | |
292 | 296 | } |
293 | 297 | |
294 | 298 | static void rcu_torture_read_unlock(int idx) __releases(RCU) |
... | ... | @@ -339,8 +343,8 @@ |
339 | 343 | .sync = synchronize_rcu, |
340 | 344 | .cb_barrier = rcu_barrier, |
341 | 345 | .stats = NULL, |
342 | - .irq_capable = 1, | |
343 | - .name = "rcu" | |
346 | + .irq_capable = 1, | |
347 | + .name = "rcu" | |
344 | 348 | }; |
345 | 349 | |
346 | 350 | static void rcu_sync_torture_deferred_free(struct rcu_torture *p) |
... | ... | @@ -638,7 +642,8 @@ |
638 | 642 | |
639 | 643 | do { |
640 | 644 | schedule_timeout_uninterruptible(1); |
641 | - if ((rp = rcu_torture_alloc()) == NULL) | |
645 | + rp = rcu_torture_alloc(); | |
646 | + if (rp == NULL) | |
642 | 647 | continue; |
643 | 648 | rp->rtort_pipe_count = 0; |
644 | 649 | udelay(rcu_random(&rand) & 0x3ff); |
... | ... | @@ -1110,7 +1115,7 @@ |
1110 | 1115 | printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", |
1111 | 1116 | torture_type); |
1112 | 1117 | mutex_unlock(&fullstop_mutex); |
1113 | - return (-EINVAL); | |
1118 | + return -EINVAL; | |
1114 | 1119 | } |
1115 | 1120 | if (cur_ops->init) |
1116 | 1121 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ |
... | ... | @@ -1161,7 +1166,7 @@ |
1161 | 1166 | goto unwind; |
1162 | 1167 | } |
1163 | 1168 | fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), |
1164 | - GFP_KERNEL); | |
1169 | + GFP_KERNEL); | |
1165 | 1170 | if (fakewriter_tasks == NULL) { |
1166 | 1171 | VERBOSE_PRINTK_ERRSTRING("out of memory"); |
1167 | 1172 | firsterr = -ENOMEM; |
... | ... | @@ -1170,7 +1175,7 @@ |
1170 | 1175 | for (i = 0; i < nfakewriters; i++) { |
1171 | 1176 | VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); |
1172 | 1177 | fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, |
1173 | - "rcu_torture_fakewriter"); | |
1178 | + "rcu_torture_fakewriter"); | |
1174 | 1179 | if (IS_ERR(fakewriter_tasks[i])) { |
1175 | 1180 | firsterr = PTR_ERR(fakewriter_tasks[i]); |
1176 | 1181 | VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); |
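The rewritten rcu_read_delay() above keeps the common ~200 µs reader delay but adds a much rarer 50 ms delay, long enough to make RCU invoke force_quiescent_state(). The same idiom restated as a hedged userspace sketch; rand() and usleep() stand in for rcu_random(), mdelay(), and udelay(), and the reader count is an assumption:

/* Userspace restatement of the probabilistic-delay idiom above. */
#include <stdlib.h>
#include <unistd.h>

#define NREADERS	4
#define SHORTDELAY_US	200
#define LONGDELAY_MS	50

static void read_delay(void)
{
	/* Roughly 1 in NREADERS * 2000 * LONGDELAY_MS calls: long delay. */
	if (!(rand() % (NREADERS * 2000 * LONGDELAY_MS)))
		usleep(LONGDELAY_MS * 1000);
	/* Roughly 1 in NREADERS * 2 * SHORTDELAY_US calls: short delay. */
	if (!(rand() % (NREADERS * 2 * SHORTDELAY_US)))
		usleep(SHORTDELAY_US);
}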
kernel/rcutree.c
... | ... | @@ -25,7 +25,7 @@ |
25 | 25 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. |
26 | 26 | * |
27 | 27 | * For detailed explanation of Read-Copy Update mechanism see - |
28 | - * Documentation/RCU | |
28 | + * Documentation/RCU | |
29 | 29 | */ |
30 | 30 | #include <linux/types.h> |
31 | 31 | #include <linux/kernel.h> |
32 | 32 | |
33 | 33 | |
34 | 34 | |
35 | 35 | |
36 | 36 | |
37 | 37 | |
38 | 38 | |
... | ... | @@ -107,27 +107,23 @@ |
107 | 107 | */ |
108 | 108 | void rcu_sched_qs(int cpu) |
109 | 109 | { |
110 | - unsigned long flags; | |
111 | 110 | struct rcu_data *rdp; |
112 | 111 | |
113 | - local_irq_save(flags); | |
114 | 112 | rdp = &per_cpu(rcu_sched_data, cpu); |
115 | - rdp->passed_quiesc = 1; | |
116 | 113 | rdp->passed_quiesc_completed = rdp->completed; |
117 | - rcu_preempt_qs(cpu); | |
118 | - local_irq_restore(flags); | |
114 | + barrier(); | |
115 | + rdp->passed_quiesc = 1; | |
116 | + rcu_preempt_note_context_switch(cpu); | |
119 | 117 | } |
120 | 118 | |
121 | 119 | void rcu_bh_qs(int cpu) |
122 | 120 | { |
123 | - unsigned long flags; | |
124 | 121 | struct rcu_data *rdp; |
125 | 122 | |
126 | - local_irq_save(flags); | |
127 | 123 | rdp = &per_cpu(rcu_bh_data, cpu); |
128 | - rdp->passed_quiesc = 1; | |
129 | 124 | rdp->passed_quiesc_completed = rdp->completed; |
130 | - local_irq_restore(flags); | |
125 | + barrier(); | |
126 | + rdp->passed_quiesc = 1; | |
131 | 127 | } |
132 | 128 | |
133 | 129 | #ifdef CONFIG_NO_HZ |
... | ... | @@ -605,8 +601,6 @@ |
605 | 601 | { |
606 | 602 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; |
607 | 603 | struct rcu_node *rnp = rcu_get_root(rsp); |
608 | - struct rcu_node *rnp_cur; | |
609 | - struct rcu_node *rnp_end; | |
610 | 604 | |
611 | 605 | if (!cpu_needs_another_gp(rsp, rdp)) { |
612 | 606 | spin_unlock_irqrestore(&rnp->lock, flags); |
... | ... | @@ -615,6 +609,7 @@ |
615 | 609 | |
616 | 610 | /* Advance to a new grace period and initialize state. */ |
617 | 611 | rsp->gpnum++; |
612 | + WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); | |
618 | 613 | rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ |
619 | 614 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; |
620 | 615 | record_gp_stall_check_time(rsp); |
621 | 616 | |
... | ... | @@ -631,7 +626,9 @@ |
631 | 626 | |
632 | 627 | /* Special-case the common single-level case. */ |
633 | 628 | if (NUM_RCU_NODES == 1) { |
629 | + rcu_preempt_check_blocked_tasks(rnp); | |
634 | 630 | rnp->qsmask = rnp->qsmaskinit; |
631 | + rnp->gpnum = rsp->gpnum; | |
635 | 632 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ |
636 | 633 | spin_unlock_irqrestore(&rnp->lock, flags); |
637 | 634 | return; |
638 | 635 | |
639 | 636 | |
640 | 637 | |
... | ... | @@ -644,42 +641,28 @@ |
644 | 641 | spin_lock(&rsp->onofflock); /* irqs already disabled. */ |
645 | 642 | |
646 | 643 | /* |
647 | - * Set the quiescent-state-needed bits in all the non-leaf RCU | |
648 | - * nodes for all currently online CPUs. This operation relies | |
649 | - * on the layout of the hierarchy within the rsp->node[] array. | |
650 | - * Note that other CPUs will access only the leaves of the | |
651 | - * hierarchy, which still indicate that no grace period is in | |
652 | - * progress. In addition, we have excluded CPU-hotplug operations. | |
644 | + * Set the quiescent-state-needed bits in all the rcu_node | |
645 | + * structures for all currently online CPUs in breadth-first | |
646 | + * order, starting from the root rcu_node structure. This | |
647 | + * operation relies on the layout of the hierarchy within the | |
648 | + * rsp->node[] array. Note that other CPUs will access only | |
649 | + * the leaves of the hierarchy, which still indicate that no | |
650 | + * grace period is in progress, at least until the corresponding | |
651 | + * leaf node has been initialized. In addition, we have excluded | |
652 | + * CPU-hotplug operations. | |
653 | 653 | * |
654 | - * We therefore do not need to hold any locks. Any required | |
655 | - * memory barriers will be supplied by the locks guarding the | |
656 | - * leaf rcu_nodes in the hierarchy. | |
657 | - */ | |
658 | - | |
659 | - rnp_end = rsp->level[NUM_RCU_LVLS - 1]; | |
660 | - for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++) | |
661 | - rnp_cur->qsmask = rnp_cur->qsmaskinit; | |
662 | - | |
663 | - /* | |
664 | - * Now set up the leaf nodes. Here we must be careful. First, | |
665 | - * we need to hold the lock in order to exclude other CPUs, which | |
666 | - * might be contending for the leaf nodes' locks. Second, as | |
667 | - * soon as we initialize a given leaf node, its CPUs might run | |
668 | - * up the rest of the hierarchy. We must therefore acquire locks | |
669 | - * for each node that we touch during this stage. (But we still | |
670 | - * are excluding CPU-hotplug operations.) | |
671 | - * | |
672 | 654 | * Note that the grace period cannot complete until we finish |
673 | 655 | * the initialization process, as there will be at least one |
674 | 656 | * qsmask bit set in the root node until that time, namely the |
675 | - * one corresponding to this CPU. | |
657 | + * one corresponding to this CPU, due to the fact that we have | |
658 | + * irqs disabled. | |
676 | 659 | */ |
677 | - rnp_end = &rsp->node[NUM_RCU_NODES]; | |
678 | - rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; | |
679 | - for (; rnp_cur < rnp_end; rnp_cur++) { | |
680 | - spin_lock(&rnp_cur->lock); /* irqs already disabled. */ | |
681 | - rnp_cur->qsmask = rnp_cur->qsmaskinit; | |
682 | - spin_unlock(&rnp_cur->lock); /* irqs already disabled. */ | |
660 | + for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) { | |
661 | + spin_lock(&rnp->lock); /* irqs already disabled. */ | |
662 | + rcu_preempt_check_blocked_tasks(rnp); | |
663 | + rnp->qsmask = rnp->qsmaskinit; | |
664 | + rnp->gpnum = rsp->gpnum; | |
665 | + spin_unlock(&rnp->lock); /* irqs already disabled. */ | |
683 | 666 | } |
684 | 667 | |
685 | 668 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ |
... | ... | @@ -722,6 +705,7 @@ |
722 | 705 | static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) |
723 | 706 | __releases(rnp->lock) |
724 | 707 | { |
708 | + WARN_ON_ONCE(rsp->completed == rsp->gpnum); | |
725 | 709 | rsp->completed = rsp->gpnum; |
726 | 710 | rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); |
727 | 711 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ |
... | ... | @@ -739,6 +723,8 @@ |
739 | 723 | unsigned long flags) |
740 | 724 | __releases(rnp->lock) |
741 | 725 | { |
726 | + struct rcu_node *rnp_c; | |
727 | + | |
742 | 728 | /* Walk up the rcu_node hierarchy. */ |
743 | 729 | for (;;) { |
744 | 730 | if (!(rnp->qsmask & mask)) { |
745 | 731 | |
... | ... | @@ -762,8 +748,10 @@ |
762 | 748 | break; |
763 | 749 | } |
764 | 750 | spin_unlock_irqrestore(&rnp->lock, flags); |
751 | + rnp_c = rnp; | |
765 | 752 | rnp = rnp->parent; |
766 | 753 | spin_lock_irqsave(&rnp->lock, flags); |
754 | + WARN_ON_ONCE(rnp_c->qsmask); | |
767 | 755 | } |
768 | 756 | |
769 | 757 | /* |
... | ... | @@ -776,10 +764,10 @@ |
776 | 764 | |
777 | 765 | /* |
778 | 766 | * Record a quiescent state for the specified CPU, which must either be |
779 | - * the current CPU or an offline CPU. The lastcomp argument is used to | |
780 | - * make sure we are still in the grace period of interest. We don't want | |
781 | - * to end the current grace period based on quiescent states detected in | |
782 | - * an earlier grace period! | |
767 | + * the current CPU. The lastcomp argument is used to make sure we are | |
768 | + * still in the grace period of interest. We don't want to end the current | |
769 | + * grace period based on quiescent states detected in an earlier grace | |
770 | + * period! | |
783 | 771 | */ |
784 | 772 | static void |
785 | 773 | cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) |
... | ... | @@ -814,7 +802,6 @@ |
814 | 802 | * This GP can't end until cpu checks in, so all of our |
815 | 803 | * callbacks can be processed during the next GP. |
816 | 804 | */ |
817 | - rdp = rsp->rda[smp_processor_id()]; | |
818 | 805 | rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; |
819 | 806 | |
820 | 807 | cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ |
... | ... | @@ -872,7 +859,7 @@ |
872 | 859 | spin_lock_irqsave(&rsp->onofflock, flags); |
873 | 860 | |
874 | 861 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ |
875 | - rnp = rdp->mynode; | |
862 | + rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ | |
876 | 863 | mask = rdp->grpmask; /* rnp->grplo is constant. */ |
877 | 864 | do { |
878 | 865 | spin_lock(&rnp->lock); /* irqs already disabled. */ |
... | ... | @@ -881,7 +868,7 @@ |
881 | 868 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
882 | 869 | break; |
883 | 870 | } |
884 | - rcu_preempt_offline_tasks(rsp, rnp); | |
871 | + rcu_preempt_offline_tasks(rsp, rnp, rdp); | |
885 | 872 | mask = rnp->grpmask; |
886 | 873 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
887 | 874 | rnp = rnp->parent; |
... | ... | @@ -890,9 +877,6 @@ |
890 | 877 | |
891 | 878 | spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ |
892 | 879 | |
893 | - /* Being offline is a quiescent state, so go record it. */ | |
894 | - cpu_quiet(cpu, rsp, rdp, lastcomp); | |
895 | - | |
896 | 880 | /* |
897 | 881 | * Move callbacks from the outgoing CPU to the running CPU. |
898 | 882 | * Note that the outgoing CPU is now quiscent, so it is now |
... | ... | @@ -1457,20 +1441,7 @@ |
1457 | 1441 | rnp = rnp->parent; |
1458 | 1442 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); |
1459 | 1443 | |
1460 | - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | |
1461 | - | |
1462 | - /* | |
1463 | - * A new grace period might start here. If so, we will be part of | |
1464 | - * it, and its gpnum will be greater than ours, so we will | |
1465 | - * participate. It is also possible for the gpnum to have been | |
1466 | - * incremented before this function was called, and the bitmasks | |
1467 | - * to not be filled out until now, in which case we will also | |
1468 | - * participate due to our gpnum being behind. | |
1469 | - */ | |
1470 | - | |
1471 | - /* Since it is coming online, the CPU is in a quiescent state. */ | |
1472 | - cpu_quiet(cpu, rsp, rdp, lastcomp); | |
1473 | - local_irq_restore(flags); | |
1444 | + spin_unlock_irqrestore(&rsp->onofflock, flags); | |
1474 | 1445 | } |
1475 | 1446 | |
1476 | 1447 | static void __cpuinit rcu_online_cpu(int cpu) |
kernel/rcutree.h
... | ... | @@ -142,7 +142,7 @@ |
142 | 142 | */ |
143 | 143 | struct rcu_head *nxtlist; |
144 | 144 | struct rcu_head **nxttail[RCU_NEXT_SIZE]; |
145 | - long qlen; /* # of queued callbacks */ | |
145 | + long qlen; /* # of queued callbacks */ | |
146 | 146 | long blimit; /* Upper limit on a processed batch */ |
147 | 147 | |
148 | 148 | #ifdef CONFIG_NO_HZ |
kernel/rcutree_plugin.h
... | ... | @@ -64,22 +64,31 @@ |
64 | 64 | * not in a quiescent state. There might be any number of tasks blocked |
65 | 65 | * while in an RCU read-side critical section. |
66 | 66 | */ |
67 | -static void rcu_preempt_qs_record(int cpu) | |
67 | +static void rcu_preempt_qs(int cpu) | |
68 | 68 | { |
69 | 69 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); |
70 | - rdp->passed_quiesc = 1; | |
71 | 70 | rdp->passed_quiesc_completed = rdp->completed; |
71 | + barrier(); | |
72 | + rdp->passed_quiesc = 1; | |
72 | 73 | } |
73 | 74 | |
74 | 75 | /* |
75 | - * We have entered the scheduler or are between softirqs in ksoftirqd. | |
76 | - * If we are in an RCU read-side critical section, we need to reflect | |
77 | - * that in the state of the rcu_node structure corresponding to this CPU. | |
78 | - * Caller must disable hardirqs. | |
76 | + * We have entered the scheduler, and the current task might soon be | |
77 | + * context-switched away from. If this task is in an RCU read-side | |
78 | + * critical section, we will no longer be able to rely on the CPU to | |
79 | + * record that fact, so we enqueue the task on the appropriate entry | |
80 | + * of the blocked_tasks[] array. The task will dequeue itself when | |
81 | + * it exits the outermost enclosing RCU read-side critical section. | |
82 | + * Therefore, the current grace period cannot be permitted to complete | |
83 | + * until the blocked_tasks[] entry indexed by the low-order bit of | |
84 | + * rnp->gpnum empties. | |
85 | + * | |
86 | + * Caller must disable preemption. | |
79 | 87 | */ |
80 | -static void rcu_preempt_qs(int cpu) | |
88 | +static void rcu_preempt_note_context_switch(int cpu) | |
81 | 89 | { |
82 | 90 | struct task_struct *t = current; |
91 | + unsigned long flags; | |
83 | 92 | int phase; |
84 | 93 | struct rcu_data *rdp; |
85 | 94 | struct rcu_node *rnp; |
... | ... | @@ -90,7 +99,7 @@ |
90 | 99 | /* Possibly blocking in an RCU read-side critical section. */ |
91 | 100 | rdp = rcu_preempt_state.rda[cpu]; |
92 | 101 | rnp = rdp->mynode; |
93 | - spin_lock(&rnp->lock); | |
102 | + spin_lock_irqsave(&rnp->lock, flags); | |
94 | 103 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
95 | 104 | t->rcu_blocked_node = rnp; |
96 | 105 | |
97 | 106 | |
98 | 107 | |
... | ... | @@ -103,11 +112,15 @@ |
103 | 112 | * state for the current grace period), then as long |
104 | 113 | * as that task remains queued, the current grace period |
105 | 114 | * cannot end. |
115 | + * | |
116 | + * But first, note that the current CPU must still be | |
117 | + * on line! | |
106 | 118 | */ |
107 | - phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1); | |
119 | + WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); | |
120 | + WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | |
121 | + phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; | |
108 | 122 | list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); |
109 | - smp_mb(); /* Ensure later ctxt swtch seen after above. */ | |
110 | - spin_unlock(&rnp->lock); | |
123 | + spin_unlock_irqrestore(&rnp->lock, flags); | |
111 | 124 | } |
112 | 125 | |
113 | 126 | /* |
... | ... | @@ -119,9 +132,10 @@ |
119 | 132 | * grace period, then the fact that the task has been enqueued |
120 | 133 | * means that we continue to block the current grace period. |
121 | 134 | */ |
122 | - rcu_preempt_qs_record(cpu); | |
123 | - t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS | | |
124 | - RCU_READ_UNLOCK_GOT_QS); | |
135 | + rcu_preempt_qs(cpu); | |
136 | + local_irq_save(flags); | |
137 | + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | |
138 | + local_irq_restore(flags); | |
125 | 139 | } |
126 | 140 | |
127 | 141 | /* |
... | ... | @@ -157,7 +171,7 @@ |
157 | 171 | special = t->rcu_read_unlock_special; |
158 | 172 | if (special & RCU_READ_UNLOCK_NEED_QS) { |
159 | 173 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; |
160 | - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS; | |
174 | + rcu_preempt_qs(smp_processor_id()); | |
161 | 175 | } |
162 | 176 | |
163 | 177 | /* Hardware IRQ handlers cannot block. */ |
164 | 178 | |
... | ... | @@ -177,10 +191,10 @@ |
177 | 191 | */ |
178 | 192 | for (;;) { |
179 | 193 | rnp = t->rcu_blocked_node; |
180 | - spin_lock(&rnp->lock); | |
194 | + spin_lock(&rnp->lock); /* irqs already disabled. */ | |
181 | 195 | if (rnp == t->rcu_blocked_node) |
182 | 196 | break; |
183 | - spin_unlock(&rnp->lock); | |
197 | + spin_unlock(&rnp->lock); /* irqs remain disabled. */ | |
184 | 198 | } |
185 | 199 | empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); |
186 | 200 | list_del_init(&t->rcu_node_entry); |
... | ... | @@ -194,9 +208,8 @@ |
194 | 208 | */ |
195 | 209 | if (!empty && rnp->qsmask == 0 && |
196 | 210 | list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) { |
197 | - t->rcu_read_unlock_special &= | |
198 | - ~(RCU_READ_UNLOCK_NEED_QS | | |
199 | - RCU_READ_UNLOCK_GOT_QS); | |
211 | + struct rcu_node *rnp_p; | |
212 | + | |
200 | 213 | if (rnp->parent == NULL) { |
201 | 214 | /* Only one rcu_node in the tree. */ |
202 | 215 | cpu_quiet_msk_finish(&rcu_preempt_state, flags); |
... | ... | @@ -205,9 +218,10 @@ |
205 | 218 | /* Report up the rest of the hierarchy. */ |
206 | 219 | mask = rnp->grpmask; |
207 | 220 | spin_unlock_irqrestore(&rnp->lock, flags); |
208 | - rnp = rnp->parent; | |
209 | - spin_lock_irqsave(&rnp->lock, flags); | |
210 | - cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags); | |
221 | + rnp_p = rnp->parent; | |
222 | + spin_lock_irqsave(&rnp_p->lock, flags); | |
223 | + WARN_ON_ONCE(rnp->qsmask); | |
224 | + cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); | |
211 | 225 | return; |
212 | 226 | } |
213 | 227 | spin_unlock(&rnp->lock); |
... | ... | @@ -259,6 +273,19 @@ |
259 | 273 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
260 | 274 | |
261 | 275 | /* |
276 | + * Check that the list of blocked tasks for the newly completed grace | |
277 | + * period is in fact empty. It is a serious bug to complete a grace | |
278 | + * period that still has RCU readers blocked! This function must be | |
279 | + * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock | |
280 | + * must be held by the caller. | |
281 | + */ | |
282 | +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | |
283 | +{ | |
284 | + WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])); | |
285 | + WARN_ON_ONCE(rnp->qsmask); | |
286 | +} | |
287 | + | |
288 | +/* | |
262 | 289 | * Check for preempted RCU readers for the specified rcu_node structure. |
263 | 290 | * If the caller needs a reliable answer, it must hold the rcu_node's |
264 | 291 | * >lock. |
... | ... | @@ -280,7 +307,8 @@ |
280 | 307 | * The caller must hold rnp->lock with irqs disabled. |
281 | 308 | */ |
282 | 309 | static void rcu_preempt_offline_tasks(struct rcu_state *rsp, |
283 | - struct rcu_node *rnp) | |
310 | + struct rcu_node *rnp, | |
311 | + struct rcu_data *rdp) | |
284 | 312 | { |
285 | 313 | int i; |
286 | 314 | struct list_head *lp; |
... | ... | @@ -292,6 +320,9 @@ |
292 | 320 | WARN_ONCE(1, "Last CPU thought to be offlined?"); |
293 | 321 | return; /* Shouldn't happen: at least one CPU online. */ |
294 | 322 | } |
323 | + WARN_ON_ONCE(rnp != rdp->mynode && | |
324 | + (!list_empty(&rnp->blocked_tasks[0]) || | |
325 | + !list_empty(&rnp->blocked_tasks[1]))); | |
295 | 326 | |
296 | 327 | /* |
297 | 328 | * Move tasks up to root rcu_node. Rely on the fact that the |
298 | 329 | |
... | ... | @@ -335,20 +366,12 @@ |
335 | 366 | struct task_struct *t = current; |
336 | 367 | |
337 | 368 | if (t->rcu_read_lock_nesting == 0) { |
338 | - t->rcu_read_unlock_special &= | |
339 | - ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS); | |
340 | - rcu_preempt_qs_record(cpu); | |
369 | + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | |
370 | + rcu_preempt_qs(cpu); | |
341 | 371 | return; |
342 | 372 | } |
343 | - if (per_cpu(rcu_preempt_data, cpu).qs_pending) { | |
344 | - if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) { | |
345 | - rcu_preempt_qs_record(cpu); | |
346 | - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS; | |
347 | - } else if (!(t->rcu_read_unlock_special & | |
348 | - RCU_READ_UNLOCK_NEED_QS)) { | |
349 | - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | |
350 | - } | |
351 | - } | |
373 | + if (per_cpu(rcu_preempt_data, cpu).qs_pending) | |
374 | + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | |
352 | 375 | } |
353 | 376 | |
354 | 377 | /* |
... | ... | @@ -434,7 +457,7 @@ |
434 | 457 | * Because preemptable RCU does not exist, we never have to check for |
435 | 458 | * CPUs being in quiescent states. |
436 | 459 | */ |
437 | -static void rcu_preempt_qs(int cpu) | |
460 | +static void rcu_preempt_note_context_switch(int cpu) | |
438 | 461 | { |
439 | 462 | } |
440 | 463 | |
... | ... | @@ -451,6 +474,16 @@ |
451 | 474 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
452 | 475 | |
453 | 476 | /* |
477 | + * Because there is no preemptable RCU, there can be no readers blocked, | |
478 | + * so there is no need to check for blocked tasks. So check only for | |
479 | + * bogus qsmask values. | |
480 | + */ | |
481 | +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | |
482 | +{ | |
483 | + WARN_ON_ONCE(rnp->qsmask); | |
484 | +} | |
485 | + | |
486 | +/* | |
454 | 487 | * Because preemptable RCU does not exist, there are never any preempted |
455 | 488 | * RCU readers. |
456 | 489 | */ |
... | ... | @@ -466,7 +499,8 @@ |
466 | 499 | * tasks that were blocked within RCU read-side critical sections. |
467 | 500 | */ |
468 | 501 | static void rcu_preempt_offline_tasks(struct rcu_state *rsp, |
469 | - struct rcu_node *rnp) | |
502 | + struct rcu_node *rnp, | |
503 | + struct rcu_data *rdp) | |
470 | 504 | { |
471 | 505 | } |
472 | 506 |
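The rcutree_plugin.h changes above rename rcu_preempt_qs()/rcu_preempt_note_context_switch() and tighten how a reader that is preempted inside its critical section gets queued on rnp->blocked_tasks[] and later reports its quiescent state. A hedged reader-side sketch of the situation being handled (hypothetical names):

/*
 * Illustrative TREE_PREEMPT_RCU reader; "struct item" and "cur_item"
 * are made-up names.  Preemption inside the section is legal under
 * TREE_PREEMPT_RCU; blocking is not.
 */
#include <linux/rcupdate.h>

struct item {
	int key;
};

static struct item *cur_item;

static int read_key(void)
{
	struct item *it;
	int key = 0;

	rcu_read_lock();
	it = rcu_dereference(cur_item);
	if (it)
		key = it->key;	/* the task may be preempted here and queued
				 * on the rcu_node's blocked_tasks[] list */
	rcu_read_unlock();	/* rcu_read_unlock_special() dequeues the task
				 * and, if needed, reports the quiescent state */
	return key;
}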
kernel/rcutree_trace.c