Commit 2af49b6058d857fa5b476db642d4452bf5833ecd

Authored by Linus Torvalds

Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  rcu: remove unused __list_for_each_rcu() macro
  rculist: fix borked __list_for_each_rcu() macro
  rcu: reduce __call_rcu()-induced contention on rcu_node structures
  rcu: limit rcu_node leaf-level fanout
  rcu: fine-tune grace-period begin/end checks
  rcu: Keep gpnum and completed fields synchronized
  rcu: Stop chasing QS if another CPU did it for us
  rcu: increase synchronize_sched_expedited() batching
  rcu: Make synchronize_srcu_expedited() fast if running readers
  rcu: fix race condition in synchronize_sched_expedited()
  rcu: update documentation/comments for Lai's adoption patch
  rcu,cleanup: simplify the code when cpu is dying
  rcu,cleanup: move synchronize_sched_expedited() out of sched.c
  rcu: get rid of obsolete "classic" names in TREE_RCU tracing
  rcu: Distinguish between boosting and boosted
  rcu: document TINY_RCU and TINY_PREEMPT_RCU tracing.
  rcu: add tracing for TINY_RCU and TINY_PREEMPT_RCU
  rcu: priority boosting for TINY_PREEMPT_RCU
  rcu: move TINY_RCU from softirq to kthread
  rcu: add priority-inversion testing to rcutorture

Showing 17 changed files

Documentation/RCU/trace.txt
1 1 CONFIG_RCU_TRACE debugfs Files and Formats
2 2  
3 3  
4   -The rcutree implementation of RCU provides debugfs trace output that
5   -summarizes counters and state. This information is useful for debugging
6   -RCU itself, and can sometimes also help to debug abuses of RCU.
7   -The following sections describe the debugfs files and formats.
  4 +The rcutree and rcutiny implementations of RCU provide debugfs trace
  5 +output that summarizes counters and state. This information is useful for
  6 +debugging RCU itself, and can sometimes also help to debug abuses of RCU.
  7 +The following sections describe the debugfs files and formats, first
  8 +for rcutree and next for rcutiny.
8 9  
9 10  
10   -Hierarchical RCU debugfs Files and Formats
  11 +CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
11 12  
12   -This implementation of RCU provides three debugfs files under the
  13 +These implementations of RCU provide five debugfs files under the
13 14 top-level directory RCU: rcu/rcudata (which displays fields in struct
14   -rcu_data), rcu/rcugp (which displays grace-period counters), and
15   -rcu/rcuhier (which displays the struct rcu_node hierarchy).
  15 +rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
  16 +rcu/rcudata), rcu/rcugp (which displays grace-period counters),
  17 +rcu/rcuhier (which displays the struct rcu_node hierarchy), and
  18 +rcu/rcu_pending (which displays counts of the reasons that the
  19 +rcu_pending() function decided that there was core RCU work to do).
16 20  
17 21 The output of "cat rcu/rcudata" looks as follows:
18 22  
... ... @@ -130,7 +134,8 @@
130 134 been registered in absence of CPU-hotplug activity.
131 135  
132 136 o "co" is the number of RCU callbacks that have been orphaned due to
133   - this CPU going offline.
  137 + this CPU going offline. These orphaned callbacks have been moved
  138 + to an arbitrarily chosen online CPU.
134 139  
135 140 o "ca" is the number of RCU callbacks that have been adopted due to
136 141 other CPUs going offline. Note that ci+co-ca+ql is the number of
137 142  
... ... @@ -168,12 +173,12 @@
168 173  
169 174 The output of "cat rcu/rcuhier" looks as follows, with very long lines:
170 175  
171   -c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
  176 +c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
172 177 1/1 .>. 0:127 ^0
173 178 3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
174 179 3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
175 180 rcu_bh:
176   -c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
  181 +c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
177 182 0/1 .>. 0:127 ^0
178 183 0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
179 184 0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
... ... @@ -212,11 +217,6 @@
212 217 exited immediately (without even being counted in nfqs above)
213 218 due to contention on ->fqslock.
214 219  
215   -o "oqlen" is the number of callbacks on the "orphan" callback
216   - list. RCU callbacks are placed on this list by CPUs going
217   - offline, and are "adopted" either by the CPU helping the outgoing
218   - CPU or by the next rcu_barrier*() call, whichever comes first.
219   -
220 220 o Each element of the form "1/1 0:127 ^0" represents one struct
221 221 rcu_node. Each line represents one level of the hierarchy, from
222 222 root to leaves. It is best to think of the rcu_data structures
... ... @@ -326,4 +326,116 @@
326 326 readers will note that the rcu "nn" number for a given CPU very
327 327 closely matches the rcu_bh "np" number for that same CPU. This
328 328 is due to short-circuit evaluation in rcu_pending().
  329 +
  330 +
  331 +CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
  332 +
  333 +These implementations of RCU provide a single debugfs file under the
  334 +top-level directory RCU, namely rcu/rcudata, which displays fields in
  335 +rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
  336 +rcu_preempt_ctrlblk.
  337 +
  338 +The output of "cat rcu/rcudata" is as follows:
  339 +
  340 +rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
  341 + ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
  342 + normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
  343 + exp balk: bt=0 nos=0
  344 +rcu_sched: qlen: 0
  345 +rcu_bh: qlen: 0
  346 +
  347 +This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
  348 +rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
  349 +The last three lines of the rcu_preempt section appear only in
  350 +CONFIG_RCU_BOOST kernel builds. The fields are as follows:
  351 +
  352 +o "qlen" is the number of RCU callbacks currently waiting either
  353 + for an RCU grace period or waiting to be invoked. This is the
  354 + only field present for rcu_sched and rcu_bh, due to the
  355 + short-circuiting of grace periods in those two cases.
  356 +
  357 +o "gp" is the number of grace periods that have completed.
  358 +
  359 +o "g197/p197/c197" displays the grace-period state, with the
  360 + "g" number being the number of grace periods that have started
  361 + (mod 256), the "p" number being the number of grace periods
  362 + that the CPU has responded to (also mod 256), and the "c"
  363 + number being the number of grace periods that have completed
  364 + (once again mod 256).
  365 +
  366 + Why have both "gp" and "g"? Because the data flowing into
  367 + "gp" is only present in a CONFIG_RCU_TRACE kernel.
  368 +
  369 +o "tasks" is a set of bits. The first bit is "T" if there are
  370 + currently tasks that have recently blocked within an RCU
  371 + read-side critical section, the second bit is "N" if any of the
  372 + aforementioned tasks are blocking the current RCU grace period,
  373 + and the third bit is "E" if any of the aforementioned tasks are
  374 + blocking the current expedited grace period. Each bit is "."
  375 + if the corresponding condition does not hold.
  376 +
  377 +o "ttb" is a single bit. It is "B" if any of the blocked tasks
  378 + need to be priority boosted and "." otherwise.
  379 +
  380 +o "btg" indicates whether boosting has been carried out during
  381 + the current grace period, with "exp" indicating that boosting
  382 + is in progress for an expedited grace period, "no" indicating
  383 + that boosting has not yet started for a normal grace period,
  384 + period, "begun" indicating that boosting has begun for a normal grace
  385 + period, and "done" indicating that boosting has completed for
  386 + a normal grace period.
  387 +
  388 +o "ntb" is the total number of tasks subjected to RCU priority boosting
  389 + since boot.
  390 +
  391 +o "neb" is the number of expedited grace periods that have had
  392 + to resort to RCU priority boosting since boot.
  393 +
  394 +o "nnb" is the number of normal grace periods that have had
  395 + to resort to RCU priority boosting since boot.
  396 +
  397 +o "j" is the low-order 16 bits of the jiffies counter in hexadecimal.
  398 +
  399 +o "bt" is the low-order 16 bits of the value that the jiffies counter
  400 + will have at the next time that boosting is scheduled to begin.
  401 +
  402 +o In the line beginning with "normal balk", the fields are as follows:
  403 +
  404 + o "nt" is the number of times that the system balked from
  405 + boosting because there were no blocked tasks to boost.
  406 + Note that the system will balk from boosting even if the
  407 + grace period is overdue when the currently running task
  408 + is looping within an RCU read-side critical section.
  409 + There is no point in boosting in this case, because
  410 + boosting a running task won't make it run any faster.
  411 +
  412 + o "gt" is the number of times that the system balked
  413 + from boosting because, although there were blocked tasks,
  414 + none of them were preventing the current grace period
  415 + from completing.
  416 +
  417 + o "bt" is the number of times that the system balked
  418 + from boosting because boosting was already in progress.
  419 +
  420 + o "b" is the number of times that the system balked from
  421 + boosting because boosting had already completed for
  422 + the grace period in question.
  423 +
  424 + o "ny" is the number of times that the system balked from
  425 + boosting because it was not yet time to start boosting
  426 + the grace period in question.
  427 +
  428 + o "nos" is the number of times that the system balked from
  429 + boosting for inexplicable ("not otherwise specified")
  430 + reasons. This can actually happen due to races involving
  431 + increments of the jiffies counter.
  432 +
  433 +o In the line beginning with "exp balk", the fields are as follows:
  434 +
  435 + o "bt" is the number of times that the system balked from
  436 + boosting because there were no blocked tasks to boost.
  437 +
  438 + o "nos" is the number of times that the system balked from
  439 + boosting for inexplicable ("not otherwise specified")
  440 + reasons.
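
As an aside on how the rcu/rcudata output above is produced: the single-character
flags and the wrapping g/p/c numbers come from plain C idioms in
show_tiny_preempt_stats() later in this patch, namely u8 counters (so everything
is mod 256) and the two-character string-indexing trick "T."[cond]. A stand-alone
sketch of both idioms follows; the struct and variable names are illustrative
only, not the kernel's.

#include <stdio.h>

/* Illustrative stand-in for the TINY_PREEMPT_RCU grace-period counters:
 * the kernel fields are u8, so the "g", "p", and "c" numbers wrap mod 256. */
struct gp_counters {
        unsigned char gpnum;            /* "g": grace periods started */
        unsigned char gpcpu;            /* "p": grace periods this CPU responded to */
        unsigned char completed;        /* "c": grace periods completed */
};

int main(void)
{
        struct gp_counters gp = { .gpnum = 197, .gpcpu = 197, .completed = 197 };
        int blkd_tasks_empty = 0;       /* pretend some readers have blocked... */
        int gp_tasks_null = 1;          /* ...but none block the current GP */

        /* "T."[cond] yields 'T' when cond is 0 and '.' when it is 1, just as
         * in the kernel's seq_printf() calls. */
        printf("g%d/p%d/c%d tasks=%c%c\n", gp.gpnum, gp.gpcpu, gp.completed,
               "T."[blkd_tasks_empty], "N."[gp_tasks_null]);

        gp.gpnum = 255;
        gp.gpnum++;                     /* u8 arithmetic wraps: 255 + 1 == 0 */
        printf("g%d\n", gp.gpnum);
        return 0;
}
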
include/linux/init_task.h
... ... @@ -83,6 +83,12 @@
83 83 */
84 84 # define CAP_INIT_BSET CAP_FULL_SET
85 85  
  86 +#ifdef CONFIG_RCU_BOOST
  87 +#define INIT_TASK_RCU_BOOST() \
  88 + .rcu_boost_mutex = NULL,
  89 +#else
  90 +#define INIT_TASK_RCU_BOOST()
  91 +#endif
86 92 #ifdef CONFIG_TREE_PREEMPT_RCU
87 93 #define INIT_TASK_RCU_TREE_PREEMPT() \
88 94 .rcu_blocked_node = NULL,
... ... @@ -94,7 +100,8 @@
94 100 .rcu_read_lock_nesting = 0, \
95 101 .rcu_read_unlock_special = 0, \
96 102 .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \
97   - INIT_TASK_RCU_TREE_PREEMPT()
  103 + INIT_TASK_RCU_TREE_PREEMPT() \
  104 + INIT_TASK_RCU_BOOST()
98 105 #else
99 106 #define INIT_TASK_RCU_PREEMPT(tsk)
100 107 #endif
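
The INIT_TASK_RCU_BOOST() addition above uses the kernel's usual pattern for
optional fields in a static initializer: a helper macro expands to a
designated-initializer fragment when the option is configured and to nothing
otherwise, so INIT_TASK_RCU_PREEMPT() can simply string the helpers together.
A minimal user-space sketch of that pattern (FEATURE_X, struct demo, and the
INIT_* names are made up for illustration):

#include <stdio.h>

#define FEATURE_X 1                     /* stands in for CONFIG_RCU_BOOST */

struct demo {
        int always;
#if FEATURE_X
        void *only_with_x;              /* stands in for ->rcu_boost_mutex */
#endif
};

#if FEATURE_X
#define INIT_DEMO_X()   .only_with_x = NULL,
#else
#define INIT_DEMO_X()
#endif

/* The trailing comma from INIT_DEMO_X() is harmless in an initializer list,
 * and the whole fragment vanishes when FEATURE_X is 0. */
#define INIT_DEMO       { .always = 1, INIT_DEMO_X() }

int main(void)
{
        struct demo d = INIT_DEMO;

        printf("always=%d\n", d.always);
        return 0;
}
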
include/linux/rculist.h
... ... @@ -241,11 +241,6 @@
241 241 #define list_first_entry_rcu(ptr, type, member) \
242 242 list_entry_rcu((ptr)->next, type, member)
243 243  
244   -#define __list_for_each_rcu(pos, head) \
245   - for (pos = rcu_dereference_raw(list_next_rcu(head)); \
246   - pos != (head); \
247   - pos = rcu_dereference_raw(list_next_rcu((pos)))
248   -
249 244 /**
250 245 * list_for_each_entry_rcu - iterate over rcu list of given type
251 246 * @pos: the type * to use as a loop cursor.
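
With the unused __list_for_each_rcu() macro removed above, RCU list traversal
is expected to go through list_for_each_entry_rcu(), documented just below the
deleted lines. A kernel-context sketch of the usual reader-side pattern; the
struct, list, and function names are made up, and this is a fragment for a
kernel build, not a stand-alone program:

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/printk.h>

struct foo {
        int val;
        struct list_head list;          /* linked into foo_list via list_*_rcu() */
};

static LIST_HEAD(foo_list);             /* hypothetical RCU-protected list */

static void print_foos(void)
{
        struct foo *p;

        /* Readers traverse under rcu_read_lock(); writers publish with
         * list_add_rcu() and wait a grace period before freeing entries. */
        rcu_read_lock();
        list_for_each_entry_rcu(p, &foo_list, list)
                pr_info("foo: val=%d\n", p->val);
        rcu_read_unlock();
}
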
include/linux/rcupdate.h
... ... @@ -47,6 +47,8 @@
47 47 extern int rcutorture_runnable; /* for sysctl */
48 48 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
49 49  
  50 +#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
  51 +#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
50 52 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
51 53 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
52 54  
... ... @@ -66,7 +68,6 @@
66 68 extern void synchronize_sched(void);
67 69 extern void rcu_barrier_bh(void);
68 70 extern void rcu_barrier_sched(void);
69   -extern void synchronize_sched_expedited(void);
70 71 extern int sched_expedited_torture_stats(char *page);
71 72  
72 73 static inline void __rcu_read_lock_bh(void)
... ... @@ -118,7 +119,6 @@
118 119 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
119 120  
120 121 /* Internal to kernel */
121   -extern void rcu_init(void);
122 122 extern void rcu_sched_qs(int cpu);
123 123 extern void rcu_bh_qs(int cpu);
124 124 extern void rcu_check_callbacks(int cpu, int user);
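
The UINT_CMP_GE()/UINT_CMP_LT() macros added above (alongside the existing
ULONG_CMP_* pair) compare free-running unsigned counters in a wraparound-tolerant
way: a counts as "at or after" b whenever the unsigned difference (a) - (b) is
at most half the counter range, the same idea behind the kernel's time_after()
helpers. A small stand-alone demonstration:

#include <limits.h>
#include <stdio.h>

#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))

int main(void)
{
        unsigned int snap_old = UINT_MAX - 1;   /* counter just before wrapping */
        unsigned int snap_new = snap_old + 3;   /* wraps around to 1 */

        /* snap_new is numerically smaller but logically later than snap_old. */
        printf("GE(new, old) = %d\n", UINT_CMP_GE(snap_new, snap_old)); /* 1 */
        printf("LT(old, new) = %d\n", UINT_CMP_LT(snap_old, snap_new)); /* 1 */
        return 0;
}
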
include/linux/rcutiny.h
... ... @@ -27,7 +27,9 @@
27 27  
28 28 #include <linux/cache.h>
29 29  
30   -#define rcu_init_sched() do { } while (0)
  30 +static inline void rcu_init(void)
  31 +{
  32 +}
31 33  
32 34 #ifdef CONFIG_TINY_RCU
33 35  
... ... @@ -58,6 +60,11 @@
58 60 synchronize_sched();
59 61 }
60 62  
  63 +static inline void synchronize_sched_expedited(void)
  64 +{
  65 + synchronize_sched();
  66 +}
  67 +
61 68 #ifdef CONFIG_TINY_RCU
62 69  
63 70 static inline void rcu_preempt_note_context_switch(void)
64 71  
65 72  
66 73  
... ... @@ -125,16 +132,12 @@
125 132 }
126 133  
127 134 #ifdef CONFIG_DEBUG_LOCK_ALLOC
128   -
129 135 extern int rcu_scheduler_active __read_mostly;
130 136 extern void rcu_scheduler_starting(void);
131   -
132 137 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
133   -
134 138 static inline void rcu_scheduler_starting(void)
135 139 {
136 140 }
137   -
138 141 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
139 142  
140 143 #endif /* __LINUX_RCUTINY_H */
include/linux/rcutree.h
... ... @@ -30,6 +30,7 @@
30 30 #ifndef __LINUX_RCUTREE_H
31 31 #define __LINUX_RCUTREE_H
32 32  
  33 +extern void rcu_init(void);
33 34 extern void rcu_note_context_switch(int cpu);
34 35 extern int rcu_needs_cpu(int cpu);
35 36 extern void rcu_cpu_stall_reset(void);
... ... @@ -47,6 +48,7 @@
47 48 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
48 49  
49 50 extern void synchronize_rcu_bh(void);
  51 +extern void synchronize_sched_expedited(void);
50 52 extern void synchronize_rcu_expedited(void);
51 53  
52 54 static inline void synchronize_rcu_bh_expedited(void)
include/linux/sched.h
... ... @@ -1229,6 +1229,9 @@
1229 1229 #ifdef CONFIG_TREE_PREEMPT_RCU
1230 1230 struct rcu_node *rcu_blocked_node;
1231 1231 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
  1232 +#ifdef CONFIG_RCU_BOOST
  1233 + struct rt_mutex *rcu_boost_mutex;
  1234 +#endif /* #ifdef CONFIG_RCU_BOOST */
1232 1235  
1233 1236 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1234 1237 struct sched_info sched_info;
... ... @@ -1759,7 +1762,8 @@
1759 1762 #ifdef CONFIG_PREEMPT_RCU
1760 1763  
1761 1764 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
1762   -#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
  1765 +#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
  1766 +#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
1763 1767  
1764 1768 static inline void rcu_copy_process(struct task_struct *p)
1765 1769 {
... ... @@ -1767,7 +1771,10 @@
1767 1771 p->rcu_read_unlock_special = 0;
1768 1772 #ifdef CONFIG_TREE_PREEMPT_RCU
1769 1773 p->rcu_blocked_node = NULL;
1770   -#endif
  1774 +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
  1775 +#ifdef CONFIG_RCU_BOOST
  1776 + p->rcu_boost_mutex = NULL;
  1777 +#endif /* #ifdef CONFIG_RCU_BOOST */
1771 1778 INIT_LIST_HEAD(&p->rcu_node_entry);
1772 1779 }
1773 1780  
init/Kconfig
... ... @@ -393,7 +393,6 @@
393 393  
394 394 config RCU_TRACE
395 395 bool "Enable tracing for RCU"
396   - depends on TREE_RCU || TREE_PREEMPT_RCU
397 396 help
398 397 This option provides tracing in RCU which presents stats
399 398 in debugfs for debugging RCU implementation.
... ... @@ -458,6 +457,60 @@
458 457 This option provides tracing for the TREE_RCU and
459 458 TREE_PREEMPT_RCU implementations, permitting Makefile to
460 459 trivially select kernel/rcutree_trace.c.
  460 +
  461 +config RCU_BOOST
  462 + bool "Enable RCU priority boosting"
  463 + depends on RT_MUTEXES && TINY_PREEMPT_RCU
  464 + default n
  465 + help
  466 + This option boosts the priority of preempted RCU readers that
  467 + block the current preemptible RCU grace period for too long.
  468 + This option also prevents heavy loads from blocking RCU
  469 + callback invocation for all flavors of RCU.
  470 +
  471 + Say Y here if you are working with real-time apps or heavy loads.
  472 + Say N here if you are unsure.
  473 +
  474 +config RCU_BOOST_PRIO
  475 + int "Real-time priority to boost RCU readers to"
  476 + range 1 99
  477 + depends on RCU_BOOST
  478 + default 1
  479 + help
  480 + This option specifies the real-time priority to which preempted
  481 + RCU readers are to be boosted. If you are working with CPU-bound
  482 + real-time applications, you should specify a priority higher than
  483 + the highest-priority CPU-bound application.
  484 +
  485 + Specify the real-time priority, or take the default if unsure.
  486 +
  487 +config RCU_BOOST_DELAY
  488 + int "Milliseconds to delay boosting after RCU grace-period start"
  489 + range 0 3000
  490 + depends on RCU_BOOST
  491 + default 500
  492 + help
  493 + This option specifies the time to wait after the beginning of
  494 + a given grace period before priority-boosting preempted RCU
  495 + readers blocking that grace period. Note that any RCU reader
  496 + blocking an expedited RCU grace period is boosted immediately.
  497 +
  498 + Accept the default if unsure.
  499 +
  500 +config SRCU_SYNCHRONIZE_DELAY
  501 + int "Microseconds to delay before waiting for readers"
  502 + range 0 20
  503 + default 10
  504 + help
  505 + This option controls how long SRCU delays before entering its
  506 + loop waiting on SRCU readers. The purpose of this loop is
  507 + to avoid the unconditional context-switch penalty that would
  508 + otherwise be incurred if there was an active SRCU reader,
  509 + in a manner similar to adaptive locking schemes. This should
  510 + be set to be a bit longer than the common-case SRCU read-side
  511 + critical-section overhead.
  512 +
  513 + Accept the default if unsure.
461 514  
462 515 endmenu # "RCU Subsystem"
463 516  
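
For a concrete feel for RCU_BOOST_DELAY above: the TINY_PREEMPT_RCU code later
in this patch converts the millisecond value to jiffies as
DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000). A worked example; the HZ and
delay values below are illustrative, not taken from any particular configuration:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

/* Example build values only. */
#define HZ                      250
#define CONFIG_RCU_BOOST_DELAY  500     /* milliseconds */

int main(void)
{
        /* 500 ms at HZ=250 gives 125 ticks of grace before boosting starts. */
        printf("%d jiffies\n", DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000));
        return 0;
}
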
kernel/rcutiny.c
... ... @@ -36,31 +36,16 @@
36 36 #include <linux/time.h>
37 37 #include <linux/cpu.h>
38 38  
39   -/* Global control variables for rcupdate callback mechanism. */
40   -struct rcu_ctrlblk {
41   - struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
42   - struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
43   - struct rcu_head **curtail; /* ->next pointer of last CB. */
44   -};
  39 +/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
  40 +static struct task_struct *rcu_kthread_task;
  41 +static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
  42 +static unsigned long have_rcu_kthread_work;
  43 +static void invoke_rcu_kthread(void);
45 44  
46   -/* Definition for rcupdate control block. */
47   -static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48   - .donetail = &rcu_sched_ctrlblk.rcucblist,
49   - .curtail = &rcu_sched_ctrlblk.rcucblist,
50   -};
51   -
52   -static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53   - .donetail = &rcu_bh_ctrlblk.rcucblist,
54   - .curtail = &rcu_bh_ctrlblk.rcucblist,
55   -};
56   -
57   -#ifdef CONFIG_DEBUG_LOCK_ALLOC
58   -int rcu_scheduler_active __read_mostly;
59   -EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60   -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61   -
62 45 /* Forward declarations for rcutiny_plugin.h. */
63   -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
  46 +struct rcu_ctrlblk;
  47 +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
  48 +static int rcu_kthread(void *arg);
64 49 static void __call_rcu(struct rcu_head *head,
65 50 void (*func)(struct rcu_head *rcu),
66 51 struct rcu_ctrlblk *rcp);
... ... @@ -123,7 +108,7 @@
123 108 {
124 109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
125 110 rcu_qsctr_help(&rcu_bh_ctrlblk))
126   - raise_softirq(RCU_SOFTIRQ);
  111 + invoke_rcu_kthread();
127 112 }
128 113  
129 114 /*
... ... @@ -132,7 +117,7 @@
132 117 void rcu_bh_qs(int cpu)
133 118 {
134 119 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
135   - raise_softirq(RCU_SOFTIRQ);
  120 + invoke_rcu_kthread();
136 121 }
137 122  
138 123 /*
139 124  
140 125  
... ... @@ -152,13 +137,14 @@
152 137 }
153 138  
154 139 /*
155   - * Helper function for rcu_process_callbacks() that operates on the
156   - * specified rcu_ctrlkblk structure.
  140 + * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
  141 + * whose grace period has elapsed.
157 142 */
158   -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
  143 +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
159 144 {
160 145 struct rcu_head *next, *list;
161 146 unsigned long flags;
  147 + RCU_TRACE(int cb_count = 0);
162 148  
163 149 /* If no RCU callbacks ready to invoke, just return. */
164 150 if (&rcp->rcucblist == rcp->donetail)
165 151  
166 152  
167 153  
168 154  
169 155  
170 156  
171 157  
... ... @@ -180,22 +166,61 @@
180 166 next = list->next;
181 167 prefetch(next);
182 168 debug_rcu_head_unqueue(list);
  169 + local_bh_disable();
183 170 list->func(list);
  171 + local_bh_enable();
184 172 list = next;
  173 + RCU_TRACE(cb_count++);
185 174 }
  175 + RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186 176 }
187 177  
188 178 /*
189   - * Invoke any callbacks whose grace period has completed.
  179 + * This kthread invokes RCU callbacks whose grace periods have
  180 + * elapsed. It is awakened as needed, and takes the place of the
  181 + * RCU_SOFTIRQ that was used previously for this purpose.
  182 + * This is a kthread, but it is never stopped, at least not until
  183 + * the system goes down.
190 184 */
191   -static void rcu_process_callbacks(struct softirq_action *unused)
  185 +static int rcu_kthread(void *arg)
192 186 {
193   - __rcu_process_callbacks(&rcu_sched_ctrlblk);
194   - __rcu_process_callbacks(&rcu_bh_ctrlblk);
195   - rcu_preempt_process_callbacks();
  187 + unsigned long work;
  188 + unsigned long morework;
  189 + unsigned long flags;
  190 +
  191 + for (;;) {
  192 + wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
  193 + morework = rcu_boost();
  194 + local_irq_save(flags);
  195 + work = have_rcu_kthread_work;
  196 + have_rcu_kthread_work = morework;
  197 + local_irq_restore(flags);
  198 + if (work) {
  199 + rcu_process_callbacks(&rcu_sched_ctrlblk);
  200 + rcu_process_callbacks(&rcu_bh_ctrlblk);
  201 + rcu_preempt_process_callbacks();
  202 + }
  203 + schedule_timeout_interruptible(1); /* Leave CPU for others. */
  204 + }
  205 +
  206 + return 0; /* Not reached, but needed to shut gcc up. */
196 207 }
197 208  
198 209 /*
  210 + * Wake up rcu_kthread() to process callbacks now eligible for invocation
  211 + * or to boost readers.
  212 + */
  213 +static void invoke_rcu_kthread(void)
  214 +{
  215 + unsigned long flags;
  216 +
  217 + local_irq_save(flags);
  218 + have_rcu_kthread_work = 1;
  219 + wake_up(&rcu_kthread_wq);
  220 + local_irq_restore(flags);
  221 +}
  222 +
  223 +/*
199 224 * Wait for a grace period to elapse. But it is illegal to invoke
200 225 * synchronize_sched() from within an RCU read-side critical section.
201 226 * Therefore, any legal call to synchronize_sched() is a quiescent
... ... @@ -230,6 +255,7 @@
230 255 local_irq_save(flags);
231 256 *rcp->curtail = head;
232 257 rcp->curtail = &head->next;
  258 + RCU_TRACE(rcp->qlen++);
233 259 local_irq_restore(flags);
234 260 }
235 261  
236 262  
237 263  
... ... @@ -282,8 +308,17 @@
282 308 }
283 309 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
284 310  
285   -void __init rcu_init(void)
  311 +/*
  312 + * Spawn the kthread that invokes RCU callbacks.
  313 + */
  314 +static int __init rcu_spawn_kthreads(void)
286 315 {
287   - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  316 + struct sched_param sp;
  317 +
  318 + rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
  319 + sp.sched_priority = RCU_BOOST_PRIO;
  320 + sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
  321 + return 0;
288 322 }
  323 +early_initcall(rcu_spawn_kthreads);
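
The softirq-to-kthread conversion above is a "pending-work flag plus sleeping
worker" handoff: invoke_rcu_kthread() sets have_rcu_kthread_work and wakes the
waitqueue, and rcu_kthread() drains the flag, invokes callbacks, and goes back
to sleep. A rough user-space analogue using POSIX threads, offered only to
illustrate the shape of the handoff; the kernel version disables interrupts
around the flag instead of taking a mutex, and runs the worker at SCHED_FIFO.
Build with -pthread.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int have_work;                   /* cf. have_rcu_kthread_work */

static void invoke_worker(void)         /* cf. invoke_rcu_kthread() */
{
        pthread_mutex_lock(&lock);
        have_work = 1;
        pthread_cond_signal(&cond);     /* cf. wake_up(&rcu_kthread_wq) */
        pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)          /* cf. rcu_kthread() */
{
        (void)arg;
        for (;;) {
                pthread_mutex_lock(&lock);
                while (!have_work)
                        pthread_cond_wait(&cond, &lock);
                have_work = 0;          /* claim the pending work */
                pthread_mutex_unlock(&lock);
                printf("invoking ready callbacks\n");
        }
        return NULL;
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
        invoke_worker();
        sleep(1);                       /* let the worker run once, then exit */
        return 0;
}
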
kernel/rcutiny_plugin.h
... ... @@ -22,6 +22,40 @@
22 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 23 */
24 24  
  25 +#include <linux/kthread.h>
  26 +#include <linux/debugfs.h>
  27 +#include <linux/seq_file.h>
  28 +
  29 +#ifdef CONFIG_RCU_TRACE
  30 +#define RCU_TRACE(stmt) stmt
  31 +#else /* #ifdef CONFIG_RCU_TRACE */
  32 +#define RCU_TRACE(stmt)
  33 +#endif /* #else #ifdef CONFIG_RCU_TRACE */
  34 +
  35 +/* Global control variables for rcupdate callback mechanism. */
  36 +struct rcu_ctrlblk {
  37 + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
  38 + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
  39 + struct rcu_head **curtail; /* ->next pointer of last CB. */
  40 + RCU_TRACE(long qlen); /* Number of pending CBs. */
  41 +};
  42 +
  43 +/* Definition for rcupdate control block. */
  44 +static struct rcu_ctrlblk rcu_sched_ctrlblk = {
  45 + .donetail = &rcu_sched_ctrlblk.rcucblist,
  46 + .curtail = &rcu_sched_ctrlblk.rcucblist,
  47 +};
  48 +
  49 +static struct rcu_ctrlblk rcu_bh_ctrlblk = {
  50 + .donetail = &rcu_bh_ctrlblk.rcucblist,
  51 + .curtail = &rcu_bh_ctrlblk.rcucblist,
  52 +};
  53 +
  54 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  55 +int rcu_scheduler_active __read_mostly;
  56 +EXPORT_SYMBOL_GPL(rcu_scheduler_active);
  57 +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  58 +
25 59 #ifdef CONFIG_TINY_PREEMPT_RCU
26 60  
27 61 #include <linux/delay.h>
28 62  
29 63  
... ... @@ -46,17 +80,45 @@
46 80 struct list_head *gp_tasks;
47 81 /* Pointer to the first task blocking the */
48 82 /* current grace period, or NULL if there */
49   - /* is not such task. */
  83 + /* is no such task. */
50 84 struct list_head *exp_tasks;
51 85 /* Pointer to first task blocking the */
52 86 /* current expedited grace period, or NULL */
53 87 /* if there is no such task. If there */
54 88 /* is no current expedited grace period, */
55 89 /* then there cannot be any such task. */
  90 +#ifdef CONFIG_RCU_BOOST
  91 + struct list_head *boost_tasks;
  92 + /* Pointer to first task that needs to be */
  93 + /* priority-boosted, or NULL if no priority */
  94 + /* boosting is needed. If there is no */
  95 + /* current or expedited grace period, there */
  96 + /* can be no such task. */
  97 +#endif /* #ifdef CONFIG_RCU_BOOST */
56 98 u8 gpnum; /* Current grace period. */
57 99 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 100 u8 completed; /* Last grace period completed. */
59 101 /* If all three are equal, RCU is idle. */
  102 +#ifdef CONFIG_RCU_BOOST
  103 + s8 boosted_this_gp; /* Has boosting already happened? */
  104 + unsigned long boost_time; /* When to start boosting (jiffies) */
  105 +#endif /* #ifdef CONFIG_RCU_BOOST */
  106 +#ifdef CONFIG_RCU_TRACE
  107 + unsigned long n_grace_periods;
  108 +#ifdef CONFIG_RCU_BOOST
  109 + unsigned long n_tasks_boosted;
  110 + unsigned long n_exp_boosts;
  111 + unsigned long n_normal_boosts;
  112 + unsigned long n_normal_balk_blkd_tasks;
  113 + unsigned long n_normal_balk_gp_tasks;
  114 + unsigned long n_normal_balk_boost_tasks;
  115 + unsigned long n_normal_balk_boosted;
  116 + unsigned long n_normal_balk_notyet;
  117 + unsigned long n_normal_balk_nos;
  118 + unsigned long n_exp_balk_blkd_tasks;
  119 + unsigned long n_exp_balk_nos;
  120 +#endif /* #ifdef CONFIG_RCU_BOOST */
  121 +#endif /* #ifdef CONFIG_RCU_TRACE */
60 122 };
61 123  
62 124 static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
... ... @@ -122,6 +184,210 @@
122 184 }
123 185  
124 186 /*
  187 + * Advance a ->blkd_tasks-list pointer to the next entry, returning
  188 + * NULL instead if at the end of the list.
  189 + */
  190 +static struct list_head *rcu_next_node_entry(struct task_struct *t)
  191 +{
  192 + struct list_head *np;
  193 +
  194 + np = t->rcu_node_entry.next;
  195 + if (np == &rcu_preempt_ctrlblk.blkd_tasks)
  196 + np = NULL;
  197 + return np;
  198 +}
  199 +
  200 +#ifdef CONFIG_RCU_TRACE
  201 +
  202 +#ifdef CONFIG_RCU_BOOST
  203 +static void rcu_initiate_boost_trace(void);
  204 +static void rcu_initiate_exp_boost_trace(void);
  205 +#endif /* #ifdef CONFIG_RCU_BOOST */
  206 +
  207 +/*
  208 + * Dump additional statistics for TINY_PREEMPT_RCU.
  209 + */
  210 +static void show_tiny_preempt_stats(struct seq_file *m)
  211 +{
  212 + seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
  213 + rcu_preempt_ctrlblk.rcb.qlen,
  214 + rcu_preempt_ctrlblk.n_grace_periods,
  215 + rcu_preempt_ctrlblk.gpnum,
  216 + rcu_preempt_ctrlblk.gpcpu,
  217 + rcu_preempt_ctrlblk.completed,
  218 + "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
  219 + "N."[!rcu_preempt_ctrlblk.gp_tasks],
  220 + "E."[!rcu_preempt_ctrlblk.exp_tasks]);
  221 +#ifdef CONFIG_RCU_BOOST
  222 + seq_printf(m, " ttb=%c btg=",
  223 + "B."[!rcu_preempt_ctrlblk.boost_tasks]);
  224 + switch (rcu_preempt_ctrlblk.boosted_this_gp) {
  225 + case -1:
  226 + seq_puts(m, "exp");
  227 + break;
  228 + case 0:
  229 + seq_puts(m, "no");
  230 + break;
  231 + case 1:
  232 + seq_puts(m, "begun");
  233 + break;
  234 + case 2:
  235 + seq_puts(m, "done");
  236 + break;
  237 + default:
  238 + seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
  239 + }
  240 + seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
  241 + rcu_preempt_ctrlblk.n_tasks_boosted,
  242 + rcu_preempt_ctrlblk.n_exp_boosts,
  243 + rcu_preempt_ctrlblk.n_normal_boosts,
  244 + (int)(jiffies & 0xffff),
  245 + (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
  246 + seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
  247 + "normal balk",
  248 + rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
  249 + rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
  250 + rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
  251 + rcu_preempt_ctrlblk.n_normal_balk_boosted,
  252 + rcu_preempt_ctrlblk.n_normal_balk_notyet,
  253 + rcu_preempt_ctrlblk.n_normal_balk_nos);
  254 + seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
  255 + rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
  256 + rcu_preempt_ctrlblk.n_exp_balk_nos);
  257 +#endif /* #ifdef CONFIG_RCU_BOOST */
  258 +}
  259 +
  260 +#endif /* #ifdef CONFIG_RCU_TRACE */
  261 +
  262 +#ifdef CONFIG_RCU_BOOST
  263 +
  264 +#include "rtmutex_common.h"
  265 +
  266 +/*
  267 + * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
  268 + * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
  269 + */
  270 +static int rcu_boost(void)
  271 +{
  272 + unsigned long flags;
  273 + struct rt_mutex mtx;
  274 + struct list_head *np;
  275 + struct task_struct *t;
  276 +
  277 + if (rcu_preempt_ctrlblk.boost_tasks == NULL)
  278 + return 0; /* Nothing to boost. */
  279 + raw_local_irq_save(flags);
  280 + rcu_preempt_ctrlblk.boosted_this_gp++;
  281 + t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
  282 + rcu_node_entry);
  283 + np = rcu_next_node_entry(t);
  284 + rt_mutex_init_proxy_locked(&mtx, t);
  285 + t->rcu_boost_mutex = &mtx;
  286 + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
  287 + raw_local_irq_restore(flags);
  288 + rt_mutex_lock(&mtx);
  289 + RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
  290 + rcu_preempt_ctrlblk.boosted_this_gp++;
  291 + rt_mutex_unlock(&mtx);
  292 + return rcu_preempt_ctrlblk.boost_tasks != NULL;
  293 +}
  294 +
  295 +/*
  296 + * Check to see if it is now time to start boosting RCU readers blocking
  297 + * the current grace period, and, if so, tell the rcu_kthread_task to
  298 + * start boosting them. If there is an expedited boost in progress,
  299 + * we wait for it to complete.
  300 + *
  301 + * If there are no blocked readers blocking the current grace period,
  302 + * return 0 to let the caller know, otherwise return 1. Note that this
  303 + * return value is independent of whether or not boosting was done.
  304 + */
  305 +static int rcu_initiate_boost(void)
  306 +{
  307 + if (!rcu_preempt_blocked_readers_cgp()) {
  308 + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
  309 + return 0;
  310 + }
  311 + if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
  312 + rcu_preempt_ctrlblk.boost_tasks == NULL &&
  313 + rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
  314 + ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
  315 + rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
  316 + invoke_rcu_kthread();
  317 + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
  318 + } else
  319 + RCU_TRACE(rcu_initiate_boost_trace());
  320 + return 1;
  321 +}
  322 +
  323 +/*
  324 + * Initiate boosting for an expedited grace period.
  325 + */
  326 +static void rcu_initiate_expedited_boost(void)
  327 +{
  328 + unsigned long flags;
  329 +
  330 + raw_local_irq_save(flags);
  331 + if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
  332 + rcu_preempt_ctrlblk.boost_tasks =
  333 + rcu_preempt_ctrlblk.blkd_tasks.next;
  334 + rcu_preempt_ctrlblk.boosted_this_gp = -1;
  335 + invoke_rcu_kthread();
  336 + RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
  337 + } else
  338 + RCU_TRACE(rcu_initiate_exp_boost_trace());
  339 + raw_local_irq_restore(flags);
  340 +}
  341 +
  342 +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
  343 +
  344 +/*
  345 + * Do priority-boost accounting for the start of a new grace period.
  346 + */
  347 +static void rcu_preempt_boost_start_gp(void)
  348 +{
  349 + rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
  350 + if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
  351 + rcu_preempt_ctrlblk.boosted_this_gp = 0;
  352 +}
  353 +
  354 +#else /* #ifdef CONFIG_RCU_BOOST */
  355 +
  356 +/*
  357 + * If there is no RCU priority boosting, we don't boost.
  358 + */
  359 +static int rcu_boost(void)
  360 +{
  361 + return 0;
  362 +}
  363 +
  364 +/*
  365 + * If there is no RCU priority boosting, we don't initiate boosting,
  366 + * but we do indicate whether there are blocked readers blocking the
  367 + * current grace period.
  368 + */
  369 +static int rcu_initiate_boost(void)
  370 +{
  371 + return rcu_preempt_blocked_readers_cgp();
  372 +}
  373 +
  374 +/*
  375 + * If there is no RCU priority boosting, we don't initiate expedited boosting.
  376 + */
  377 +static void rcu_initiate_expedited_boost(void)
  378 +{
  379 +}
  380 +
  381 +/*
  382 + * If there is no RCU priority boosting, nothing to do at grace-period start.
  383 + */
  384 +static void rcu_preempt_boost_start_gp(void)
  385 +{
  386 +}
  387 +
  388 +#endif /* else #ifdef CONFIG_RCU_BOOST */
  389 +
  390 +/*
125 391 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 392 * that this just means that the task currently running on the CPU is
127 393 * in a quiescent state. There might be any number of tasks blocked
128 394  
129 395  
... ... @@ -148,11 +414,14 @@
148 414 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 415 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150 416  
  417 + /* If there is no GP then there is nothing more to do. */
  418 + if (!rcu_preempt_gp_in_progress())
  419 + return;
151 420 /*
152   - * If there is no GP, or if blocked readers are still blocking GP,
153   - * then there is nothing more to do.
  421 + * Check up on boosting. If there are no readers blocking the
  422 + * current grace period, leave.
154 423 */
155   - if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
  424 + if (rcu_initiate_boost())
156 425 return;
157 426  
158 427 /* Advance callbacks. */
159 428  
... ... @@ -164,9 +433,9 @@
164 433 if (!rcu_preempt_blocked_readers_any())
165 434 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166 435  
167   - /* If there are done callbacks, make RCU_SOFTIRQ process them. */
  436 + /* If there are done callbacks, cause them to be invoked. */
168 437 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169   - raise_softirq(RCU_SOFTIRQ);
  438 + invoke_rcu_kthread();
170 439 }
171 440  
172 441 /*
173 442  
... ... @@ -178,12 +447,16 @@
178 447  
179 448 /* Official start of GP. */
180 449 rcu_preempt_ctrlblk.gpnum++;
  450 + RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
181 451  
182 452 /* Any blocked RCU readers block new GP. */
183 453 if (rcu_preempt_blocked_readers_any())
184 454 rcu_preempt_ctrlblk.gp_tasks =
185 455 rcu_preempt_ctrlblk.blkd_tasks.next;
186 456  
  457 + /* Set up for RCU priority boosting. */
  458 + rcu_preempt_boost_start_gp();
  459 +
187 460 /* If there is no running reader, CPU is done with GP. */
188 461 if (!rcu_preempt_running_reader())
189 462 rcu_preempt_cpu_qs();
190 463  
... ... @@ -304,14 +577,16 @@
304 577 */
305 578 empty = !rcu_preempt_blocked_readers_cgp();
306 579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307   - np = t->rcu_node_entry.next;
308   - if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309   - np = NULL;
  580 + np = rcu_next_node_entry(t);
310 581 list_del(&t->rcu_node_entry);
311 582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 583 rcu_preempt_ctrlblk.gp_tasks = np;
313 584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 585 rcu_preempt_ctrlblk.exp_tasks = np;
  586 +#ifdef CONFIG_RCU_BOOST
  587 + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
  588 + rcu_preempt_ctrlblk.boost_tasks = np;
  589 +#endif /* #ifdef CONFIG_RCU_BOOST */
315 590 INIT_LIST_HEAD(&t->rcu_node_entry);
316 591  
317 592 /*
... ... @@ -331,6 +606,14 @@
331 606 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 607 rcu_report_exp_done();
333 608 }
  609 +#ifdef CONFIG_RCU_BOOST
  610 + /* Unboost self if was boosted. */
  611 + if (special & RCU_READ_UNLOCK_BOOSTED) {
  612 + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
  613 + rt_mutex_unlock(t->rcu_boost_mutex);
  614 + t->rcu_boost_mutex = NULL;
  615 + }
  616 +#endif /* #ifdef CONFIG_RCU_BOOST */
334 617 local_irq_restore(flags);
335 618 }
336 619  
... ... @@ -374,7 +657,7 @@
374 657 rcu_preempt_cpu_qs();
375 658 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 659 rcu_preempt_ctrlblk.rcb.donetail)
377   - raise_softirq(RCU_SOFTIRQ);
  660 + invoke_rcu_kthread();
378 661 if (rcu_preempt_gp_in_progress() &&
379 662 rcu_cpu_blocking_cur_gp() &&
380 663 rcu_preempt_running_reader())
... ... @@ -383,7 +666,7 @@
383 666  
384 667 /*
385 668 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386   - * update, so this is invoked from __rcu_process_callbacks() to
  669 + * update, so this is invoked from rcu_process_callbacks() to
387 670 * handle that case. Of course, it is invoked for all flavors of
388 671 * RCU, but RCU callbacks can appear only on one of the lists, and
389 672 * neither ->nexttail nor ->donetail can possibly be NULL, so there
... ... @@ -400,7 +683,7 @@
400 683 */
401 684 static void rcu_preempt_process_callbacks(void)
402 685 {
403   - __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
  686 + rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404 687 }
405 688  
406 689 /*
... ... @@ -417,6 +700,7 @@
417 700 local_irq_save(flags);
418 701 *rcu_preempt_ctrlblk.nexttail = head;
419 702 rcu_preempt_ctrlblk.nexttail = &head->next;
  703 + RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
420 704 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 705 local_irq_restore(flags);
422 706 }
... ... @@ -532,6 +816,7 @@
532 816  
533 817 /* Wait for tail of ->blkd_tasks list to drain. */
534 818 if (rcu_preempted_readers_exp())
  819 + rcu_initiate_expedited_boost();
535 820 wait_event(sync_rcu_preempt_exp_wq,
536 821 !rcu_preempted_readers_exp());
537 822  
538 823  
... ... @@ -572,7 +857,28 @@
572 857  
573 858 #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574 859  
  860 +#ifdef CONFIG_RCU_TRACE
  861 +
575 862 /*
  863 + * Because preemptible RCU does not exist, it is not necessary to
  864 + * dump out its statistics.
  865 + */
  866 +static void show_tiny_preempt_stats(struct seq_file *m)
  867 +{
  868 +}
  869 +
  870 +#endif /* #ifdef CONFIG_RCU_TRACE */
  871 +
  872 +/*
  873 + * Because preemptible RCU does not exist, it is never necessary to
  874 + * boost preempted RCU readers.
  875 + */
  876 +static int rcu_boost(void)
  877 +{
  878 + return 0;
  879 +}
  880 +
  881 +/*
576 882 * Because preemptible RCU does not exist, it never has any callbacks
577 883 * to check.
578 884 */
579 885  
580 886  
... ... @@ -599,18 +905,117 @@
599 905 #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600 906  
601 907 #ifdef CONFIG_DEBUG_LOCK_ALLOC
602   -
603 908 #include <linux/kernel_stat.h>
604 909  
605 910 /*
606 911 * During boot, we forgive RCU lockdep issues. After this function is
607 912 * invoked, we start taking RCU lockdep issues seriously.
608 913 */
609   -void rcu_scheduler_starting(void)
  914 +void __init rcu_scheduler_starting(void)
610 915 {
611 916 WARN_ON(nr_context_switches() > 0);
612 917 rcu_scheduler_active = 1;
613 918 }
614 919  
615 920 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  921 +
  922 +#ifdef CONFIG_RCU_BOOST
  923 +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
  924 +#else /* #ifdef CONFIG_RCU_BOOST */
  925 +#define RCU_BOOST_PRIO 1
  926 +#endif /* #else #ifdef CONFIG_RCU_BOOST */
  927 +
  928 +#ifdef CONFIG_RCU_TRACE
  929 +
  930 +#ifdef CONFIG_RCU_BOOST
  931 +
  932 +static void rcu_initiate_boost_trace(void)
  933 +{
  934 + if (rcu_preempt_ctrlblk.gp_tasks == NULL)
  935 + rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
  936 + else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
  937 + rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
  938 + else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
  939 + rcu_preempt_ctrlblk.n_normal_balk_boosted++;
  940 + else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
  941 + rcu_preempt_ctrlblk.n_normal_balk_notyet++;
  942 + else
  943 + rcu_preempt_ctrlblk.n_normal_balk_nos++;
  944 +}
  945 +
  946 +static void rcu_initiate_exp_boost_trace(void)
  947 +{
  948 + if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
  949 + rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
  950 + else
  951 + rcu_preempt_ctrlblk.n_exp_balk_nos++;
  952 +}
  953 +
  954 +#endif /* #ifdef CONFIG_RCU_BOOST */
  955 +
  956 +static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
  957 +{
  958 + unsigned long flags;
  959 +
  960 + raw_local_irq_save(flags);
  961 + rcp->qlen -= n;
  962 + raw_local_irq_restore(flags);
  963 +}
  964 +
  965 +/*
  966 + * Dump statistics for TINY_RCU, such as they are.
  967 + */
  968 +static int show_tiny_stats(struct seq_file *m, void *unused)
  969 +{
  970 + show_tiny_preempt_stats(m);
  971 + seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
  972 + seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
  973 + return 0;
  974 +}
  975 +
  976 +static int show_tiny_stats_open(struct inode *inode, struct file *file)
  977 +{
  978 + return single_open(file, show_tiny_stats, NULL);
  979 +}
  980 +
  981 +static const struct file_operations show_tiny_stats_fops = {
  982 + .owner = THIS_MODULE,
  983 + .open = show_tiny_stats_open,
  984 + .read = seq_read,
  985 + .llseek = seq_lseek,
  986 + .release = single_release,
  987 +};
  988 +
  989 +static struct dentry *rcudir;
  990 +
  991 +static int __init rcutiny_trace_init(void)
  992 +{
  993 + struct dentry *retval;
  994 +
  995 + rcudir = debugfs_create_dir("rcu", NULL);
  996 + if (!rcudir)
  997 + goto free_out;
  998 + retval = debugfs_create_file("rcudata", 0444, rcudir,
  999 + NULL, &show_tiny_stats_fops);
  1000 + if (!retval)
  1001 + goto free_out;
  1002 + return 0;
  1003 +free_out:
  1004 + debugfs_remove_recursive(rcudir);
  1005 + return 1;
  1006 +}
  1007 +
  1008 +static void __exit rcutiny_trace_cleanup(void)
  1009 +{
  1010 + debugfs_remove_recursive(rcudir);
  1011 +}
  1012 +
  1013 +module_init(rcutiny_trace_init);
  1014 +module_exit(rcutiny_trace_cleanup);
  1015 +
  1016 +MODULE_AUTHOR("Paul E. McKenney");
  1017 +MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
  1018 +MODULE_LICENSE("GPL");
  1019 +
  1020 +#endif /* #ifdef CONFIG_RCU_TRACE */
kernel/rcutorture.c
... ... @@ -47,6 +47,7 @@
47 47 #include <linux/srcu.h>
48 48 #include <linux/slab.h>
49 49 #include <asm/byteorder.h>
  50 +#include <linux/sched.h>
50 51  
51 52 MODULE_LICENSE("GPL");
52 53 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
... ... @@ -64,6 +65,9 @@
64 65 static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65 66 static int fqs_holdoff = 0; /* Hold time within burst (us). */
66 67 static int fqs_stutter = 3; /* Wait time between bursts (s). */
  68 +static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
  69 +static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
  70 +static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
67 71 static char *torture_type = "rcu"; /* What RCU implementation to torture. */
68 72  
69 73 module_param(nreaders, int, 0444);
... ... @@ -88,6 +92,12 @@
88 92 MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89 93 module_param(fqs_stutter, int, 0444);
90 94 MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
  95 +module_param(test_boost, int, 0444);
  96 +MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
  97 +module_param(test_boost_interval, int, 0444);
  98 +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
  99 +module_param(test_boost_duration, int, 0444);
  100 +MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
91 101 module_param(torture_type, charp, 0444);
92 102 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
93 103  
... ... @@ -109,6 +119,7 @@
109 119 static struct task_struct *shuffler_task;
110 120 static struct task_struct *stutter_task;
111 121 static struct task_struct *fqs_task;
  122 +static struct task_struct *boost_tasks[NR_CPUS];
112 123  
113 124 #define RCU_TORTURE_PIPE_LEN 10
114 125  
... ... @@ -134,6 +145,12 @@
134 145 static atomic_t n_rcu_torture_free;
135 146 static atomic_t n_rcu_torture_mberror;
136 147 static atomic_t n_rcu_torture_error;
  148 +static long n_rcu_torture_boost_ktrerror;
  149 +static long n_rcu_torture_boost_rterror;
  150 +static long n_rcu_torture_boost_allocerror;
  151 +static long n_rcu_torture_boost_afferror;
  152 +static long n_rcu_torture_boost_failure;
  153 +static long n_rcu_torture_boosts;
137 154 static long n_rcu_torture_timers;
138 155 static struct list_head rcu_torture_removed;
139 156 static cpumask_var_t shuffle_tmp_mask;
... ... @@ -147,6 +164,16 @@
147 164 #endif
148 165 int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
149 166  
  167 +#ifdef CONFIG_RCU_BOOST
  168 +#define rcu_can_boost() 1
  169 +#else /* #ifdef CONFIG_RCU_BOOST */
  170 +#define rcu_can_boost() 0
  171 +#endif /* #else #ifdef CONFIG_RCU_BOOST */
  172 +
  173 +static unsigned long boost_starttime; /* jiffies of next boost test start. */
  174 +DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
  175 + /* and boost task create/destroy. */
  176 +
150 177 /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
151 178  
152 179 #define FULLSTOP_DONTSTOP 0 /* Normal operation. */
... ... @@ -277,6 +304,7 @@
277 304 void (*fqs)(void);
278 305 int (*stats)(char *page);
279 306 int irq_capable;
  307 + int can_boost;
280 308 char *name;
281 309 };
282 310  
... ... @@ -366,6 +394,7 @@
366 394 .fqs = rcu_force_quiescent_state,
367 395 .stats = NULL,
368 396 .irq_capable = 1,
  397 + .can_boost = rcu_can_boost(),
369 398 .name = "rcu"
370 399 };
371 400  
... ... @@ -408,6 +437,7 @@
408 437 .fqs = rcu_force_quiescent_state,
409 438 .stats = NULL,
410 439 .irq_capable = 1,
  440 + .can_boost = rcu_can_boost(),
411 441 .name = "rcu_sync"
412 442 };
413 443  
... ... @@ -424,6 +454,7 @@
424 454 .fqs = rcu_force_quiescent_state,
425 455 .stats = NULL,
426 456 .irq_capable = 1,
  457 + .can_boost = rcu_can_boost(),
427 458 .name = "rcu_expedited"
428 459 };
429 460  
... ... @@ -684,6 +715,110 @@
684 715 };
685 716  
686 717 /*
  718 + * RCU torture priority-boost testing. Runs one real-time thread per
  719 + * CPU for moderate bursts, repeatedly registering RCU callbacks and
  720 + * spinning waiting for them to be invoked. If a given callback takes
  721 + * too long to be invoked, we assume that priority inversion has occurred.
  722 + */
  723 +
  724 +struct rcu_boost_inflight {
  725 + struct rcu_head rcu;
  726 + int inflight;
  727 +};
  728 +
  729 +static void rcu_torture_boost_cb(struct rcu_head *head)
  730 +{
  731 + struct rcu_boost_inflight *rbip =
  732 + container_of(head, struct rcu_boost_inflight, rcu);
  733 +
  734 + smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
  735 + rbip->inflight = 0;
  736 +}
  737 +
  738 +static int rcu_torture_boost(void *arg)
  739 +{
  740 + unsigned long call_rcu_time;
  741 + unsigned long endtime;
  742 + unsigned long oldstarttime;
  743 + struct rcu_boost_inflight rbi = { .inflight = 0 };
  744 + struct sched_param sp;
  745 +
  746 + VERBOSE_PRINTK_STRING("rcu_torture_boost started");
  747 +
  748 + /* Set real-time priority. */
  749 + sp.sched_priority = 1;
  750 + if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
  751 + VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
  752 + n_rcu_torture_boost_rterror++;
  753 + }
  754 +
  755 + /* Each pass through the following loop does one boost-test cycle. */
  756 + do {
  757 + /* Wait for the next test interval. */
  758 + oldstarttime = boost_starttime;
  759 + while (jiffies - oldstarttime > ULONG_MAX / 2) {
  760 + schedule_timeout_uninterruptible(1);
  761 + rcu_stutter_wait("rcu_torture_boost");
  762 + if (kthread_should_stop() ||
  763 + fullstop != FULLSTOP_DONTSTOP)
  764 + goto checkwait;
  765 + }
  766 +
  767 + /* Do one boost-test interval. */
  768 + endtime = oldstarttime + test_boost_duration * HZ;
  769 + call_rcu_time = jiffies;
  770 + while (jiffies - endtime > ULONG_MAX / 2) {
  771 + /* If we don't have a callback in flight, post one. */
  772 + if (!rbi.inflight) {
  773 + smp_mb(); /* RCU core before ->inflight = 1. */
  774 + rbi.inflight = 1;
  775 + call_rcu(&rbi.rcu, rcu_torture_boost_cb);
  776 + if (jiffies - call_rcu_time >
  777 + test_boost_duration * HZ - HZ / 2) {
  778 + VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
  779 + n_rcu_torture_boost_failure++;
  780 + }
  781 + call_rcu_time = jiffies;
  782 + }
  783 + cond_resched();
  784 + rcu_stutter_wait("rcu_torture_boost");
  785 + if (kthread_should_stop() ||
  786 + fullstop != FULLSTOP_DONTSTOP)
  787 + goto checkwait;
  788 + }
  789 +
  790 + /*
  791 + * Set the start time of the next test interval.
  792 + * Yes, this is vulnerable to long delays, but such
  793 + * delays simply cause a false negative for the next
  794 + * interval. Besides, we are running at RT priority,
  795 + * so delays should be relatively rare.
  796 + */
  797 + while (oldstarttime == boost_starttime) {
  798 + if (mutex_trylock(&boost_mutex)) {
  799 + boost_starttime = jiffies +
  800 + test_boost_interval * HZ;
  801 + n_rcu_torture_boosts++;
  802 + mutex_unlock(&boost_mutex);
  803 + break;
  804 + }
  805 + schedule_timeout_uninterruptible(1);
  806 + }
  807 +
  808 + /* Go do the stutter. */
  809 +checkwait: rcu_stutter_wait("rcu_torture_boost");
  810 + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
  811 +
  812 + /* Clean up and exit. */
  813 + VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
  814 + rcutorture_shutdown_absorb("rcu_torture_boost");
  815 + while (!kthread_should_stop() || rbi.inflight)
  816 + schedule_timeout_uninterruptible(1);
  817 + smp_mb(); /* order accesses to ->inflight before stack-frame death. */
  818 + return 0;
  819 +}
  820 +
  821 +/*
687 822 * RCU torture force-quiescent-state kthread. Repeatedly induces
688 823 * bursts of calls to force_quiescent_state(), increasing the probability
689 824 * of occurrence of some important types of race conditions.
... ... @@ -933,7 +1068,8 @@
933 1068 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
934 1069 cnt += sprintf(&page[cnt],
935 1070 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
936   - "rtmbe: %d nt: %ld",
  1071 + "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
  1072 + "rtbf: %ld rtb: %ld nt: %ld",
937 1073 rcu_torture_current,
938 1074 rcu_torture_current_version,
939 1075 list_empty(&rcu_torture_freelist),
940 1076  
... ... @@ -941,8 +1077,19 @@
941 1077 atomic_read(&n_rcu_torture_alloc_fail),
942 1078 atomic_read(&n_rcu_torture_free),
943 1079 atomic_read(&n_rcu_torture_mberror),
  1080 + n_rcu_torture_boost_ktrerror,
  1081 + n_rcu_torture_boost_rterror,
  1082 + n_rcu_torture_boost_allocerror,
  1083 + n_rcu_torture_boost_afferror,
  1084 + n_rcu_torture_boost_failure,
  1085 + n_rcu_torture_boosts,
944 1086 n_rcu_torture_timers);
945   - if (atomic_read(&n_rcu_torture_mberror) != 0)
  1087 + if (atomic_read(&n_rcu_torture_mberror) != 0 ||
  1088 + n_rcu_torture_boost_ktrerror != 0 ||
  1089 + n_rcu_torture_boost_rterror != 0 ||
  1090 + n_rcu_torture_boost_allocerror != 0 ||
  1091 + n_rcu_torture_boost_afferror != 0 ||
  1092 + n_rcu_torture_boost_failure != 0)
946 1093 cnt += sprintf(&page[cnt], " !!!");
947 1094 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
948 1095 if (i > 1) {
949 1096  
950 1097  
951 1098  
952 1099  
... ... @@ -1094,22 +1241,91 @@
1094 1241 }
1095 1242  
1096 1243 static inline void
1097   -rcu_torture_print_module_parms(char *tag)
  1244 +rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1098 1245 {
1099 1246 printk(KERN_ALERT "%s" TORTURE_FLAG
1100 1247 "--- %s: nreaders=%d nfakewriters=%d "
1101 1248 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1102 1249 "shuffle_interval=%d stutter=%d irqreader=%d "
1103   - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
  1250 + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
  1251 + "test_boost=%d/%d test_boost_interval=%d "
  1252 + "test_boost_duration=%d\n",
1104 1253 torture_type, tag, nrealreaders, nfakewriters,
1105 1254 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1106   - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
  1255 + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
  1256 + test_boost, cur_ops->can_boost,
  1257 + test_boost_interval, test_boost_duration);
1107 1258 }
1108 1259  
1109   -static struct notifier_block rcutorture_nb = {
  1260 +static struct notifier_block rcutorture_shutdown_nb = {
1110 1261 .notifier_call = rcutorture_shutdown_notify,
1111 1262 };
1112 1263  
  1264 +static void rcutorture_booster_cleanup(int cpu)
  1265 +{
  1266 + struct task_struct *t;
  1267 +
  1268 + if (boost_tasks[cpu] == NULL)
  1269 + return;
  1270 + mutex_lock(&boost_mutex);
  1271 + VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
  1272 + t = boost_tasks[cpu];
  1273 + boost_tasks[cpu] = NULL;
  1274 + mutex_unlock(&boost_mutex);
  1275 +
  1276 + /* This must be outside of the mutex, otherwise deadlock! */
  1277 + kthread_stop(t);
  1278 +}
  1279 +
  1280 +static int rcutorture_booster_init(int cpu)
  1281 +{
  1282 + int retval;
  1283 +
  1284 + if (boost_tasks[cpu] != NULL)
  1285 + return 0; /* Already created, nothing more to do. */
  1286 +
  1287 + /* Don't allow time recalculation while creating a new task. */
  1288 + mutex_lock(&boost_mutex);
  1289 + VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
  1290 + boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
  1291 + "rcu_torture_boost");
  1292 + if (IS_ERR(boost_tasks[cpu])) {
  1293 + retval = PTR_ERR(boost_tasks[cpu]);
  1294 + VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
  1295 + n_rcu_torture_boost_ktrerror++;
  1296 + boost_tasks[cpu] = NULL;
  1297 + mutex_unlock(&boost_mutex);
  1298 + return retval;
  1299 + }
  1300 + kthread_bind(boost_tasks[cpu], cpu);
  1301 + wake_up_process(boost_tasks[cpu]);
  1302 + mutex_unlock(&boost_mutex);
  1303 + return 0;
  1304 +}
  1305 +
  1306 +static int rcutorture_cpu_notify(struct notifier_block *self,
  1307 + unsigned long action, void *hcpu)
  1308 +{
  1309 + long cpu = (long)hcpu;
  1310 +
  1311 + switch (action) {
  1312 + case CPU_ONLINE:
  1313 + case CPU_DOWN_FAILED:
  1314 + (void)rcutorture_booster_init(cpu);
  1315 + break;
  1316 + case CPU_DOWN_PREPARE:
  1317 + rcutorture_booster_cleanup(cpu);
  1318 + break;
  1319 + default:
  1320 + break;
  1321 + }
  1322 + return NOTIFY_OK;
  1323 +}
  1324 +
  1325 +static struct notifier_block rcutorture_cpu_nb = {
  1326 + .notifier_call = rcutorture_cpu_notify,
  1327 +};
  1328 +
1113 1329 static void
1114 1330 rcu_torture_cleanup(void)
1115 1331 {
... ... @@ -1127,7 +1343,7 @@
1127 1343 }
1128 1344 fullstop = FULLSTOP_RMMOD;
1129 1345 mutex_unlock(&fullstop_mutex);
1130   - unregister_reboot_notifier(&rcutorture_nb);
  1346 + unregister_reboot_notifier(&rcutorture_shutdown_nb);
1131 1347 if (stutter_task) {
1132 1348 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1133 1349 kthread_stop(stutter_task);
... ... @@ -1184,6 +1400,12 @@
1184 1400 kthread_stop(fqs_task);
1185 1401 }
1186 1402 fqs_task = NULL;
  1403 + if ((test_boost == 1 && cur_ops->can_boost) ||
  1404 + test_boost == 2) {
  1405 + unregister_cpu_notifier(&rcutorture_cpu_nb);
  1406 + for_each_possible_cpu(i)
  1407 + rcutorture_booster_cleanup(i);
  1408 + }
1187 1409  
1188 1410 /* Wait for all RCU callbacks to fire. */
1189 1411  
1190 1412  
... ... @@ -1195,9 +1417,9 @@
1195 1417 if (cur_ops->cleanup)
1196 1418 cur_ops->cleanup();
1197 1419 if (atomic_read(&n_rcu_torture_error))
1198   - rcu_torture_print_module_parms("End of test: FAILURE");
  1420 + rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1199 1421 else
1200   - rcu_torture_print_module_parms("End of test: SUCCESS");
  1422 + rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1201 1423 }
1202 1424  
1203 1425 static int __init
... ... @@ -1242,7 +1464,7 @@
1242 1464 nrealreaders = nreaders;
1243 1465 else
1244 1466 nrealreaders = 2 * num_online_cpus();
1245   - rcu_torture_print_module_parms("Start of test");
  1467 + rcu_torture_print_module_parms(cur_ops, "Start of test");
1246 1468 fullstop = FULLSTOP_DONTSTOP;
1247 1469  
1248 1470 /* Set up the freelist. */
... ... @@ -1263,6 +1485,12 @@
1263 1485 atomic_set(&n_rcu_torture_free, 0);
1264 1486 atomic_set(&n_rcu_torture_mberror, 0);
1265 1487 atomic_set(&n_rcu_torture_error, 0);
  1488 + n_rcu_torture_boost_ktrerror = 0;
  1489 + n_rcu_torture_boost_rterror = 0;
  1490 + n_rcu_torture_boost_allocerror = 0;
  1491 + n_rcu_torture_boost_afferror = 0;
  1492 + n_rcu_torture_boost_failure = 0;
  1493 + n_rcu_torture_boosts = 0;
1266 1494 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1267 1495 atomic_set(&rcu_torture_wcount[i], 0);
1268 1496 for_each_possible_cpu(cpu) {
... ... @@ -1376,7 +1604,27 @@
1376 1604 goto unwind;
1377 1605 }
1378 1606 }
1379   - register_reboot_notifier(&rcutorture_nb);
  1607 + if (test_boost_interval < 1)
  1608 + test_boost_interval = 1;
  1609 + if (test_boost_duration < 2)
  1610 + test_boost_duration = 2;
  1611 + if ((test_boost == 1 && cur_ops->can_boost) ||
  1612 + test_boost == 2) {
  1613 + int retval;
  1614 +
  1615 + boost_starttime = jiffies + test_boost_interval * HZ;
  1616 + register_cpu_notifier(&rcutorture_cpu_nb);
  1617 + for_each_possible_cpu(i) {
  1618 + if (cpu_is_offline(i))
  1619 + continue; /* Heuristic: CPU can go offline. */
  1620 + retval = rcutorture_booster_init(i);
  1621 + if (retval < 0) {
  1622 + firsterr = retval;
  1623 + goto unwind;
  1624 + }
  1625 + }
  1626 + }
  1627 + register_reboot_notifier(&rcutorture_shutdown_nb);
1380 1628 mutex_unlock(&fullstop_mutex);
1381 1629 return 0;
1382 1630  
kernel/rcutree.c
... ... @@ -67,9 +67,6 @@
67 67 .gpnum = -300, \
68 68 .completed = -300, \
69 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70   - .orphan_cbs_list = NULL, \
71   - .orphan_cbs_tail = &structname.orphan_cbs_list, \
72   - .orphan_qlen = 0, \
73 70 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 71 .n_force_qs = 0, \
75 72 .n_force_qs_ngp = 0, \
76 73  
... ... @@ -620,9 +617,17 @@
620 617 static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
621 618 {
622 619 if (rdp->gpnum != rnp->gpnum) {
623   - rdp->qs_pending = 1;
624   - rdp->passed_quiesc = 0;
  620 + /*
  621 + * If the current grace period is waiting for this CPU,
  622 + * set up to detect a quiescent state, otherwise don't
  623 + * go looking for one.
  624 + */
625 625 rdp->gpnum = rnp->gpnum;
  626 + if (rnp->qsmask & rdp->grpmask) {
  627 + rdp->qs_pending = 1;
  628 + rdp->passed_quiesc = 0;
  629 + } else
  630 + rdp->qs_pending = 0;
626 631 }
627 632 }
628 633  
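
The rewritten __note_new_gpnum() arms quiescent-state detection only when the leaf rcu_node is actually waiting on this CPU, that is, when the CPU's bit is set in ->qsmask. A minimal standalone sketch of that bitmask test (plain C with made-up mask values, not the kernel's data structures):

    #include <stdio.h>

    /* Sketch: should this CPU go looking for a quiescent state?
     * qsmask: leaf node's "still waiting on these CPUs" bits.
     * grpmask: this CPU's bit within that leaf node.
     */
    static int needs_quiescent_state(unsigned long qsmask, unsigned long grpmask)
    {
            return (qsmask & grpmask) != 0;
    }

    int main(void)
    {
            unsigned long qsmask = 0x5;     /* hypothetically waiting on CPUs 0 and 2 */

            printf("CPU0: %d\n", needs_quiescent_state(qsmask, 1UL << 0));  /* 1 */
            printf("CPU1: %d\n", needs_quiescent_state(qsmask, 1UL << 1));  /* 0 */
            return 0;
    }
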
... ... @@ -681,6 +686,24 @@
681 686  
682 687 /* Remember that we saw this grace-period completion. */
683 688 rdp->completed = rnp->completed;
  689 +
  690 + /*
  691 + * If we were in an extended quiescent state, we may have
  692 + * missed some grace periods that other CPUs handled on
  693 + * our behalf. Catch up with this state to avoid noting
  694 + * spurious new grace periods. If another grace period
  695 + * has started, then rnp->gpnum will have advanced, so
  696 + * we will detect this later on.
  697 + */
  698 + if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
  699 + rdp->gpnum = rdp->completed;
  700 +
  701 + /*
  702 + * If RCU does not need a quiescent state from this CPU,
  703 + * then make sure that this CPU doesn't go looking for one.
  704 + */
  705 + if ((rnp->qsmask & rdp->grpmask) == 0)
  706 + rdp->qs_pending = 0;
684 707 }
685 708 }
686 709  
687 710  
688 711  
689 712  
690 713  
691 714  
692 715  
693 716  
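
The gpnum catch-up relies on a wraparound-safe counter comparison. The sketch below assumes ULONG_CMP_LT() is defined as in include/linux/rcupdate.h of this era, namely ULONG_MAX / 2 < (a) - (b); the counter values are purely illustrative:

    #include <limits.h>
    #include <stdio.h>

    /* Wraparound-safe "a is before b" for unsigned counters (assumed
     * definition, modeled on the kernel's ULONG_CMP_LT()). */
    #define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

    int main(void)
    {
            unsigned long completed = 1000, gpnum = 995;    /* CPU slept through 5 GPs */

            if (ULONG_CMP_LT(gpnum, completed))
                    gpnum = completed;      /* catch up, avoid noting spurious new GPs */
            printf("gpnum now %lu\n", gpnum);

            /* The comparison stays correct across counter wrap. */
            printf("%d\n", ULONG_CMP_LT(ULONG_MAX - 2, 3)); /* prints 1 */
            return 0;
    }
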
... ... @@ -984,56 +1007,34 @@
984 1007 #ifdef CONFIG_HOTPLUG_CPU
985 1008  
986 1009 /*
987   - * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
988   - * specified flavor of RCU. The callbacks will be adopted by the next
989   - * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
990   - * comes first. Because this is invoked from the CPU_DYING notifier,
991   - * irqs are already disabled.
  1010 + * Move a dying CPU's RCU callbacks to online CPU's callback list.
  1011 + * Synchronization is not required because this function executes
  1012 + * in stop_machine() context.
992 1013 */
993   -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
  1014 +static void rcu_send_cbs_to_online(struct rcu_state *rsp)
994 1015 {
995 1016 int i;
  1017 + /* current DYING CPU is cleared in the cpu_online_mask */
  1018 + int receive_cpu = cpumask_any(cpu_online_mask);
996 1019 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
  1020 + struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
997 1021  
998 1022 if (rdp->nxtlist == NULL)
999 1023 return; /* irqs disabled, so comparison is stable. */
1000   - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1001   - *rsp->orphan_cbs_tail = rdp->nxtlist;
1002   - rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
  1024 +
  1025 + *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
  1026 + receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
  1027 + receive_rdp->qlen += rdp->qlen;
  1028 + receive_rdp->n_cbs_adopted += rdp->qlen;
  1029 + rdp->n_cbs_orphaned += rdp->qlen;
  1030 +
1003 1031 rdp->nxtlist = NULL;
1004 1032 for (i = 0; i < RCU_NEXT_SIZE; i++)
1005 1033 rdp->nxttail[i] = &rdp->nxtlist;
1006   - rsp->orphan_qlen += rdp->qlen;
1007   - rdp->n_cbs_orphaned += rdp->qlen;
1008 1034 rdp->qlen = 0;
1009   - raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1010 1035 }
1011 1036  
1012 1037 /*
1013   - * Adopt previously orphaned RCU callbacks.
1014   - */
1015   -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1016   -{
1017   - unsigned long flags;
1018   - struct rcu_data *rdp;
1019   -
1020   - raw_spin_lock_irqsave(&rsp->onofflock, flags);
1021   - rdp = this_cpu_ptr(rsp->rda);
1022   - if (rsp->orphan_cbs_list == NULL) {
1023   - raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1024   - return;
1025   - }
1026   - *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
1027   - rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
1028   - rdp->qlen += rsp->orphan_qlen;
1029   - rdp->n_cbs_adopted += rsp->orphan_qlen;
1030   - rsp->orphan_cbs_list = NULL;
1031   - rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
1032   - rsp->orphan_qlen = 0;
1033   - raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1034   -}
1035   -
1036   -/*
1037 1038 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1038 1039 * and move all callbacks from the outgoing CPU to the current one.
1039 1040 */
... ... @@ -1081,8 +1082,6 @@
1081 1082 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1082 1083 if (need_report & RCU_OFL_TASKS_EXP_GP)
1083 1084 rcu_report_exp_rnp(rsp, rnp);
1084   -
1085   - rcu_adopt_orphan_cbs(rsp);
1086 1085 }
1087 1086  
1088 1087 /*
1089 1088  
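
rcu_send_cbs_to_online() splices the dying CPU's whole callback list onto the receiver in constant time by reusing the head/tail-pointer representation. A self-contained sketch of that splice, simplified to a single list per queue instead of the kernel's segmented nxttail[] array:

    #include <stdio.h>

    struct cb {
            struct cb *next;
            int id;
    };

    /* Simplified per-CPU callback queue: head pointer plus a tail pointer
     * that addresses the last ->next field (or the head when empty). */
    struct cbq {
            struct cb *head;
            struct cb **tail;
            long qlen;
    };

    static void cbq_init(struct cbq *q)
    {
            q->head = NULL;
            q->tail = &q->head;
            q->qlen = 0;
    }

    static void cbq_enqueue(struct cbq *q, struct cb *cb)
    {
            cb->next = NULL;
            *q->tail = cb;
            q->tail = &cb->next;
            q->qlen++;
    }

    /* O(1) splice of everything in "dying" onto "recv", mirroring the
     * list handling in rcu_send_cbs_to_online(). */
    static void cbq_splice(struct cbq *recv, struct cbq *dying)
    {
            if (dying->head == NULL)
                    return;
            *recv->tail = dying->head;
            recv->tail = dying->tail;
            recv->qlen += dying->qlen;
            cbq_init(dying);
    }

    int main(void)
    {
            struct cbq online, dying;
            struct cb cbs[4];
            int i;

            cbq_init(&online);
            cbq_init(&dying);
            for (i = 0; i < 4; i++) {
                    cbs[i].id = i;
                    cbq_enqueue(i < 2 ? &online : &dying, &cbs[i]);
            }
            cbq_splice(&online, &dying);
            for (struct cb *p = online.head; p != NULL; p = p->next)
                    printf("cb %d\n", p->id);
            printf("qlen=%ld\n", online.qlen);      /* 4 */
            return 0;
    }
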
... ... @@ -1100,14 +1099,10 @@
1100 1099  
1101 1100 #else /* #ifdef CONFIG_HOTPLUG_CPU */
1102 1101  
1103   -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
  1102 +static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1104 1103 {
1105 1104 }
1106 1105  
1107   -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1108   -{
1109   -}
1110   -
1111 1106 static void rcu_offline_cpu(int cpu)
1112 1107 {
1113 1108 }
1114 1109  
... ... @@ -1440,22 +1435,11 @@
1440 1435 */
1441 1436 local_irq_save(flags);
1442 1437 rdp = this_cpu_ptr(rsp->rda);
1443   - rcu_process_gp_end(rsp, rdp);
1444   - check_for_new_grace_period(rsp, rdp);
1445 1438  
1446 1439 /* Add the callback to our list. */
1447 1440 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1448 1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1449 1442  
1450   - /* Start a new grace period if one not already started. */
1451   - if (!rcu_gp_in_progress(rsp)) {
1452   - unsigned long nestflag;
1453   - struct rcu_node *rnp_root = rcu_get_root(rsp);
1454   -
1455   - raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1456   - rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1457   - }
1458   -
1459 1443 /*
1460 1444 * Force the grace period if too many callbacks or too long waiting.
1461 1445 * Enforce hysteresis, and don't invoke force_quiescent_state()
... ... @@ -1464,12 +1448,27 @@
1464 1448 * is the only one waiting for a grace period to complete.
1465 1449 */
1466 1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1467   - rdp->blimit = LONG_MAX;
1468   - if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1469   - *rdp->nxttail[RCU_DONE_TAIL] != head)
1470   - force_quiescent_state(rsp, 0);
1471   - rdp->n_force_qs_snap = rsp->n_force_qs;
1472   - rdp->qlen_last_fqs_check = rdp->qlen;
  1451 +
  1452 + /* Are we ignoring a completed grace period? */
  1453 + rcu_process_gp_end(rsp, rdp);
  1454 + check_for_new_grace_period(rsp, rdp);
  1455 +
  1456 + /* Start a new grace period if one not already started. */
  1457 + if (!rcu_gp_in_progress(rsp)) {
  1458 + unsigned long nestflag;
  1459 + struct rcu_node *rnp_root = rcu_get_root(rsp);
  1460 +
  1461 + raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
  1462 + rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
  1463 + } else {
  1464 + /* Give the grace period a kick. */
  1465 + rdp->blimit = LONG_MAX;
  1466 + if (rsp->n_force_qs == rdp->n_force_qs_snap &&
  1467 + *rdp->nxttail[RCU_DONE_TAIL] != head)
  1468 + force_quiescent_state(rsp, 0);
  1469 + rdp->n_force_qs_snap = rsp->n_force_qs;
  1470 + rdp->qlen_last_fqs_check = rdp->qlen;
  1471 + }
1473 1472 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1474 1473 force_quiescent_state(rsp, 1);
1475 1474 local_irq_restore(flags);
1476 1475  
1477 1476  
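
The restructured __call_rcu() now consults grace-period state only when the local callback count has grown by more than qhimark since the last check, which is what takes the pressure off the rcu_node locks. A toy sketch of that hysteresis pattern (the threshold and the stand-in "expensive" path are illustrative, not the kernel's):

    #include <stdio.h>

    #define QHIMARK 10000   /* illustrative threshold */

    static long qlen, qlen_last_check;

    /* Stand-in for the rare slow path; in __call_rcu() this is where
     * grace-period bookkeeping and force_quiescent_state() happen. */
    static void expensive_check(void)
    {
            printf("slow path at qlen=%ld\n", qlen);
            qlen_last_check = qlen;
    }

    static void enqueue_callback(void)
    {
            if (++qlen > qlen_last_check + QHIMARK)
                    expensive_check();
            /* Common case: just count the callback and return. */
    }

    int main(void)
    {
            for (int i = 0; i < 35000; i++)
                    enqueue_callback();
            return 0;
    }
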
... ... @@ -1699,13 +1698,12 @@
1699 1698 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1700 1699 * might complete its grace period before all of the other CPUs
1701 1700 * did their increment, causing this function to return too
1702   - * early.
  1701 + * early. Note that on_each_cpu() disables irqs, which prevents
  1702 + * any CPUs from coming online or going offline until each online
  1703 + * CPU has queued its RCU-barrier callback.
1703 1704 */
1704 1705 atomic_set(&rcu_barrier_cpu_count, 1);
1705   - preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1706   - rcu_adopt_orphan_cbs(rsp);
1707 1706 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1708   - preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1709 1707 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1710 1708 complete(&rcu_barrier_completion);
1711 1709 wait_for_completion(&rcu_barrier_completion);
1712 1710  
... ... @@ -1831,18 +1829,13 @@
1831 1829 case CPU_DYING:
1832 1830 case CPU_DYING_FROZEN:
1833 1831 /*
1834   - * preempt_disable() in _rcu_barrier() prevents stop_machine(),
1835   - * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
1836   - * returns, all online cpus have queued rcu_barrier_func().
1837   - * The dying CPU clears its cpu_online_mask bit and
1838   - * moves all of its RCU callbacks to ->orphan_cbs_list
1839   - * in the context of stop_machine(), so subsequent calls
1840   - * to _rcu_barrier() will adopt these callbacks and only
1841   - * then queue rcu_barrier_func() on all remaining CPUs.
  1832 + * The whole machine is "stopped" except this CPU, so we can
  1833 + * touch any data without introducing corruption. We send the
  1834 + * dying CPU's callbacks to an arbitrarily chosen online CPU.
1842 1835 */
1843   - rcu_send_cbs_to_orphanage(&rcu_bh_state);
1844   - rcu_send_cbs_to_orphanage(&rcu_sched_state);
1845   - rcu_preempt_send_cbs_to_orphanage();
  1836 + rcu_send_cbs_to_online(&rcu_bh_state);
  1837 + rcu_send_cbs_to_online(&rcu_sched_state);
  1838 + rcu_preempt_send_cbs_to_online();
1846 1839 break;
1847 1840 case CPU_DEAD:
1848 1841 case CPU_DEAD_FROZEN:
1849 1842  
... ... @@ -1880,8 +1873,9 @@
1880 1873 {
1881 1874 int i;
1882 1875  
1883   - for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
  1876 + for (i = NUM_RCU_LVLS - 1; i > 0; i--)
1884 1877 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
  1878 + rsp->levelspread[0] = RCU_FANOUT_LEAF;
1885 1879 }
1886 1880 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1887 1881 static void __init rcu_init_levelspread(struct rcu_state *rsp)
kernel/rcutree.h
... ... @@ -31,46 +31,51 @@
31 31 /*
32 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 33 * In theory, it should be possible to add more levels straightforwardly.
34   - * In practice, this has not been tested, so there is probably some
35   - * bug somewhere.
  34 + * In practice, this did work well going from three levels to four.
  35 + * Of course, your mileage may vary.
36 36 */
37 37 #define MAX_RCU_LVLS 4
38   -#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39   -#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40   -#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41   -#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT)
  38 +#if CONFIG_RCU_FANOUT > 16
  39 +#define RCU_FANOUT_LEAF 16
  40 +#else /* #if CONFIG_RCU_FANOUT > 16 */
  41 +#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
  42 +#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
  43 +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
  44 +#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
  45 +#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
  46 +#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
42 47  
43   -#if NR_CPUS <= RCU_FANOUT
  48 +#if NR_CPUS <= RCU_FANOUT_1
44 49 # define NUM_RCU_LVLS 1
45 50 # define NUM_RCU_LVL_0 1
46 51 # define NUM_RCU_LVL_1 (NR_CPUS)
47 52 # define NUM_RCU_LVL_2 0
48 53 # define NUM_RCU_LVL_3 0
49 54 # define NUM_RCU_LVL_4 0
50   -#elif NR_CPUS <= RCU_FANOUT_SQ
  55 +#elif NR_CPUS <= RCU_FANOUT_2
51 56 # define NUM_RCU_LVLS 2
52 57 # define NUM_RCU_LVL_0 1
53   -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
  58 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
54 59 # define NUM_RCU_LVL_2 (NR_CPUS)
55 60 # define NUM_RCU_LVL_3 0
56 61 # define NUM_RCU_LVL_4 0
57   -#elif NR_CPUS <= RCU_FANOUT_CUBE
  62 +#elif NR_CPUS <= RCU_FANOUT_3
58 63 # define NUM_RCU_LVLS 3
59 64 # define NUM_RCU_LVL_0 1
60   -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
61   -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
62   -# define NUM_RCU_LVL_3 NR_CPUS
  65 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
  66 +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
  67 +# define NUM_RCU_LVL_3 (NR_CPUS)
63 68 # define NUM_RCU_LVL_4 0
64   -#elif NR_CPUS <= RCU_FANOUT_FOURTH
  69 +#elif NR_CPUS <= RCU_FANOUT_4
65 70 # define NUM_RCU_LVLS 4
66 71 # define NUM_RCU_LVL_0 1
67   -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
68   -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
69   -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
70   -# define NUM_RCU_LVL_4 NR_CPUS
  72 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
  73 +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
  74 +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
  75 +# define NUM_RCU_LVL_4 (NR_CPUS)
71 76 #else
72 77 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
73   -#endif /* #if (NR_CPUS) <= RCU_FANOUT */
  78 +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
74 79  
75 80 #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
76 81 #define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
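
With leaf-level fanout capped at RCU_FANOUT_LEAF, the shape of the hierarchy follows directly from these macros. A standalone sketch evaluating the geometry for one hypothetical configuration (NR_CPUS=4096 and CONFIG_RCU_FANOUT=64, so RCU_FANOUT_LEAF is 16):

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

    int main(void)
    {
            /* Illustrative values, not taken from any particular .config. */
            long nr_cpus = 4096, fanout = 64, leaf = 16;
            long lvl2 = DIV_ROUND_UP(nr_cpus, leaf);                /* leaf rcu_nodes */
            long lvl1 = DIV_ROUND_UP(nr_cpus, leaf * fanout);       /* interior nodes */
            long lvl0 = 1;                                          /* root rcu_node  */

            printf("rcu_node structures: %ld root + %ld interior + %ld leaf = %ld\n",
                   lvl0, lvl1, lvl2, lvl0 + lvl1 + lvl2);
            printf("at most %ld CPUs per leaf rcu_node\n", leaf);
            return 0;
    }

For that configuration this prints 1 + 4 + 256 = 261 rcu_node structures, so each leaf lock is shared by at most 16 CPUs even though the interior fanout is 64.
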
... ... @@ -203,8 +208,8 @@
203 208 long qlen_last_fqs_check;
204 209 /* qlen at last check for QS forcing */
205 210 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206   - unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */
207   - unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */
  211 + unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
  212 + unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
208 213 unsigned long n_force_qs_snap;
209 214 /* did other CPU force QS recently? */
210 215 long blimit; /* Upper limit on a processed batch */
... ... @@ -309,15 +314,7 @@
309 314 /* End of fields guarded by root rcu_node's lock. */
310 315  
311 316 raw_spinlock_t onofflock; /* exclude on/offline and */
312   - /* starting new GP. Also */
313   - /* protects the following */
314   - /* orphan_cbs fields. */
315   - struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
316   - /* orphaned by all CPUs in */
317   - /* a given leaf rcu_node */
318   - /* going offline. */
319   - struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
320   - long orphan_qlen; /* Number of orphaned cbs. */
  317 + /* starting new GP. */
321 318 raw_spinlock_t fqslock; /* Only one task forcing */
322 319 /* quiescent states. */
323 320 unsigned long jiffies_force_qs; /* Time at which to invoke */
... ... @@ -390,7 +387,7 @@
390 387 static int rcu_preempt_pending(int cpu);
391 388 static int rcu_preempt_needs_cpu(int cpu);
392 389 static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
393   -static void rcu_preempt_send_cbs_to_orphanage(void);
  390 +static void rcu_preempt_send_cbs_to_online(void);
394 391 static void __init __rcu_init_preempt(void);
395 392 static void rcu_needs_cpu_flush(void);
396 393  
kernel/rcutree_plugin.h
... ... @@ -25,6 +25,7 @@
25 25 */
26 26  
27 27 #include <linux/delay.h>
  28 +#include <linux/stop_machine.h>
28 29  
29 30 /*
30 31 * Check the RCU kernel configuration parameters and print informative
31 32  
32 33  
... ... @@ -773,11 +774,11 @@
773 774 }
774 775  
775 776 /*
776   - * Move preemptable RCU's callbacks to ->orphan_cbs_list.
  777 + * Move preemptable RCU's callbacks from dying CPU to other online CPU.
777 778 */
778   -static void rcu_preempt_send_cbs_to_orphanage(void)
  779 +static void rcu_preempt_send_cbs_to_online(void)
779 780 {
780   - rcu_send_cbs_to_orphanage(&rcu_preempt_state);
  781 + rcu_send_cbs_to_online(&rcu_preempt_state);
781 782 }
782 783  
783 784 /*
... ... @@ -1001,7 +1002,7 @@
1001 1002 /*
1002 1003 * Because there is no preemptable RCU, there are no callbacks to move.
1003 1004 */
1004   -static void rcu_preempt_send_cbs_to_orphanage(void)
  1005 +static void rcu_preempt_send_cbs_to_online(void)
1005 1006 {
1006 1007 }
1007 1008  
... ... @@ -1013,6 +1014,132 @@
1013 1014 }
1014 1015  
1015 1016 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
  1017 +
  1018 +#ifndef CONFIG_SMP
  1019 +
  1020 +void synchronize_sched_expedited(void)
  1021 +{
  1022 + cond_resched();
  1023 +}
  1024 +EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
  1025 +
  1026 +#else /* #ifndef CONFIG_SMP */
  1027 +
  1028 +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
  1029 +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
  1030 +
  1031 +static int synchronize_sched_expedited_cpu_stop(void *data)
  1032 +{
  1033 + /*
  1034 + * There must be a full memory barrier on each affected CPU
  1035 + * between the time that try_stop_cpus() is called and the
  1036 + * time that it returns.
  1037 + *
  1038 + * In the current initial implementation of cpu_stop, the
  1039 + * above condition is already met when the control reaches
  1040 + * this point and the following smp_mb() is not strictly
  1041 + * necessary. Do smp_mb() anyway for documentation and
  1042 + * robustness against future implementation changes.
  1043 + */
  1044 + smp_mb(); /* See above comment block. */
  1045 + return 0;
  1046 +}
  1047 +
  1048 +/*
  1049 + * Wait for an rcu-sched grace period to elapse, but use "big hammer"
  1050 + * approach to force grace period to end quickly. This consumes
  1051 + * significant time on all CPUs, and is thus not recommended for
  1052 + * any sort of common-case code.
  1053 + *
  1054 + * Note that it is illegal to call this function while holding any
  1055 + * lock that is acquired by a CPU-hotplug notifier. Failing to
  1056 + * observe this restriction will result in deadlock.
  1057 + *
  1058 + * This implementation can be thought of as an application of ticket
  1059 + * locking to RCU, with sync_sched_expedited_started and
  1060 + * sync_sched_expedited_done taking on the roles of the halves
  1061 + * of the ticket-lock word. Each task atomically increments
  1062 + * sync_sched_expedited_started upon entry, snapshotting the old value,
  1063 + * then attempts to stop all the CPUs. If this succeeds, then each
  1064 + * CPU will have executed a context switch, resulting in an RCU-sched
  1065 + * grace period. We are then done, so we use atomic_cmpxchg() to
  1066 + * update sync_sched_expedited_done to match our snapshot -- but
  1067 + * only if someone else has not already advanced past our snapshot.
  1068 + *
  1069 + * On the other hand, if try_stop_cpus() fails, we check the value
  1070 + * of sync_sched_expedited_done. If it has advanced past our
  1071 + * initial snapshot, then someone else must have forced a grace period
  1072 + * some time after we took our snapshot. In this case, our work is
  1073 + * done for us, and we can simply return. Otherwise, we try again,
  1074 + * but keep our initial snapshot for purposes of checking for someone
  1075 + * doing our work for us.
  1076 + *
  1077 + * If we fail too many times in a row, we fall back to synchronize_sched().
  1078 + */
  1079 +void synchronize_sched_expedited(void)
  1080 +{
  1081 + int firstsnap, s, snap, trycount = 0;
  1082 +
  1083 + /* Note that atomic_inc_return() implies full memory barrier. */
  1084 + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
  1085 + get_online_cpus();
  1086 +
  1087 + /*
  1088 + * Each pass through the following loop attempts to force a
  1089 + * context switch on each CPU.
  1090 + */
  1091 + while (try_stop_cpus(cpu_online_mask,
  1092 + synchronize_sched_expedited_cpu_stop,
  1093 + NULL) == -EAGAIN) {
  1094 + put_online_cpus();
  1095 +
  1096 + /* No joy, try again later. Or just synchronize_sched(). */
  1097 + if (trycount++ < 10)
  1098 + udelay(trycount * num_online_cpus());
  1099 + else {
  1100 + synchronize_sched();
  1101 + return;
  1102 + }
  1103 +
  1104 + /* Check to see if someone else did our work for us. */
  1105 + s = atomic_read(&sync_sched_expedited_done);
  1106 + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
  1107 + smp_mb(); /* ensure test happens before caller kfree */
  1108 + return;
  1109 + }
  1110 +
  1111 + /*
  1112 + * Refetching sync_sched_expedited_started allows later
  1113 + * callers to piggyback on our grace period. We subtract
  1114 + * 1 to get the same token that the last incrementer got.
  1115 + * We retry after they started, so our grace period works
  1116 + * for them, and they started after our first try, so their
  1117 + * grace period works for us.
  1118 + */
  1119 + get_online_cpus();
  1120 + snap = atomic_read(&sync_sched_expedited_started) - 1;
  1121 + smp_mb(); /* ensure read is before try_stop_cpus(). */
  1122 + }
  1123 +
  1124 + /*
  1125 + * Everyone up to our most recent fetch is covered by our grace
  1126 + * period. Update the counter, but only if our work is still
  1127 + * relevant -- which it won't be if someone who started later
  1128 + * than we did beat us to the punch.
  1129 + */
  1130 + do {
  1131 + s = atomic_read(&sync_sched_expedited_done);
  1132 + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
  1133 + smp_mb(); /* ensure test happens before caller kfree */
  1134 + break;
  1135 + }
  1136 + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
  1137 +
  1138 + put_online_cpus();
  1139 +}
  1140 +EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
  1141 +
  1142 +#endif /* #else #ifndef CONFIG_SMP */
1016 1143  
1017 1144 #if !defined(CONFIG_RCU_FAST_NO_HZ)
1018 1145  
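
The sync_sched_expedited_started/done pair behaves like the two halves of a ticket lock, as the block comment above explains. The sketch below illustrates only that snapshot-and-cmpxchg bookkeeping, using C11 atomics in userspace; try_stop_cpus() is stubbed out as a random failure, and details such as the kernel's off-by-one refetch of the started counter are omitted:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    static atomic_uint started, done;

    /* Stand-in for try_stop_cpus(): pretend it sometimes fails with -EAGAIN. */
    static int try_force_grace_period(void)
    {
            return rand() % 2 ? 0 : -1;
    }

    static void expedited(void)
    {
            unsigned int firstsnap, snap, s;

            /* Take a ticket: the post-increment value is our token. */
            firstsnap = snap = atomic_fetch_add(&started, 1) + 1;

            while (try_force_grace_period() != 0) {
                    /* Did someone else's grace period already cover us? */
                    s = atomic_load(&done);
                    if ((int)(s - firstsnap) >= 0) {
                            printf("piggybacked on another caller\n");
                            return;
                    }
                    /* Retry with a fresher snapshot so later starters are covered. */
                    snap = atomic_load(&started);
            }

            /* Advance "done" to our snapshot unless someone beat us to it. */
            do {
                    s = atomic_load(&done);
                    if ((int)(s - snap) >= 0)
                            break;
            } while (!atomic_compare_exchange_weak(&done, &s, snap));
            printf("forced a grace period, done=%u\n", atomic_load(&done));
    }

    int main(void)
    {
            for (int i = 0; i < 4; i++)
                    expedited();
            return 0;
    }
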
kernel/rcutree_trace.c
... ... @@ -166,13 +166,13 @@
166 166  
167 167 gpnum = rsp->gpnum;
168 168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
169   - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
  169 + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
170 170 rsp->completed, gpnum, rsp->signaled,
171 171 (long)(rsp->jiffies_force_qs - jiffies),
172 172 (int)(jiffies & 0xffff),
173 173 rsp->n_force_qs, rsp->n_force_qs_ngp,
174 174 rsp->n_force_qs - rsp->n_force_qs_ngp,
175   - rsp->n_force_qs_lh, rsp->orphan_qlen);
  175 + rsp->n_force_qs_lh);
176 176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
177 177 if (rnp->level != level) {
178 178 seq_puts(m, "\n");
... ... @@ -300,7 +300,7 @@
300 300  
301 301 static struct dentry *rcudir;
302 302  
303   -static int __init rcuclassic_trace_init(void)
  303 +static int __init rcutree_trace_init(void)
304 304 {
305 305 struct dentry *retval;
306 306  
307 307  
... ... @@ -337,14 +337,14 @@
337 337 return 1;
338 338 }
339 339  
340   -static void __exit rcuclassic_trace_cleanup(void)
  340 +static void __exit rcutree_trace_cleanup(void)
341 341 {
342 342 debugfs_remove_recursive(rcudir);
343 343 }
344 344  
345 345  
346   -module_init(rcuclassic_trace_init);
347   -module_exit(rcuclassic_trace_cleanup);
  346 +module_init(rcutree_trace_init);
  347 +module_exit(rcutree_trace_cleanup);
348 348  
349 349 MODULE_AUTHOR("Paul E. McKenney");
350 350 MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
kernel/sched.c
... ... @@ -9533,74 +9533,4 @@
9533 9533 .subsys_id = cpuacct_subsys_id,
9534 9534 };
9535 9535 #endif /* CONFIG_CGROUP_CPUACCT */
9536   -
9537   -#ifndef CONFIG_SMP
9538   -
9539   -void synchronize_sched_expedited(void)
9540   -{
9541   - barrier();
9542   -}
9543   -EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9544   -
9545   -#else /* #ifndef CONFIG_SMP */
9546   -
9547   -static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9548   -
9549   -static int synchronize_sched_expedited_cpu_stop(void *data)
9550   -{
9551   - /*
9552   - * There must be a full memory barrier on each affected CPU
9553   - * between the time that try_stop_cpus() is called and the
9554   - * time that it returns.
9555   - *
9556   - * In the current initial implementation of cpu_stop, the
9557   - * above condition is already met when the control reaches
9558   - * this point and the following smp_mb() is not strictly
9559   - * necessary. Do smp_mb() anyway for documentation and
9560   - * robustness against future implementation changes.
9561   - */
9562   - smp_mb(); /* See above comment block. */
9563   - return 0;
9564   -}
9565   -
9566   -/*
9567   - * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9568   - * approach to force grace period to end quickly. This consumes
9569   - * significant time on all CPUs, and is thus not recommended for
9570   - * any sort of common-case code.
9571   - *
9572   - * Note that it is illegal to call this function while holding any
9573   - * lock that is acquired by a CPU-hotplug notifier. Failing to
9574   - * observe this restriction will result in deadlock.
9575   - */
9576   -void synchronize_sched_expedited(void)
9577   -{
9578   - int snap, trycount = 0;
9579   -
9580   - smp_mb(); /* ensure prior mod happens before capturing snap. */
9581   - snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9582   - get_online_cpus();
9583   - while (try_stop_cpus(cpu_online_mask,
9584   - synchronize_sched_expedited_cpu_stop,
9585   - NULL) == -EAGAIN) {
9586   - put_online_cpus();
9587   - if (trycount++ < 10)
9588   - udelay(trycount * num_online_cpus());
9589   - else {
9590   - synchronize_sched();
9591   - return;
9592   - }
9593   - if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9594   - smp_mb(); /* ensure test happens before caller kfree */
9595   - return;
9596   - }
9597   - get_online_cpus();
9598   - }
9599   - atomic_inc(&synchronize_sched_expedited_count);
9600   - smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9601   - put_online_cpus();
9602   -}
9603   -EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9604   -
9605   -#endif /* #else #ifndef CONFIG_SMP */
kernel/srcu.c
... ... @@ -31,6 +31,7 @@
31 31 #include <linux/rcupdate.h>
32 32 #include <linux/sched.h>
33 33 #include <linux/smp.h>
  34 +#include <linux/delay.h>
34 35 #include <linux/srcu.h>
35 36  
36 37 static int init_srcu_struct_fields(struct srcu_struct *sp)
37 38  
... ... @@ -203,9 +204,14 @@
203 204 * all srcu_read_lock() calls using the old counters have completed.
204 205 * Their corresponding critical sections might well be still
205 206 * executing, but the srcu_read_lock() primitives themselves
206   - * will have finished executing.
  207 + * will have finished executing. We initially give readers
  208 + * an arbitrarily chosen 10 microseconds to get out of their
  209 + * SRCU read-side critical sections, then loop waiting 1/HZ
  210 + * seconds per iteration.
207 211 */
208 212  
  213 + if (srcu_readers_active_idx(sp, idx))
  214 + udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
209 215 while (srcu_readers_active_idx(sp, idx))
210 216 schedule_timeout_interruptible(1);
211 217
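
The reworked wait is a two-phase poll: one short delay of CONFIG_SRCU_SYNCHRONIZE_DELAY microseconds for the common case of short read-side critical sections, then a sleep of roughly one jiffy per pass for stragglers. A userspace sketch of that shape (the reader-count check and the delay values are stand-ins, not SRCU's):

    #include <stdio.h>
    #include <unistd.h>

    #define SPIN_DELAY_US   10      /* plays the role of CONFIG_SRCU_SYNCHRONIZE_DELAY */

    static int passes_remaining = 3;

    /* Stand-in for srcu_readers_active_idx(): nonzero while readers remain. */
    static int readers_active(void)
    {
            return passes_remaining-- > 0;
    }

    int main(void)
    {
            /* Phase 1: one brief wait, enough for short readers to drain. */
            if (readers_active())
                    usleep(SPIN_DELAY_US);

            /* Phase 2: slow path, sleep about a scheduler tick per pass. */
            while (readers_active())
                    usleep(10000);  /* ~1/HZ with HZ=100, purely illustrative */

            printf("all readers done\n");
            return 0;
    }
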