Commit 2af49b6058d857fa5b476db642d4452bf5833ecd

Authored by Linus Torvalds

Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  rcu: remove unused __list_for_each_rcu() macro
  rculist: fix borked __list_for_each_rcu() macro
  rcu: reduce __call_rcu()-induced contention on rcu_node structures
  rcu: limit rcu_node leaf-level fanout
  rcu: fine-tune grace-period begin/end checks
  rcu: Keep gpnum and completed fields synchronized
  rcu: Stop chasing QS if another CPU did it for us
  rcu: increase synchronize_sched_expedited() batching
  rcu: Make synchronize_srcu_expedited() fast if running readers
  rcu: fix race condition in synchronize_sched_expedited()
  rcu: update documentation/comments for Lai's adoption patch
  rcu,cleanup: simplify the code when cpu is dying
  rcu,cleanup: move synchronize_sched_expedited() out of sched.c
  rcu: get rid of obsolete "classic" names in TREE_RCU tracing
  rcu: Distinguish between boosting and boosted
  rcu: document TINY_RCU and TINY_PREEMPT_RCU tracing.
  rcu: add tracing for TINY_RCU and TINY_PREEMPT_RCU
  rcu: priority boosting for TINY_PREEMPT_RCU
  rcu: move TINY_RCU from softirq to kthread
  rcu: add priority-inversion testing to rcutorture

Showing 17 changed files

Documentation/RCU/trace.txt
1 1 CONFIG_RCU_TRACE debugfs Files and Formats
2 2  
3 3  
4   -The rcutree implementation of RCU provides debugfs trace output that
5   -summarizes counters and state. This information is useful for debugging
6   -RCU itself, and can sometimes also help to debug abuses of RCU.
7   -The following sections describe the debugfs files and formats.
  4 +The rcutree and rcutiny implementations of RCU provide debugfs trace
  5 +output that summarizes counters and state. This information is useful for
  6 +debugging RCU itself, and can sometimes also help to debug abuses of RCU.
  7 +The following sections describe the debugfs files and formats, first
  8 +for rcutree and next for rcutiny.
8 9  
9 10  
10   -Hierarchical RCU debugfs Files and Formats
  11 +CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
11 12  
12   -This implementation of RCU provides three debugfs files under the
  13 +These implementations of RCU provide five debugfs files under the
13 14 top-level directory RCU: rcu/rcudata (which displays fields in struct
14   -rcu_data), rcu/rcugp (which displays grace-period counters), and
15   -rcu/rcuhier (which displays the struct rcu_node hierarchy).
  15 +rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
  16 +rcu/rcudata), rcu/rcugp (which displays grace-period counters),
  17 +rcu/rcuhier (which displays the struct rcu_node hierarchy), and
  18 +rcu/rcu_pending (which displays counts of the reasons that the
  19 +rcu_pending() function decided that there was core RCU work to do).
16 20  
17 21 The output of "cat rcu/rcudata" looks as follows:
18 22  
... ... @@ -130,7 +134,8 @@
130 134 been registered in absence of CPU-hotplug activity.
131 135  
132 136 o "co" is the number of RCU callbacks that have been orphaned due to
133   - this CPU going offline.
  137 + this CPU going offline. These orphaned callbacks have been moved
  138 + to an arbitrarily chosen online CPU.
134 139  
135 140 o "ca" is the number of RCU callbacks that have been adopted due to
136 141 other CPUs going offline. Note that ci+co-ca+ql is the number of
137 142  
... ... @@ -168,12 +173,12 @@
168 173  
169 174 The output of "cat rcu/rcuhier" looks as follows, with very long lines:
170 175  
171   -c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
  176 +c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
172 177 1/1 .>. 0:127 ^0
173 178 3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
174 179 3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
175 180 rcu_bh:
176   -c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
  181 +c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
177 182 0/1 .>. 0:127 ^0
178 183 0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
179 184 0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
... ... @@ -212,11 +217,6 @@
212 217 exited immediately (without even being counted in nfqs above)
213 218 due to contention on ->fqslock.
214 219  
215   -o "oqlen" is the number of callbacks on the "orphan" callback
216   - list. RCU callbacks are placed on this list by CPUs going
217   - offline, and are "adopted" either by the CPU helping the outgoing
218   - CPU or by the next rcu_barrier*() call, whichever comes first.
219   -
220 220 o Each element of the form "1/1 0:127 ^0" represents one struct
221 221 rcu_node. Each line represents one level of the hierarchy, from
222 222 root to leaves. It is best to think of the rcu_data structures
... ... @@ -326,4 +326,116 @@
326 326 readers will note that the rcu "nn" number for a given CPU very
327 327 closely matches the rcu_bh "np" number for that same CPU. This
328 328 is due to short-circuit evaluation in rcu_pending().
  329 +
  330 +
  331 +CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
  332 +
  333 +These implementations of RCU provide a single debugfs file under the
  334 +top-level directory RCU, namely rcu/rcudata, which displays fields in
  335 +rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
  336 +rcu_preempt_ctrlblk.
  337 +
  338 +The output of "cat rcu/rcudata" is as follows:
  339 +
  340 +rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
  341 + ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
  342 + normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
  343 + exp balk: bt=0 nos=0
  344 +rcu_sched: qlen: 0
  345 +rcu_bh: qlen: 0
  346 +
  347 +This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
  348 +rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
  349 +The last three lines of the rcu_preempt section appear only in
  350 +CONFIG_RCU_BOOST kernel builds. The fields are as follows:
  351 +
  352 +o "qlen" is the number of RCU callbacks currently waiting either
  353 + for an RCU grace period or waiting to be invoked. This is the
  354 + only field present for rcu_sched and rcu_bh, due to the
  355 + short-circuiting of grace periods in those two cases.
  356 +
  357 +o "gp" is the number of grace periods that have completed.
  358 +
  359 +o "g197/p197/c197" displays the grace-period state, with the
  360 + "g" number being the number of grace periods that have started
  361 + (mod 256), the "p" number being the number of grace periods
  362 + that the CPU has responded to (also mod 256), and the "c"
  363 + number being the number of grace periods that have completed
  364 + (once again mod 256).
  365 +
  366 + Why have both "gp" and "g"? Because the data flowing into
  367 + "gp" is only present in a CONFIG_RCU_TRACE kernel.
  368 +
  369 +o "tasks" is a set of bits. The first bit is "T" if there are
  370 + currently tasks that have recently blocked within an RCU
  371 + read-side critical section, the second bit is "N" if any of the
  372 + aforementioned tasks are blocking the current RCU grace period,
  373 + and the third bit is "E" if any of the aforementioned tasks are
  374 + blocking the current expedited grace period. Each bit is "."
  375 + if the corresponding condition does not hold.
  376 +
  377 +o "ttb" is a single bit. It is "B" if any of the blocked tasks
  378 + need to be priority boosted and "." otherwise.
  379 +
  380 +o "btg" indicates whether boosting has been carried out during
  381 + the current grace period, with "exp" indicating that boosting
  382 + is in progress for an expedited grace period, "no" indicating
  383 + that boosting has not yet started for a normal grace period,
  384 + period, "begun" indicating that boosting has begun for a normal grace
  385 + period, and "done" indicating that boosting has completed for
  386 + a normal grace period.
  387 +
  388 +o "ntb" is the total number of tasks subjected to RCU priority boosting
  389 + since boot.
  390 +
  391 +o "neb" is the number of expedited grace periods that have had
  392 + to resort to RCU priority boosting since boot.
  393 +
  394 +o "nnb" is the number of normal grace periods that have had
  395 + to resort to RCU priority boosting since boot.
  396 +
  397 +o "j" is the low-order 16 bits of the jiffies counter in hexadecimal.
  398 +
  399 +o "bt" is the low-order 16 bits of the value that the jiffies counter
  400 + will have at the next time that boosting is scheduled to begin.
  401 +
  402 +o In the line beginning with "normal balk", the fields are as follows:
  403 +
  404 + o "nt" is the number of times that the system balked from
  405 + boosting because there were no blocked tasks to boost.
  406 + Note that the system will balk from boosting even if the
  407 + grace period is overdue when the currently running task
  408 + is looping within an RCU read-side critical section.
  409 + There is no point in boosting in this case, because
  410 + boosting a running task won't make it run any faster.
  411 +
  412 + o "gt" is the number of times that the system balked
  413 + from boosting because, although there were blocked tasks,
  414 + none of them were preventing the current grace period
  415 + from completing.
  416 +
  417 + o "bt" is the number of times that the system balked
  418 + from boosting because boosting was already in progress.
  419 +
  420 + o "b" is the number of times that the system balked from
  421 + boosting because boosting had already completed for
  422 + the grace period in question.
  423 +
  424 + o "ny" is the number of times that the system balked from
  425 + boosting because it was not yet time to start boosting
  426 + the grace period in question.
  427 +
  428 + o "nos" is the number of times that the system balked from
  429 + boosting for inexplicable ("not otherwise specified")
  430 + reasons. This can actually happen due to races involving
  431 + increments of the jiffies counter.
  432 +
  433 +o In the line beginning with "exp balk", the fields are as follows:
  434 +
  435 + o "bt" is the number of times that the system balked from
  436 + boosting because there were no blocked tasks to boost.
  437 +
  438 + o "nos" is the number of times that the system balked from
  439 + boosting for inexplicable ("not otherwise specified")
  440 + reasons.
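
As an aside on how the rcu/rcudata output above is produced: the single-character
flags and the wrapping g/p/c numbers come from plain C idioms in
show_tiny_preempt_stats() later in this patch, namely u8 counters (so everything
is mod 256) and the two-character string-indexing trick "T."[cond]. A stand-alone
sketch of both idioms follows; the struct and variable names are illustrative
only, not the kernel's.

#include <stdio.h>

/* Illustrative stand-in for the TINY_PREEMPT_RCU grace-period counters:
 * the kernel fields are u8, so the "g", "p", and "c" numbers wrap mod 256. */
struct gp_counters {
        unsigned char gpnum;            /* "g": grace periods started */
        unsigned char gpcpu;            /* "p": grace periods this CPU responded to */
        unsigned char completed;        /* "c": grace periods completed */
};

int main(void)
{
        struct gp_counters gp = { .gpnum = 197, .gpcpu = 197, .completed = 197 };
        int blkd_tasks_empty = 0;       /* pretend some readers have blocked... */
        int gp_tasks_null = 1;          /* ...but none block the current GP */

        /* "T."[cond] yields 'T' when cond is 0 and '.' when it is 1, just as
         * in the kernel's seq_printf() calls. */
        printf("g%d/p%d/c%d tasks=%c%c\n", gp.gpnum, gp.gpcpu, gp.completed,
               "T."[blkd_tasks_empty], "N."[gp_tasks_null]);

        gp.gpnum = 255;
        gp.gpnum++;                     /* u8 arithmetic wraps: 255 + 1 == 0 */
        printf("g%d\n", gp.gpnum);
        return 0;
}
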
include/linux/init_task.h
... ... @@ -83,6 +83,12 @@
83 83 */
84 84 # define CAP_INIT_BSET CAP_FULL_SET
85 85  
  86 +#ifdef CONFIG_RCU_BOOST
  87 +#define INIT_TASK_RCU_BOOST() \
  88 + .rcu_boost_mutex = NULL,
  89 +#else
  90 +#define INIT_TASK_RCU_BOOST()
  91 +#endif
86 92 #ifdef CONFIG_TREE_PREEMPT_RCU
87 93 #define INIT_TASK_RCU_TREE_PREEMPT() \
88 94 .rcu_blocked_node = NULL,
... ... @@ -94,7 +100,8 @@
94 100 .rcu_read_lock_nesting = 0, \
95 101 .rcu_read_unlock_special = 0, \
96 102 .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \
97   - INIT_TASK_RCU_TREE_PREEMPT()
  103 + INIT_TASK_RCU_TREE_PREEMPT() \
  104 + INIT_TASK_RCU_BOOST()
98 105 #else
99 106 #define INIT_TASK_RCU_PREEMPT(tsk)
100 107 #endif
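
The INIT_TASK_RCU_BOOST() addition above uses the kernel's usual pattern for
optional fields in a static initializer: a helper macro expands to a
designated-initializer fragment when the option is configured and to nothing
otherwise, so INIT_TASK_RCU_PREEMPT() can simply string the helpers together.
A minimal user-space sketch of that pattern (FEATURE_X, struct demo, and the
INIT_* names are made up for illustration):

#include <stdio.h>

#define FEATURE_X 1                     /* stands in for CONFIG_RCU_BOOST */

struct demo {
        int always;
#if FEATURE_X
        void *only_with_x;              /* stands in for ->rcu_boost_mutex */
#endif
};

#if FEATURE_X
#define INIT_DEMO_X()   .only_with_x = NULL,
#else
#define INIT_DEMO_X()
#endif

/* The trailing comma from INIT_DEMO_X() is harmless in an initializer list,
 * and the whole fragment vanishes when FEATURE_X is 0. */
#define INIT_DEMO       { .always = 1, INIT_DEMO_X() }

int main(void)
{
        struct demo d = INIT_DEMO;

        printf("always=%d\n", d.always);
        return 0;
}
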
include/linux/rculist.h
... ... @@ -241,11 +241,6 @@
241 241 #define list_first_entry_rcu(ptr, type, member) \
242 242 list_entry_rcu((ptr)->next, type, member)
243 243  
244   -#define __list_for_each_rcu(pos, head) \
245   - for (pos = rcu_dereference_raw(list_next_rcu(head)); \
246   - pos != (head); \
247   - pos = rcu_dereference_raw(list_next_rcu((pos)))
248   -
249 244 /**
250 245 * list_for_each_entry_rcu - iterate over rcu list of given type
251 246 * @pos: the type * to use as a loop cursor.
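
With the unused __list_for_each_rcu() macro removed above, RCU list traversal
is expected to go through list_for_each_entry_rcu(), documented just below the
deleted lines. A kernel-context sketch of the usual reader-side pattern; the
struct, list, and function names are made up, and this is a fragment for a
kernel build, not a stand-alone program:

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/printk.h>

struct foo {
        int val;
        struct list_head list;          /* linked into foo_list via list_*_rcu() */
};

static LIST_HEAD(foo_list);             /* hypothetical RCU-protected list */

static void print_foos(void)
{
        struct foo *p;

        /* Readers traverse under rcu_read_lock(); writers publish with
         * list_add_rcu() and wait a grace period before freeing entries. */
        rcu_read_lock();
        list_for_each_entry_rcu(p, &foo_list, list)
                pr_info("foo: val=%d\n", p->val);
        rcu_read_unlock();
}
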
include/linux/rcupdate.h
... ... @@ -47,6 +47,8 @@
47 47 extern int rcutorture_runnable; /* for sysctl */
48 48 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
49 49  
  50 +#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
  51 +#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
50 52 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
51 53 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
52 54  
... ... @@ -66,7 +68,6 @@
66 68 extern void synchronize_sched(void);
67 69 extern void rcu_barrier_bh(void);
68 70 extern void rcu_barrier_sched(void);
69   -extern void synchronize_sched_expedited(void);
70 71 extern int sched_expedited_torture_stats(char *page);
71 72  
72 73 static inline void __rcu_read_lock_bh(void)
... ... @@ -118,7 +119,6 @@
118 119 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
119 120  
120 121 /* Internal to kernel */
121   -extern void rcu_init(void);
122 122 extern void rcu_sched_qs(int cpu);
123 123 extern void rcu_bh_qs(int cpu);
124 124 extern void rcu_check_callbacks(int cpu, int user);
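
The UINT_CMP_GE()/UINT_CMP_LT() macros added above (alongside the existing
ULONG_CMP_* pair) compare free-running unsigned counters in a wraparound-tolerant
way: a counts as "at or after" b whenever the unsigned difference (a) - (b) is
at most half the counter range, the same idea behind the kernel's time_after()
helpers. A small stand-alone demonstration:

#include <limits.h>
#include <stdio.h>

#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))

int main(void)
{
        unsigned int snap_old = UINT_MAX - 1;   /* counter just before wrapping */
        unsigned int snap_new = snap_old + 3;   /* wraps around to 1 */

        /* snap_new is numerically smaller but logically later than snap_old. */
        printf("GE(new, old) = %d\n", UINT_CMP_GE(snap_new, snap_old)); /* 1 */
        printf("LT(old, new) = %d\n", UINT_CMP_LT(snap_old, snap_new)); /* 1 */
        return 0;
}
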
include/linux/rcutiny.h
... ... @@ -27,7 +27,9 @@
27 27  
28 28 #include <linux/cache.h>
29 29  
30   -#define rcu_init_sched() do { } while (0)
  30 +static inline void rcu_init(void)
  31 +{
  32 +}
31 33  
32 34 #ifdef CONFIG_TINY_RCU
33 35  
... ... @@ -58,6 +60,11 @@
58 60 synchronize_sched();
59 61 }
60 62  
  63 +static inline void synchronize_sched_expedited(void)
  64 +{
  65 + synchronize_sched();
  66 +}
  67 +
61 68 #ifdef CONFIG_TINY_RCU
62 69  
63 70 static inline void rcu_preempt_note_context_switch(void)
64 71  
65 72  
66 73  
... ... @@ -125,16 +132,12 @@
125 132 }
126 133  
127 134 #ifdef CONFIG_DEBUG_LOCK_ALLOC
128   -
129 135 extern int rcu_scheduler_active __read_mostly;
130 136 extern void rcu_scheduler_starting(void);
131   -
132 137 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
133   -
134 138 static inline void rcu_scheduler_starting(void)
135 139 {
136 140 }
137   -
138 141 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
139 142  
140 143 #endif /* __LINUX_RCUTINY_H */
include/linux/rcutree.h
... ... @@ -30,6 +30,7 @@
30 30 #ifndef __LINUX_RCUTREE_H
31 31 #define __LINUX_RCUTREE_H
32 32  
  33 +extern void rcu_init(void);
33 34 extern void rcu_note_context_switch(int cpu);
34 35 extern int rcu_needs_cpu(int cpu);
35 36 extern void rcu_cpu_stall_reset(void);
... ... @@ -47,6 +48,7 @@
47 48 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
48 49  
49 50 extern void synchronize_rcu_bh(void);
  51 +extern void synchronize_sched_expedited(void);
50 52 extern void synchronize_rcu_expedited(void);
51 53  
52 54 static inline void synchronize_rcu_bh_expedited(void)
include/linux/sched.h
... ... @@ -1229,6 +1229,9 @@
1229 1229 #ifdef CONFIG_TREE_PREEMPT_RCU
1230 1230 struct rcu_node *rcu_blocked_node;
1231 1231 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
  1232 +#ifdef CONFIG_RCU_BOOST
  1233 + struct rt_mutex *rcu_boost_mutex;
  1234 +#endif /* #ifdef CONFIG_RCU_BOOST */
1232 1235  
1233 1236 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1234 1237 struct sched_info sched_info;
... ... @@ -1759,7 +1762,8 @@
1759 1762 #ifdef CONFIG_PREEMPT_RCU
1760 1763  
1761 1764 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
1762   -#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
  1765 +#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
  1766 +#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
1763 1767  
1764 1768 static inline void rcu_copy_process(struct task_struct *p)
1765 1769 {
... ... @@ -1767,7 +1771,10 @@
1767 1771 p->rcu_read_unlock_special = 0;
1768 1772 #ifdef CONFIG_TREE_PREEMPT_RCU
1769 1773 p->rcu_blocked_node = NULL;
1770   -#endif
  1774 +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
  1775 +#ifdef CONFIG_RCU_BOOST
  1776 + p->rcu_boost_mutex = NULL;
  1777 +#endif /* #ifdef CONFIG_RCU_BOOST */
1771 1778 INIT_LIST_HEAD(&p->rcu_node_entry);
1772 1779 }
1773 1780  
init/Kconfig
... ... @@ -393,7 +393,6 @@
393 393  
394 394 config RCU_TRACE
395 395 bool "Enable tracing for RCU"
396   - depends on TREE_RCU || TREE_PREEMPT_RCU
397 396 help
398 397 This option provides tracing in RCU which presents stats
399 398 in debugfs for debugging RCU implementation.
... ... @@ -458,6 +457,60 @@
458 457 This option provides tracing for the TREE_RCU and
459 458 TREE_PREEMPT_RCU implementations, permitting Makefile to
460 459 trivially select kernel/rcutree_trace.c.
  460 +
  461 +config RCU_BOOST
  462 + bool "Enable RCU priority boosting"
  463 + depends on RT_MUTEXES && TINY_PREEMPT_RCU
  464 + default n
  465 + help
  466 + This option boosts the priority of preempted RCU readers that
  467 + block the current preemptible RCU grace period for too long.
  468 + This option also prevents heavy loads from blocking RCU
  469 + callback invocation for all flavors of RCU.
  470 +
  471 + Say Y here if you are working with real-time apps or heavy loads.
  472 + Say N here if you are unsure.
  473 +
  474 +config RCU_BOOST_PRIO
  475 + int "Real-time priority to boost RCU readers to"
  476 + range 1 99
  477 + depends on RCU_BOOST
  478 + default 1
  479 + help
  480 + This option specifies the real-time priority to which preempted
  481 + RCU readers are to be boosted. If you are working with CPU-bound
  482 + real-time applications, you should specify a priority higher than
  483 + the highest-priority CPU-bound application.
  484 +
  485 + Specify the real-time priority, or take the default if unsure.
  486 +
  487 +config RCU_BOOST_DELAY
  488 + int "Milliseconds to delay boosting after RCU grace-period start"
  489 + range 0 3000
  490 + depends on RCU_BOOST
  491 + default 500
  492 + help
  493 + This option specifies the time to wait after the beginning of
  494 + a given grace period before priority-boosting preempted RCU
  495 + readers blocking that grace period. Note that any RCU reader
  496 + blocking an expedited RCU grace period is boosted immediately.
  497 +
  498 + Accept the default if unsure.
  499 +
  500 +config SRCU_SYNCHRONIZE_DELAY
  501 + int "Microseconds to delay before waiting for readers"
  502 + range 0 20
  503 + default 10
  504 + help
  505 + This option controls how long SRCU delays before entering its
  506 + loop waiting on SRCU readers. The purpose of this loop is
  507 + to avoid the unconditional context-switch penalty that would
  508 + otherwise be incurred if there was an active SRCU reader,
  509 + in a manner similar to adaptive locking schemes. This should
  510 + be set to be a bit longer than the common-case SRCU read-side
  511 + critical-section overhead.
  512 +
  513 + Accept the default if unsure.
461 514  
462 515 endmenu # "RCU Subsystem"
463 516  
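
For a concrete feel for RCU_BOOST_DELAY above: the TINY_PREEMPT_RCU code later
in this patch converts the millisecond value to jiffies as
DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000). A worked example; the HZ and
delay values below are illustrative, not taken from any particular configuration:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

/* Example build values only. */
#define HZ                      250
#define CONFIG_RCU_BOOST_DELAY  500     /* milliseconds */

int main(void)
{
        /* 500 ms at HZ=250 gives 125 ticks of grace before boosting starts. */
        printf("%d jiffies\n", DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000));
        return 0;
}
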
kernel/rcutiny.c
... ... @@ -36,31 +36,16 @@
36 36 #include <linux/time.h>
37 37 #include <linux/cpu.h>
38 38  
39   -/* Global control variables for rcupdate callback mechanism. */
40   -struct rcu_ctrlblk {
41   - struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
42   - struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
43   - struct rcu_head **curtail; /* ->next pointer of last CB. */
44   -};
  39 +/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
  40 +static struct task_struct *rcu_kthread_task;
  41 +static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
  42 +static unsigned long have_rcu_kthread_work;
  43 +static void invoke_rcu_kthread(void);
45 44  
46   -/* Definition for rcupdate control block. */
47   -static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48   - .donetail = &rcu_sched_ctrlblk.rcucblist,
49   - .curtail = &rcu_sched_ctrlblk.rcucblist,
50   -};
51   -
52   -static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53   - .donetail = &rcu_bh_ctrlblk.rcucblist,
54   - .curtail = &rcu_bh_ctrlblk.rcucblist,
55   -};
56   -
57   -#ifdef CONFIG_DEBUG_LOCK_ALLOC
58   -int rcu_scheduler_active __read_mostly;
59   -EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60   -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61   -
62 45 /* Forward declarations for rcutiny_plugin.h. */
63   -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
  46 +struct rcu_ctrlblk;
  47 +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
  48 +static int rcu_kthread(void *arg);
64 49 static void __call_rcu(struct rcu_head *head,
65 50 void (*func)(struct rcu_head *rcu),
66 51 struct rcu_ctrlblk *rcp);
... ... @@ -123,7 +108,7 @@
123 108 {
124 109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
125 110 rcu_qsctr_help(&rcu_bh_ctrlblk))
126   - raise_softirq(RCU_SOFTIRQ);
  111 + invoke_rcu_kthread();
127 112 }
128 113  
129 114 /*
... ... @@ -132,7 +117,7 @@
132 117 void rcu_bh_qs(int cpu)
133 118 {
134 119 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
135   - raise_softirq(RCU_SOFTIRQ);
  120 + invoke_rcu_kthread();
136 121 }
137 122  
138 123 /*
139 124  
140 125  
... ... @@ -152,13 +137,14 @@
152 137 }
153 138  
154 139 /*
155   - * Helper function for rcu_process_callbacks() that operates on the
156   - * specified rcu_ctrlkblk structure.
  140 + * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
  141 + * whose grace period has elapsed.
157 142 */
158   -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
  143 +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
159 144 {
160 145 struct rcu_head *next, *list;
161 146 unsigned long flags;
  147 + RCU_TRACE(int cb_count = 0);
162 148  
163 149 /* If no RCU callbacks ready to invoke, just return. */
164 150 if (&rcp->rcucblist == rcp->donetail)
165 151  
166 152  
167 153  
168 154  
169 155  
170 156  
171 157  
... ... @@ -180,22 +166,61 @@
180 166 next = list->next;
181 167 prefetch(next);
182 168 debug_rcu_head_unqueue(list);
  169 + local_bh_disable();
183 170 list->func(list);
  171 + local_bh_enable();
184 172 list = next;
  173 + RCU_TRACE(cb_count++);
185 174 }
  175 + RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186 176 }
187 177  
188 178 /*
189   - * Invoke any callbacks whose grace period has completed.
  179 + * This kthread invokes RCU callbacks whose grace periods have
  180 + * elapsed. It is awakened as needed, and takes the place of the
  181 + * RCU_SOFTIRQ that was used previously for this purpose.
  182 + * This is a kthread, but it is never stopped, at least not until
  183 + * the system goes down.
190 184 */
191   -static void rcu_process_callbacks(struct softirq_action *unused)
  185 +static int rcu_kthread(void *arg)
192 186 {
193   - __rcu_process_callbacks(&rcu_sched_ctrlblk);
194   - __rcu_process_callbacks(&rcu_bh_ctrlblk);
195   - rcu_preempt_process_callbacks();
  187 + unsigned long work;
  188 + unsigned long morework;
  189 + unsigned long flags;
  190 +
  191 + for (;;) {
  192 + wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
  193 + morework = rcu_boost();
  194 + local_irq_save(flags);
  195 + work = have_rcu_kthread_work;
  196 + have_rcu_kthread_work = morework;
  197 + local_irq_restore(flags);
  198 + if (work) {
  199 + rcu_process_callbacks(&rcu_sched_ctrlblk);
  200 + rcu_process_callbacks(&rcu_bh_ctrlblk);
  201 + rcu_preempt_process_callbacks();
  202 + }
  203 + schedule_timeout_interruptible(1); /* Leave CPU for others. */
  204 + }
  205 +
  206 + return 0; /* Not reached, but needed to shut gcc up. */
196 207 }
197 208  
198 209 /*
  210 + * Wake up rcu_kthread() to process callbacks now eligible for invocation
  211 + * or to boost readers.
  212 + */
  213 +static void invoke_rcu_kthread(void)
  214 +{
  215 + unsigned long flags;
  216 +
  217 + local_irq_save(flags);
  218 + have_rcu_kthread_work = 1;
  219 + wake_up(&rcu_kthread_wq);
  220 + local_irq_restore(flags);
  221 +}
  222 +
  223 +/*
199 224 * Wait for a grace period to elapse. But it is illegal to invoke
200 225 * synchronize_sched() from within an RCU read-side critical section.
201 226 * Therefore, any legal call to synchronize_sched() is a quiescent
... ... @@ -230,6 +255,7 @@
230 255 local_irq_save(flags);
231 256 *rcp->curtail = head;
232 257 rcp->curtail = &head->next;
  258 + RCU_TRACE(rcp->qlen++);
233 259 local_irq_restore(flags);
234 260 }
235 261  
236 262  
237 263  
... ... @@ -282,8 +308,17 @@
282 308 }
283 309 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
284 310  
285   -void __init rcu_init(void)
  311 +/*
  312 + * Spawn the kthread that invokes RCU callbacks.
  313 + */
  314 +static int __init rcu_spawn_kthreads(void)
286 315 {
287   - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  316 + struct sched_param sp;
  317 +
  318 + rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
  319 + sp.sched_priority = RCU_BOOST_PRIO;
  320 + sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
  321 + return 0;
288 322 }
  323 +early_initcall(rcu_spawn_kthreads);
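
The softirq-to-kthread conversion above is a "pending-work flag plus sleeping
worker" handoff: invoke_rcu_kthread() sets have_rcu_kthread_work and wakes the
waitqueue, and rcu_kthread() drains the flag, invokes callbacks, and goes back
to sleep. A rough user-space analogue using POSIX threads, offered only to
illustrate the shape of the handoff; the kernel version disables interrupts
around the flag instead of taking a mutex, and runs the worker at SCHED_FIFO.
Build with -pthread.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int have_work;                   /* cf. have_rcu_kthread_work */

static void invoke_worker(void)         /* cf. invoke_rcu_kthread() */
{
        pthread_mutex_lock(&lock);
        have_work = 1;
        pthread_cond_signal(&cond);     /* cf. wake_up(&rcu_kthread_wq) */
        pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)          /* cf. rcu_kthread() */
{
        (void)arg;
        for (;;) {
                pthread_mutex_lock(&lock);
                while (!have_work)
                        pthread_cond_wait(&cond, &lock);
                have_work = 0;          /* claim the pending work */
                pthread_mutex_unlock(&lock);
                printf("invoking ready callbacks\n");
        }
        return NULL;
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
        invoke_worker();
        sleep(1);                       /* let the worker run once, then exit */
        return 0;
}
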
kernel/rcutiny_plugin.h
... ... @@ -22,6 +22,40 @@
22 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 23 */
24 24  
  25 +#include <linux/kthread.h>
  26 +#include <linux/debugfs.h>
  27 +#include <linux/seq_file.h>
  28 +
  29 +#ifdef CONFIG_RCU_TRACE
  30 +#define RCU_TRACE(stmt) stmt
  31 +#else /* #ifdef CONFIG_RCU_TRACE */
  32 +#define RCU_TRACE(stmt)
  33 +#endif /* #else #ifdef CONFIG_RCU_TRACE */
  34 +
  35 +/* Global control variables for rcupdate callback mechanism. */
  36 +struct rcu_ctrlblk {
  37 + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
  38 + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
  39 + struct rcu_head **curtail; /* ->next pointer of last CB. */
  40 + RCU_TRACE(long qlen); /* Number of pending CBs. */
  41 +};
  42 +
  43 +/* Definition for rcupdate control block. */
  44 +static struct rcu_ctrlblk rcu_sched_ctrlblk = {
  45 + .donetail = &rcu_sched_ctrlblk.rcucblist,
  46 + .curtail = &rcu_sched_ctrlblk.rcucblist,
  47 +};
  48 +
  49 +static struct rcu_ctrlblk rcu_bh_ctrlblk = {
  50 + .donetail = &rcu_bh_ctrlblk.rcucblist,
  51 + .curtail = &rcu_bh_ctrlblk.rcucblist,
  52 +};
  53 +
  54 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  55 +int rcu_scheduler_active __read_mostly;
  56 +EXPORT_SYMBOL_GPL(rcu_scheduler_active);
  57 +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  58 +
25 59 #ifdef CONFIG_TINY_PREEMPT_RCU
26 60  
27 61 #include <linux/delay.h>
28 62  
29 63  
... ... @@ -46,17 +80,45 @@
46 80 struct list_head *gp_tasks;
47 81 /* Pointer to the first task blocking the */
48 82 /* current grace period, or NULL if there */
49   - /* is not such task. */
  83 + /* is no such task. */
50 84 struct list_head *exp_tasks;
51 85 /* Pointer to first task blocking the */
52 86 /* current expedited grace period, or NULL */
53 87 /* if there is no such task. If there */
54 88 /* is no current expedited grace period, */
55 89 /* then there cannot be any such task. */
  90 +#ifdef CONFIG_RCU_BOOST
  91 + struct list_head *boost_tasks;
  92 + /* Pointer to first task that needs to be */
  93 + /* priority-boosted, or NULL if no priority */
  94 + /* boosting is needed. If there is no */
  95 + /* current or expedited grace period, there */
  96 + /* can be no such task. */
  97 +#endif /* #ifdef CONFIG_RCU_BOOST */
56 98 u8 gpnum; /* Current grace period. */
57 99 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 100 u8 completed; /* Last grace period completed. */
59 101 /* If all three are equal, RCU is idle. */
  102 +#ifdef CONFIG_RCU_BOOST
  103 + s8 boosted_this_gp; /* Has boosting already happened? */
  104 + unsigned long boost_time; /* When to start boosting (jiffies) */
  105 +#endif /* #ifdef CONFIG_RCU_BOOST */
  106 +#ifdef CONFIG_RCU_TRACE
  107 + unsigned long n_grace_periods;
  108 +#ifdef CONFIG_RCU_BOOST
  109 + unsigned long n_tasks_boosted;
  110 + unsigned long n_exp_boosts;
  111 + unsigned long n_normal_boosts;
  112 + unsigned long n_normal_balk_blkd_tasks;
  113 + unsigned long n_normal_balk_gp_tasks;
  114 + unsigned long n_normal_balk_boost_tasks;
  115 + unsigned long n_normal_balk_boosted;
  116 + unsigned long n_normal_balk_notyet;
  117 + unsigned long n_normal_balk_nos;
  118 + unsigned long n_exp_balk_blkd_tasks;
  119 + unsigned long n_exp_balk_nos;
  120 +#endif /* #ifdef CONFIG_RCU_BOOST */
  121 +#endif /* #ifdef CONFIG_RCU_TRACE */
60 122 };
61 123  
62 124 static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
... ... @@ -122,6 +184,210 @@
122 184 }
123 185  
124 186 /*
  187 + * Advance a ->blkd_tasks-list pointer to the next entry, returning
  188 + * NULL instead if at the end of the list.
  189 + */
  190 +static struct list_head *rcu_next_node_entry(struct task_struct *t)
  191 +{
  192 + struct list_head *np;
  193 +
  194 + np = t->rcu_node_entry.next;
  195 + if (np == &rcu_preempt_ctrlblk.blkd_tasks)
  196 + np = NULL;
  197 + return np;
  198 +}
  199 +
  200 +#ifdef CONFIG_RCU_TRACE
  201 +
  202 +#ifdef CONFIG_RCU_BOOST
  203 +static void rcu_initiate_boost_trace(void);
  204 +static void rcu_initiate_exp_boost_trace(void);
  205 +#endif /* #ifdef CONFIG_RCU_BOOST */
  206 +
  207 +/*
  208 + * Dump additional statistics for TINY_PREEMPT_RCU.
  209 + */
  210 +static void show_tiny_preempt_stats(struct seq_file *m)
  211 +{
  212 + seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
  213 + rcu_preempt_ctrlblk.rcb.qlen,
  214 + rcu_preempt_ctrlblk.n_grace_periods,
  215 + rcu_preempt_ctrlblk.gpnum,
  216 + rcu_preempt_ctrlblk.gpcpu,
  217 + rcu_preempt_ctrlblk.completed,
  218 + "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
  219 + "N."[!rcu_preempt_ctrlblk.gp_tasks],
  220 + "E."[!rcu_preempt_ctrlblk.exp_tasks]);
  221 +#ifdef CONFIG_RCU_BOOST
  222 + seq_printf(m, " ttb=%c btg=",
  223 + "B."[!rcu_preempt_ctrlblk.boost_tasks]);
  224 + switch (rcu_preempt_ctrlblk.boosted_this_gp) {
  225 + case -1:
  226 + seq_puts(m, "exp");
  227 + break;
  228 + case 0:
  229 + seq_puts(m, "no");
  230 + break;
  231 + case 1:
  232 + seq_puts(m, "begun");
  233 + break;
  234 + case 2:
  235 + seq_puts(m, "done");
  236 + break;
  237 + default:
  238 + seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
  239 + }
  240 + seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
  241 + rcu_preempt_ctrlblk.n_tasks_boosted,
  242 + rcu_preempt_ctrlblk.n_exp_boosts,
  243 + rcu_preempt_ctrlblk.n_normal_boosts,
  244 + (int)(jiffies & 0xffff),
  245 + (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
  246 + seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
  247 + "normal balk",
  248 + rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
  249 + rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
  250 + rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
  251 + rcu_preempt_ctrlblk.n_normal_balk_boosted,
  252 + rcu_preempt_ctrlblk.n_normal_balk_notyet,
  253 + rcu_preempt_ctrlblk.n_normal_balk_nos);
  254 + seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
  255 + rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
  256 + rcu_preempt_ctrlblk.n_exp_balk_nos);
  257 +#endif /* #ifdef CONFIG_RCU_BOOST */
  258 +}
  259 +
  260 +#endif /* #ifdef CONFIG_RCU_TRACE */
  261 +
  262 +#ifdef CONFIG_RCU_BOOST
  263 +
  264 +#include "rtmutex_common.h"
  265 +
  266 +/*
  267 + * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
  268 + * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
  269 + */
  270 +static int rcu_boost(void)
  271 +{
  272 + unsigned long flags;
  273 + struct rt_mutex mtx;
  274 + struct list_head *np;
  275 + struct task_struct *t;
  276 +
  277 + if (rcu_preempt_ctrlblk.boost_tasks == NULL)
  278 + return 0; /* Nothing to boost. */
  279 + raw_local_irq_save(flags);
  280 + rcu_preempt_ctrlblk.boosted_this_gp++;
  281 + t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
  282 + rcu_node_entry);
  283 + np = rcu_next_node_entry(t);
  284 + rt_mutex_init_proxy_locked(&mtx, t);
  285 + t->rcu_boost_mutex = &mtx;
  286 + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
  287 + raw_local_irq_restore(flags);
  288 + rt_mutex_lock(&mtx);
  289 + RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
  290 + rcu_preempt_ctrlblk.boosted_this_gp++;
  291 + rt_mutex_unlock(&mtx);
  292 + return rcu_preempt_ctrlblk.boost_tasks != NULL;
  293 +}
  294 +
  295 +/*
  296 + * Check to see if it is now time to start boosting RCU readers blocking
  297 + * the current grace period, and, if so, tell the rcu_kthread_task to
  298 + * start boosting them. If there is an expedited boost in progress,
  299 + * we wait for it to complete.
  300 + *
  301 + * If there are no blocked readers blocking the current grace period,
  302 + * return 0 to let the caller know, otherwise return 1. Note that this
  303 + * return value is independent of whether or not boosting was done.
  304 + */
  305 +static int rcu_initiate_boost(void)
  306 +{
  307 + if (!rcu_preempt_blocked_readers_cgp()) {
  308 + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
  309 + return 0;
  310 + }
  311 + if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
  312 + rcu_preempt_ctrlblk.boost_tasks == NULL &&
  313 + rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
  314 + ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
  315 + rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
  316 + invoke_rcu_kthread();
  317 + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
  318 + } else
  319 + RCU_TRACE(rcu_initiate_boost_trace());
  320 + return 1;
  321 +}
  322 +
  323 +/*
  324 + * Initiate boosting for an expedited grace period.
  325 + */
  326 +static void rcu_initiate_expedited_boost(void)
  327 +{
  328 + unsigned long flags;
  329 +
  330 + raw_local_irq_save(flags);
  331 + if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
  332 + rcu_preempt_ctrlblk.boost_tasks =
  333 + rcu_preempt_ctrlblk.blkd_tasks.next;
  334 + rcu_preempt_ctrlblk.boosted_this_gp = -1;
  335 + invoke_rcu_kthread();
  336 + RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
  337 + } else
  338 + RCU_TRACE(rcu_initiate_exp_boost_trace());
  339 + raw_local_irq_restore(flags);
  340 +}
  341 +
  342 +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
  343 +
  344 +/*
  345 + * Do priority-boost accounting for the start of a new grace period.
  346 + */
  347 +static void rcu_preempt_boost_start_gp(void)
  348 +{
  349 + rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
  350 + if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
  351 + rcu_preempt_ctrlblk.boosted_this_gp = 0;
  352 +}
  353 +
  354 +#else /* #ifdef CONFIG_RCU_BOOST */
  355 +
  356 +/*
  357 + * If there is no RCU priority boosting, we don't boost.
  358 + */
  359 +static int rcu_boost(void)
  360 +{
  361 + return 0;
  362 +}
  363 +
  364 +/*
  365 + * If there is no RCU priority boosting, we don't initiate boosting,
  366 + * but we do indicate whether there are blocked readers blocking the
  367 + * current grace period.
  368 + */
  369 +static int rcu_initiate_boost(void)
  370 +{
  371 + return rcu_preempt_blocked_readers_cgp();
  372 +}
  373 +
  374 +/*
  375 + * If there is no RCU priority boosting, we don't initiate expedited boosting.
  376 + */
  377 +static void rcu_initiate_expedited_boost(void)
  378 +{
  379 +}
  380 +
  381 +/*
  382 + * If there is no RCU priority boosting, nothing to do at grace-period start.
  383 + */
  384 +static void rcu_preempt_boost_start_gp(void)
  385 +{
  386 +}
  387 +
  388 +#endif /* else #ifdef CONFIG_RCU_BOOST */
  389 +
  390 +/*
125 391 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 392 * that this just means that the task currently running on the CPU is
127 393 * in a quiescent state. There might be any number of tasks blocked
128 394  
129 395  
... ... @@ -148,11 +414,14 @@
148 414 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 415 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150 416  
  417 + /* If there is no GP then there is nothing more to do. */
  418 + if (!rcu_preempt_gp_in_progress())
  419 + return;
151 420 /*
152   - * If there is no GP, or if blocked readers are still blocking GP,
153   - * then there is nothing more to do.
  421 + * Check up on boosting. If there are no readers blocking the
  422 + * current grace period, leave.
154 423 */
155   - if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
  424 + if (rcu_initiate_boost())
156 425 return;
157 426  
158 427 /* Advance callbacks. */
159 428  
... ... @@ -164,9 +433,9 @@
164 433 if (!rcu_preempt_blocked_readers_any())
165 434 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166 435  
167   - /* If there are done callbacks, make RCU_SOFTIRQ process them. */
  436 + /* If there are done callbacks, cause them to be invoked. */
168 437 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169   - raise_softirq(RCU_SOFTIRQ);
  438 + invoke_rcu_kthread();
170 439 }
171 440  
172 441 /*
173 442  
... ... @@ -178,12 +447,16 @@
178 447  
179 448 /* Official start of GP. */
180 449 rcu_preempt_ctrlblk.gpnum++;
  450 + RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
181 451  
182 452 /* Any blocked RCU readers block new GP. */
183 453 if (rcu_preempt_blocked_readers_any())
184 454 rcu_preempt_ctrlblk.gp_tasks =
185 455 rcu_preempt_ctrlblk.blkd_tasks.next;
186 456  
  457 + /* Set up for RCU priority boosting. */
  458 + rcu_preempt_boost_start_gp();
  459 +
187 460 /* If there is no running reader, CPU is done with GP. */
188 461 if (!rcu_preempt_running_reader())
189 462 rcu_preempt_cpu_qs();
190 463  
... ... @@ -304,14 +577,16 @@
304 577 */
305 578 empty = !rcu_preempt_blocked_readers_cgp();
306 579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307   - np = t->rcu_node_entry.next;
308   - if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309   - np = NULL;
  580 + np = rcu_next_node_entry(t);
310 581 list_del(&t->rcu_node_entry);
311 582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 583 rcu_preempt_ctrlblk.gp_tasks = np;
313 584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 585 rcu_preempt_ctrlblk.exp_tasks = np;
  586 +#ifdef CONFIG_RCU_BOOST
  587 + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
  588 + rcu_preempt_ctrlblk.boost_tasks = np;
  589 +#endif /* #ifdef CONFIG_RCU_BOOST */
315 590 INIT_LIST_HEAD(&t->rcu_node_entry);
316 591  
317 592 /*
... ... @@ -331,6 +606,14 @@
331 606 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 607 rcu_report_exp_done();
333 608 }
  609 +#ifdef CONFIG_RCU_BOOST
  610 + /* Unboost self if was boosted. */
  611 + if (special & RCU_READ_UNLOCK_BOOSTED) {
  612 + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
  613 + rt_mutex_unlock(t->rcu_boost_mutex);
  614 + t->rcu_boost_mutex = NULL;
  615 + }
  616 +#endif /* #ifdef CONFIG_RCU_BOOST */
334 617 local_irq_restore(flags);
335 618 }
336 619  
... ... @@ -374,7 +657,7 @@
374 657 rcu_preempt_cpu_qs();
375 658 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 659 rcu_preempt_ctrlblk.rcb.donetail)
377   - raise_softirq(RCU_SOFTIRQ);
  660 + invoke_rcu_kthread();
378 661 if (rcu_preempt_gp_in_progress() &&
379 662 rcu_cpu_blocking_cur_gp() &&
380 663 rcu_preempt_running_reader())
... ... @@ -383,7 +666,7 @@
383 666  
384 667 /*
385 668 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386   - * update, so this is invoked from __rcu_process_callbacks() to
  669 + * update, so this is invoked from rcu_process_callbacks() to
387 670 * handle that case. Of course, it is invoked for all flavors of
388 671 * RCU, but RCU callbacks can appear only on one of the lists, and
389 672 * neither ->nexttail nor ->donetail can possibly be NULL, so there
... ... @@ -400,7 +683,7 @@
400 683 */
401 684 static void rcu_preempt_process_callbacks(void)
402 685 {
403   - __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
  686 + rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404 687 }
405 688  
406 689 /*
... ... @@ -417,6 +700,7 @@
417 700 local_irq_save(flags);
418 701 *rcu_preempt_ctrlblk.nexttail = head;
419 702 rcu_preempt_ctrlblk.nexttail = &head->next;
  703 + RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
420 704 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 705 local_irq_restore(flags);
422 706 }
... ... @@ -532,6 +816,7 @@
532 816  
533 817 /* Wait for tail of ->blkd_tasks list to drain. */
534 818 if (rcu_preempted_readers_exp())
  819 + rcu_initiate_expedited_boost();
535 820 wait_event(sync_rcu_preempt_exp_wq,
536 821 !rcu_preempted_readers_exp());
537 822  
538 823  
... ... @@ -572,7 +857,28 @@
572 857  
573 858 #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574 859  
  860 +#ifdef CONFIG_RCU_TRACE
  861 +
575 862 /*
  863 + * Because preemptible RCU does not exist, it is not necessary to
  864 + * dump out its statistics.
  865 + */
  866 +static void show_tiny_preempt_stats(struct seq_file *m)
  867 +{
  868 +}
  869 +
  870 +#endif /* #ifdef CONFIG_RCU_TRACE */
  871 +
  872 +/*
  873 + * Because preemptible RCU does not exist, it is never necessary to
  874 + * boost preempted RCU readers.
  875 + */
  876 +static int rcu_boost(void)
  877 +{
  878 + return 0;
  879 +}
  880 +
  881 +/*
576 882 * Because preemptible RCU does not exist, it never has any callbacks
577 883 * to check.
578 884 */
579 885  
580 886  
... ... @@ -599,18 +905,117 @@
599 905 #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600 906  
601 907 #ifdef CONFIG_DEBUG_LOCK_ALLOC
602   -
603 908 #include <linux/kernel_stat.h>
604 909  
605 910 /*
606 911 * During boot, we forgive RCU lockdep issues. After this function is
607 912 * invoked, we start taking RCU lockdep issues seriously.
608 913 */
609   -void rcu_scheduler_starting(void)
  914 +void __init rcu_scheduler_starting(void)
610 915 {
611 916 WARN_ON(nr_context_switches() > 0);
612 917 rcu_scheduler_active = 1;
613 918 }
614 919  
615 920 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  921 +
  922 +#ifdef CONFIG_RCU_BOOST
  923 +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
  924 +#else /* #ifdef CONFIG_RCU_BOOST */
  925 +#define RCU_BOOST_PRIO 1
  926 +#endif /* #else #ifdef CONFIG_RCU_BOOST */
  927 +
  928 +#ifdef CONFIG_RCU_TRACE
  929 +
  930 +#ifdef CONFIG_RCU_BOOST
  931 +
  932 +static void rcu_initiate_boost_trace(void)
  933 +{
  934 + if (rcu_preempt_ctrlblk.gp_tasks == NULL)
  935 + rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
  936 + else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
  937 + rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
  938 + else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
  939 + rcu_preempt_ctrlblk.n_normal_balk_boosted++;
  940 + else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
  941 + rcu_preempt_ctrlblk.n_normal_balk_notyet++;
  942 + else
  943 + rcu_preempt_ctrlblk.n_normal_balk_nos++;
  944 +}
  945 +
  946 +static void rcu_initiate_exp_boost_trace(void)
  947 +{
  948 + if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
  949 + rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
  950 + else
  951 + rcu_preempt_ctrlblk.n_exp_balk_nos++;
  952 +}
  953 +
  954 +#endif /* #ifdef CONFIG_RCU_BOOST */
  955 +
  956 +static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
  957 +{
  958 + unsigned long flags;
  959 +
  960 + raw_local_irq_save(flags);
  961 + rcp->qlen -= n;
  962 + raw_local_irq_restore(flags);
  963 +}
  964 +
  965 +/*
  966 + * Dump statistics for TINY_RCU, such as they are.
  967 + */
  968 +static int show_tiny_stats(struct seq_file *m, void *unused)
  969 +{
  970 + show_tiny_preempt_stats(m);
  971 + seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
  972 + seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
  973 + return 0;
  974 +}
  975 +
  976 +static int show_tiny_stats_open(struct inode *inode, struct file *file)
  977 +{
  978 + return single_open(file, show_tiny_stats, NULL);
  979 +}
  980 +
  981 +static const struct file_operations show_tiny_stats_fops = {
  982 + .owner = THIS_MODULE,
  983 + .open = show_tiny_stats_open,
  984 + .read = seq_read,
  985 + .llseek = seq_lseek,
  986 + .release = single_release,
  987 +};
  988 +
  989 +static struct dentry *rcudir;
  990 +
  991 +static int __init rcutiny_trace_init(void)
  992 +{
  993 + struct dentry *retval;
  994 +
  995 + rcudir = debugfs_create_dir("rcu", NULL);
  996 + if (!rcudir)
  997 + goto free_out;
  998 + retval = debugfs_create_file("rcudata", 0444, rcudir,
  999 + NULL, &show_tiny_stats_fops);
  1000 + if (!retval)
  1001 + goto free_out;
  1002 + return 0;
  1003 +free_out:
  1004 + debugfs_remove_recursive(rcudir);
  1005 + return 1;
  1006 +}
  1007 +
  1008 +static void __exit rcutiny_trace_cleanup(void)
  1009 +{
  1010 + debugfs_remove_recursive(rcudir);
  1011 +}
  1012 +
  1013 +module_init(rcutiny_trace_init);
  1014 +module_exit(rcutiny_trace_cleanup);
  1015 +
  1016 +MODULE_AUTHOR("Paul E. McKenney");
  1017 +MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
  1018 +MODULE_LICENSE("GPL");
  1019 +
  1020 +#endif /* #ifdef CONFIG_RCU_TRACE */
kernel/rcutorture.c
... ... @@ -47,6 +47,7 @@
47 47 #include <linux/srcu.h>
48 48 #include <linux/slab.h>
49 49 #include <asm/byteorder.h>
  50 +#include <linux/sched.h>
50 51  
51 52 MODULE_LICENSE("GPL");
52 53 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
... ... @@ -64,6 +65,9 @@
64 65 static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65 66 static int fqs_holdoff = 0; /* Hold time within burst (us). */
66 67 static int fqs_stutter = 3; /* Wait time between bursts (s). */
  68 +static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
  69 +static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
  70 +static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
67 71 static char *torture_type = "rcu"; /* What RCU implementation to torture. */
68 72  
69 73 module_param(nreaders, int, 0444);
... ... @@ -88,6 +92,12 @@
88 92 MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89 93 module_param(fqs_stutter, int, 0444);
90 94 MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
  95 +module_param(test_boost, int, 0444);
  96 +MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
  97 +module_param(test_boost_interval, int, 0444);
  98 +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
  99 +module_param(test_boost_duration, int, 0444);
  100 +MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
91 101 module_param(torture_type, charp, 0444);
92 102 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
93 103  
... ... @@ -109,6 +119,7 @@
109 119 static struct task_struct *shuffler_task;
110 120 static struct task_struct *stutter_task;
111 121 static struct task_struct *fqs_task;
  122 +static struct task_struct *boost_tasks[NR_CPUS];
112 123  
113 124 #define RCU_TORTURE_PIPE_LEN 10
114 125  
... ... @@ -134,6 +145,12 @@
134 145 static atomic_t n_rcu_torture_free;
135 146 static atomic_t n_rcu_torture_mberror;
136 147 static atomic_t n_rcu_torture_error;
  148 +static long n_rcu_torture_boost_ktrerror;
  149 +static long n_rcu_torture_boost_rterror;
  150 +static long n_rcu_torture_boost_allocerror;
  151 +static long n_rcu_torture_boost_afferror;
  152 +static long n_rcu_torture_boost_failure;
  153 +static long n_rcu_torture_boosts;
137 154 static long n_rcu_torture_timers;
138 155 static struct list_head rcu_torture_removed;
139 156 static cpumask_var_t shuffle_tmp_mask;
... ... @@ -147,6 +164,16 @@
147 164 #endif
148 165 int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
149 166  
  167 +#ifdef CONFIG_RCU_BOOST
  168 +#define rcu_can_boost() 1
  169 +#else /* #ifdef CONFIG_RCU_BOOST */
  170 +#define rcu_can_boost() 0
  171 +#endif /* #else #ifdef CONFIG_RCU_BOOST */
  172 +
  173 +static unsigned long boost_starttime; /* jiffies of next boost test start. */
  174 +DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
  175 + /* and boost task create/destroy. */
  176 +
150 177 /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
151 178  
152 179 #define FULLSTOP_DONTSTOP 0 /* Normal operation. */
... ... @@ -277,6 +304,7 @@
277 304 void (*fqs)(void);
278 305 int (*stats)(char *page);
279 306 int irq_capable;
  307 + int can_boost;
280 308 char *name;
281 309 };
282 310  
... ... @@ -366,6 +394,7 @@
366 394 .fqs = rcu_force_quiescent_state,
367 395 .stats = NULL,
368 396 .irq_capable = 1,
  397 + .can_boost = rcu_can_boost(),
369 398 .name = "rcu"
370 399 };
371 400  
... ... @@ -408,6 +437,7 @@
408 437 .fqs = rcu_force_quiescent_state,
409 438 .stats = NULL,
410 439 .irq_capable = 1,
  440 + .can_boost = rcu_can_boost(),
411 441 .name = "rcu_sync"
412 442 };
413 443  
... ... @@ -424,6 +454,7 @@
424 454 .fqs = rcu_force_quiescent_state,
425 455 .stats = NULL,
426 456 .irq_capable = 1,
  457 + .can_boost = rcu_can_boost(),
427 458 .name = "rcu_expedited"
428 459 };
429 460  
... ... @@ -684,6 +715,110 @@
684 715 };
685 716  
686 717 /*
  718 + * RCU torture priority-boost testing. Runs one real-time thread per
  719 + * CPU for moderate bursts, repeatedly registering RCU callbacks and
  720 + * spinning waiting for them to be invoked. If a given callback takes
  721 + * too long to be invoked, we assume that priority inversion has occurred.
  722 + */
  723 +
  724 +struct rcu_boost_inflight {
  725 + struct rcu_head rcu;
  726 + int inflight;
  727 +};
  728 +
  729 +static void rcu_torture_boost_cb(struct rcu_head *head)
  730 +{
  731 + struct rcu_boost_inflight *rbip =
  732 + container_of(head, struct rcu_boost_inflight, rcu);
  733 +
  734 + smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
  735 + rbip->inflight = 0;
  736 +}
  737 +
  738 +static int rcu_torture_boost(void *arg)
  739 +{
  740 + unsigned long call_rcu_time;
  741 + unsigned long endtime;
  742 + unsigned long oldstarttime;
  743 + struct rcu_boost_inflight rbi = { .inflight = 0 };
  744 + struct sched_param sp;
  745 +
  746 + VERBOSE_PRINTK_STRING("rcu_torture_boost started");
  747 +
  748 + /* Set real-time priority. */
  749 + sp.sched_priority = 1;
  750 + if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
  751 + VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
  752 + n_rcu_torture_boost_rterror++;
  753 + }
  754 +
  755 + /* Each pass through the following loop does one boost-test cycle. */
  756 + do {
  757 + /* Wait for the next test interval. */
  758 + oldstarttime = boost_starttime;
  759 + while (jiffies - oldstarttime > ULONG_MAX / 2) {
  760 + schedule_timeout_uninterruptible(1);
  761 + rcu_stutter_wait("rcu_torture_boost");
  762 + if (kthread_should_stop() ||
  763 + fullstop != FULLSTOP_DONTSTOP)
  764 + goto checkwait;
  765 + }
  766 +
  767 + /* Do one boost-test interval. */
  768 + endtime = oldstarttime + test_boost_duration * HZ;
  769 + call_rcu_time = jiffies;
  770 + while (jiffies - endtime > ULONG_MAX / 2) {
  771 + /* If we don't have a callback in flight, post one. */
  772 + if (!rbi.inflight) {
  773 + smp_mb(); /* RCU core before ->inflight = 1. */
  774 + rbi.inflight = 1;
  775 + call_rcu(&rbi.rcu, rcu_torture_boost_cb);
  776 + if (jiffies - call_rcu_time >
  777 + test_boost_duration * HZ - HZ / 2) {
  778 + VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
  779 + n_rcu_torture_boost_failure++;
  780 + }
  781 + call_rcu_time = jiffies;
  782 + }
  783 + cond_resched();
  784 + rcu_stutter_wait("rcu_torture_boost");
  785 + if (kthread_should_stop() ||
  786 + fullstop != FULLSTOP_DONTSTOP)
  787 + goto checkwait;
  788 + }
  789 +
  790 + /*
  791 + * Set the start time of the next test interval.
  792 + * Yes, this is vulnerable to long delays, but such
  793 + * delays simply cause a false negative for the next
  794 + * interval. Besides, we are running at RT priority,
  795 + * so delays should be relatively rare.
  796 + */
  797 + while (oldstarttime == boost_starttime) {
  798 + if (mutex_trylock(&boost_mutex)) {
  799 + boost_starttime = jiffies +
  800 + test_boost_interval * HZ;
  801 + n_rcu_torture_boosts++;
  802 + mutex_unlock(&boost_mutex);
  803 + break;
  804 + }
  805 + schedule_timeout_uninterruptible(1);
  806 + }
  807 +
  808 + /* Go do the stutter. */
  809 +checkwait: rcu_stutter_wait("rcu_torture_boost");
  810 + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
  811 +
  812 + /* Clean up and exit. */
  813 + VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
  814 + rcutorture_shutdown_absorb("rcu_torture_boost");
  815 + while (!kthread_should_stop() || rbi.inflight)
  816 + schedule_timeout_uninterruptible(1);
  817 + smp_mb(); /* order accesses to ->inflight before stack-frame death. */
  818 + return 0;
  819 +}
  820 +
  821 +/*
687 822 * RCU torture force-quiescent-state kthread. Repeatedly induces
688 823 * bursts of calls to force_quiescent_state(), increasing the probability
689 824 * of occurrence of some important types of race conditions.
... ... @@ -933,7 +1068,8 @@
933 1068 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
934 1069 cnt += sprintf(&page[cnt],
935 1070 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
936   - "rtmbe: %d nt: %ld",
  1071 + "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
  1072 + "rtbf: %ld rtb: %ld nt: %ld",
937 1073 rcu_torture_current,
938 1074 rcu_torture_current_version,
939 1075 list_empty(&rcu_torture_freelist),
940 1076  
... ... @@ -941,8 +1077,19 @@
941 1077 atomic_read(&n_rcu_torture_alloc_fail),
942 1078 atomic_read(&n_rcu_torture_free),
943 1079 atomic_read(&n_rcu_torture_mberror),
  1080 + n_rcu_torture_boost_ktrerror,
  1081 + n_rcu_torture_boost_rterror,
  1082 + n_rcu_torture_boost_allocerror,
  1083 + n_rcu_torture_boost_afferror,
  1084 + n_rcu_torture_boost_failure,
  1085 + n_rcu_torture_boosts,
944 1086 n_rcu_torture_timers);
945   - if (atomic_read(&n_rcu_torture_mberror) != 0)
  1087 + if (atomic_read(&n_rcu_torture_mberror) != 0 ||
  1088 + n_rcu_torture_boost_ktrerror != 0 ||
  1089 + n_rcu_torture_boost_rterror != 0 ||
  1090 + n_rcu_torture_boost_allocerror != 0 ||
  1091 + n_rcu_torture_boost_afferror != 0 ||
  1092 + n_rcu_torture_boost_failure != 0)
946 1093 cnt += sprintf(&page[cnt], " !!!");
947 1094 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
948 1095 if (i > 1) {
949 1096  
950 1097  
951 1098  
952 1099  
... ... @@ -1094,22 +1241,91 @@
1094 1241 }
1095 1242  
1096 1243 static inline void
1097   -rcu_torture_print_module_parms(char *tag)
  1244 +rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1098 1245 {
1099 1246 printk(KERN_ALERT "%s" TORTURE_FLAG
1100 1247 "--- %s: nreaders=%d nfakewriters=%d "
1101 1248 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1102 1249 "shuffle_interval=%d stutter=%d irqreader=%d "
1103   - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
  1250 + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
  1251 + "test_boost=%d/%d test_boost_interval=%d "
  1252 + "test_boost_duration=%d\n",
1104 1253 torture_type, tag, nrealreaders, nfakewriters,
1105 1254 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1106   - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
  1255 + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
  1256 + test_boost, cur_ops->can_boost,
  1257 + test_boost_interval, test_boost_duration);
1107 1258 }
1108 1259  
1109   -static struct notifier_block rcutorture_nb = {
  1260 +static struct notifier_block rcutorture_shutdown_nb = {
1110 1261 .notifier_call = rcutorture_shutdown_notify,
1111 1262 };
1112 1263  
  1264 +static void rcutorture_booster_cleanup(int cpu)
  1265 +{
  1266 + struct task_struct *t;
  1267 +
  1268 + if (boost_tasks[cpu] == NULL)
  1269 + return;
  1270 + mutex_lock(&boost_mutex);
  1271 + VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
  1272 + t = boost_tasks[cpu];
  1273 + boost_tasks[cpu] = NULL;
  1274 + mutex_unlock(&boost_mutex);
  1275 +
  1276 + /* This must be outside of the mutex, otherwise deadlock! */
  1277 + kthread_stop(t);
  1278 +}
  1279 +
  1280 +static int rcutorture_booster_init(int cpu)
  1281 +{
  1282 + int retval;
  1283 +
  1284 + if (boost_tasks[cpu] != NULL)
  1285 + return 0; /* Already created, nothing more to do. */
  1286 +
  1287 + /* Don't allow time recalculation while creating a new task. */
  1288 + mutex_lock(&boost_mutex);
  1289 + VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
  1290 + boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
  1291 + "rcu_torture_boost");
  1292 + if (IS_ERR(boost_tasks[cpu])) {
  1293 + retval = PTR_ERR(boost_tasks[cpu]);
  1294 + VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
  1295 + n_rcu_torture_boost_ktrerror++;
  1296 + boost_tasks[cpu] = NULL;
  1297 + mutex_unlock(&boost_mutex);
  1298 + return retval;
  1299 + }
  1300 + kthread_bind(boost_tasks[cpu], cpu);
  1301 + wake_up_process(boost_tasks[cpu]);
  1302 + mutex_unlock(&boost_mutex);
  1303 + return 0;
  1304 +}
  1305 +
  1306 +static int rcutorture_cpu_notify(struct notifier_block *self,
  1307 + unsigned long action, void *hcpu)
  1308 +{
  1309 + long cpu = (long)hcpu;
  1310 +
  1311 + switch (action) {
  1312 + case CPU_ONLINE:
  1313 + case CPU_DOWN_FAILED:
  1314 + (void)rcutorture_booster_init(cpu);
  1315 + break;
  1316 + case CPU_DOWN_PREPARE:
  1317 + rcutorture_booster_cleanup(cpu);
  1318 + break;
  1319 + default:
  1320 + break;
  1321 + }
  1322 + return NOTIFY_OK;
  1323 +}
  1324 +
  1325 +static struct notifier_block rcutorture_cpu_nb = {
  1326 + .notifier_call = rcutorture_cpu_notify,
  1327 +};
  1328 +
1113 1329 static void
1114 1330 rcu_torture_cleanup(void)
1115 1331 {
... ... @@ -1127,7 +1343,7 @@
1127 1343 }
1128 1344 fullstop = FULLSTOP_RMMOD;
1129 1345 mutex_unlock(&fullstop_mutex);
1130   - unregister_reboot_notifier(&rcutorture_nb);
  1346 + unregister_reboot_notifier(&rcutorture_shutdown_nb);
1131 1347 if (stutter_task) {
1132 1348 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1133 1349 kthread_stop(stutter_task);
... ... @@ -1184,6 +1400,12 @@
1184 1400 kthread_stop(fqs_task);
1185 1401 }
1186 1402 fqs_task = NULL;
  1403 + if ((test_boost == 1 && cur_ops->can_boost) ||
  1404 + test_boost == 2) {
  1405 + unregister_cpu_notifier(&rcutorture_cpu_nb);
  1406 + for_each_possible_cpu(i)
  1407 + rcutorture_booster_cleanup(i);
  1408 + }
1187 1409  
1188 1410 /* Wait for all RCU callbacks to fire. */
1189 1411  
1190 1412  
... ... @@ -1195,9 +1417,9 @@
1195 1417 if (cur_ops->cleanup)
1196 1418 cur_ops->cleanup();
1197 1419 if (atomic_read(&n_rcu_torture_error))
1198   - rcu_torture_print_module_parms("End of test: FAILURE");
  1420 + rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1199 1421 else
1200   - rcu_torture_print_module_parms("End of test: SUCCESS");
  1422 + rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1201 1423 }
1202 1424  
1203 1425 static int __init
... ... @@ -1242,7 +1464,7 @@
1242 1464 nrealreaders = nreaders;
1243 1465 else
1244 1466 nrealreaders = 2 * num_online_cpus();
1245   - rcu_torture_print_module_parms("Start of test");
  1467 + rcu_torture_print_module_parms(cur_ops, "Start of test");
1246 1468 fullstop = FULLSTOP_DONTSTOP;
1247 1469  
1248 1470 /* Set up the freelist. */
... ... @@ -1263,6 +1485,12 @@
1263 1485 atomic_set(&n_rcu_torture_free, 0);
1264 1486 atomic_set(&n_rcu_torture_mberror, 0);
1265 1487 atomic_set(&n_rcu_torture_error, 0);
  1488 + n_rcu_torture_boost_ktrerror = 0;
  1489 + n_rcu_torture_boost_rterror = 0;
  1490 + n_rcu_torture_boost_allocerror = 0;
  1491 + n_rcu_torture_boost_afferror = 0;
  1492 + n_rcu_torture_boost_failure = 0;
  1493 + n_rcu_torture_boosts = 0;
1266 1494 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1267 1495 atomic_set(&rcu_torture_wcount[i], 0);
1268 1496 for_each_possible_cpu(cpu) {
... ... @@ -1376,7 +1604,27 @@
1376 1604 goto unwind;
1377 1605 }
1378 1606 }
1379   - register_reboot_notifier(&rcutorture_nb);
  1607 + if (test_boost_interval < 1)
  1608 + test_boost_interval = 1;
  1609 + if (test_boost_duration < 2)
  1610 + test_boost_duration = 2;
  1611 + if ((test_boost == 1 && cur_ops->can_boost) ||
  1612 + test_boost == 2) {
  1613 + int retval;
  1614 +
  1615 + boost_starttime = jiffies + test_boost_interval * HZ;
  1616 + register_cpu_notifier(&rcutorture_cpu_nb);
  1617 + for_each_possible_cpu(i) {
  1618 + if (cpu_is_offline(i))
  1619 + continue; /* Heuristic: CPU can go offline. */
  1620 + retval = rcutorture_booster_init(i);
  1621 + if (retval < 0) {
  1622 + firsterr = retval;
  1623 + goto unwind;
  1624 + }
  1625 + }
  1626 + }
  1627 + register_reboot_notifier(&rcutorture_shutdown_nb);
1380 1628 mutex_unlock(&fullstop_mutex);
1381 1629 return 0;
1382 1630  
kernel/rcutree.c
... ... @@ -67,9 +67,6 @@
67 67 .gpnum = -300, \
68 68 .completed = -300, \
69 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70   - .orphan_cbs_list = NULL, \
71   - .orphan_cbs_tail = &structname.orphan_cbs_list, \
72   - .orphan_qlen = 0, \
73 70 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 71 .n_force_qs = 0, \
75 72 .n_force_qs_ngp = 0, \
76 73  
... ... @@ -620,9 +617,17 @@
620 617 static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
621 618 {
622 619 if (rdp->gpnum != rnp->gpnum) {
623   - rdp->qs_pending = 1;
624   - rdp->passed_quiesc = 0;
  620 + /*
  621 + * If the current grace period is waiting for this CPU,
  622 + * set up to detect a quiescent state, otherwise don't
  623 + * go looking for one.
  624 + */
625 625 rdp->gpnum = rnp->gpnum;
  626 + if (rnp->qsmask & rdp->grpmask) {
  627 + rdp->qs_pending = 1;
  628 + rdp->passed_quiesc = 0;
  629 + } else
  630 + rdp->qs_pending = 0;
626 631 }
627 632 }
628 633  
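
The rewritten __note_new_gpnum() arms quiescent-state detection only when the leaf rcu_node is actually waiting on this CPU, that is, when the CPU's bit is set in ->qsmask. A minimal standalone sketch of that bitmask test (plain C with made-up mask values, not the kernel's data structures):

    #include <stdio.h>

    /* Sketch: should this CPU go looking for a quiescent state?
     * qsmask: leaf node's "still waiting on these CPUs" bits.
     * grpmask: this CPU's bit within that leaf node.
     */
    static int needs_quiescent_state(unsigned long qsmask, unsigned long grpmask)
    {
            return (qsmask & grpmask) != 0;
    }

    int main(void)
    {
            unsigned long qsmask = 0x5;     /* hypothetically waiting on CPUs 0 and 2 */

            printf("CPU0: %d\n", needs_quiescent_state(qsmask, 1UL << 0));  /* 1 */
            printf("CPU1: %d\n", needs_quiescent_state(qsmask, 1UL << 1));  /* 0 */
            return 0;
    }
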
... ... @@ -681,6 +686,24 @@
681 686  
682 687 /* Remember that we saw this grace-period completion. */
683 688 rdp->completed = rnp->completed;
  689 +
  690 + /*
  691 + * If we were in an extended quiescent state, we may have
  692 + * missed some grace periods that other CPUs handled on
  693 + * our behalf. Catch up with this state to avoid noting
  694 + * spurious new grace periods. If another grace period
  695 + * has started, then rnp->gpnum will have advanced, so
  696 + * we will detect this later on.
  697 + */
  698 + if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
  699 + rdp->gpnum = rdp->completed;
  700 +
  701 + /*
  702 + * If RCU does not need a quiescent state from this CPU,
  703 + * then make sure that this CPU doesn't go looking for one.
  704 + */
  705 + if ((rnp->qsmask & rdp->grpmask) == 0)
  706 + rdp->qs_pending = 0;
684 707 }
685 708 }
686 709  
687 710  
688 711  
689 712  
690 713  
691 714  
692 715  
693 716  
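
The gpnum catch-up relies on a wraparound-safe counter comparison. The sketch below assumes ULONG_CMP_LT() is defined as in include/linux/rcupdate.h of this era, namely ULONG_MAX / 2 < (a) - (b); the counter values are purely illustrative:

    #include <limits.h>
    #include <stdio.h>

    /* Wraparound-safe "a is before b" for unsigned counters (assumed
     * definition, modeled on the kernel's ULONG_CMP_LT()). */
    #define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

    int main(void)
    {
            unsigned long completed = 1000, gpnum = 995;    /* CPU slept through 5 GPs */

            if (ULONG_CMP_LT(gpnum, completed))
                    gpnum = completed;      /* catch up, avoid noting spurious new GPs */
            printf("gpnum now %lu\n", gpnum);

            /* The comparison stays correct across counter wrap. */
            printf("%d\n", ULONG_CMP_LT(ULONG_MAX - 2, 3)); /* prints 1 */
            return 0;
    }
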
... ... @@ -984,56 +1007,34 @@
984 1007 #ifdef CONFIG_HOTPLUG_CPU
985 1008  
986 1009 /*
987   - * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
988   - * specified flavor of RCU. The callbacks will be adopted by the next
989   - * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
990   - * comes first. Because this is invoked from the CPU_DYING notifier,
991   - * irqs are already disabled.
  1010 + * Move a dying CPU's RCU callbacks to online CPU's callback list.
  1011 + * Synchronization is not required because this function executes
  1012 + * in stop_machine() context.
992 1013 */
993   -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
  1014 +static void rcu_send_cbs_to_online(struct rcu_state *rsp)
994 1015 {
995 1016 int i;
  1017 + /* current DYING CPU is cleared in the cpu_online_mask */
  1018 + int receive_cpu = cpumask_any(cpu_online_mask);
996 1019 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
  1020 + struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
997 1021  
998 1022 if (rdp->nxtlist == NULL)
999 1023 return; /* irqs disabled, so comparison is stable. */
1000   - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1001   - *rsp->orphan_cbs_tail = rdp->nxtlist;
1002   - rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
  1024 +
  1025 + *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
  1026 + receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
  1027 + receive_rdp->qlen += rdp->qlen;
  1028 + receive_rdp->n_cbs_adopted += rdp->qlen;
  1029 + rdp->n_cbs_orphaned += rdp->qlen;
  1030 +
1003 1031 rdp->nxtlist = NULL;
1004 1032 for (i = 0; i < RCU_NEXT_SIZE; i++)
1005 1033 rdp->nxttail[i] = &rdp->nxtlist;
1006   - rsp->orphan_qlen += rdp->qlen;
1007   - rdp->n_cbs_orphaned += rdp->qlen;
1008 1034 rdp->qlen = 0;
1009   - raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1010 1035 }
1011 1036  
1012 1037 /*
1013   - * Adopt previously orphaned RCU callbacks.
1014   - */
1015   -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1016   -{
1017   - unsigned long flags;
1018   - struct rcu_data *rdp;
1019   -
1020   - raw_spin_lock_irqsave(&rsp->onofflock, flags);
1021   - rdp = this_cpu_ptr(rsp->rda);
1022   - if (rsp->orphan_cbs_list == NULL) {
1023   - raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1024   - return;
1025   - }
1026   - *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
1027   - rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
1028   - rdp->qlen += rsp->orphan_qlen;
1029   - rdp->n_cbs_adopted += rsp->orphan_qlen;
1030   - rsp->orphan_cbs_list = NULL;
1031   - rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
1032   - rsp->orphan_qlen = 0;
1033   - raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1034   -}
1035   -
1036   -/*
1037 1038 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1038 1039 * and move all callbacks from the outgoing CPU to the current one.
1039 1040 */
... ... @@ -1081,8 +1082,6 @@
1081 1082 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1082 1083 if (need_report & RCU_OFL_TASKS_EXP_GP)
1083 1084 rcu_report_exp_rnp(rsp, rnp);
1084   -
1085   - rcu_adopt_orphan_cbs(rsp);
1086 1085 }
1087 1086  
1088 1087 /*
1089 1088  
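
rcu_send_cbs_to_online() splices the dying CPU's whole callback list onto the receiver in constant time by reusing the head/tail-pointer representation. A self-contained sketch of that splice, simplified to a single list per queue instead of the kernel's segmented nxttail[] array:

    #include <stdio.h>

    struct cb {
            struct cb *next;
            int id;
    };

    /* Simplified per-CPU callback queue: head pointer plus a tail pointer
     * that addresses the last ->next field (or the head when empty). */
    struct cbq {
            struct cb *head;
            struct cb **tail;
            long qlen;
    };

    static void cbq_init(struct cbq *q)
    {
            q->head = NULL;
            q->tail = &q->head;
            q->qlen = 0;
    }

    static void cbq_enqueue(struct cbq *q, struct cb *cb)
    {
            cb->next = NULL;
            *q->tail = cb;
            q->tail = &cb->next;
            q->qlen++;
    }

    /* O(1) splice of everything in "dying" onto "recv", mirroring the
     * list handling in rcu_send_cbs_to_online(). */
    static void cbq_splice(struct cbq *recv, struct cbq *dying)
    {
            if (dying->head == NULL)
                    return;
            *recv->tail = dying->head;
            recv->tail = dying->tail;
            recv->qlen += dying->qlen;
            cbq_init(dying);
    }

    int main(void)
    {
            struct cbq online, dying;
            struct cb cbs[4];
            int i;

            cbq_init(&online);
            cbq_init(&dying);
            for (i = 0; i < 4; i++) {
                    cbs[i].id = i;
                    cbq_enqueue(i < 2 ? &online : &dying, &cbs[i]);
            }
            cbq_splice(&online, &dying);
            for (struct cb *p = online.head; p != NULL; p = p->next)
                    printf("cb %d\n", p->id);
            printf("qlen=%ld\n", online.qlen);      /* 4 */
            return 0;
    }
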
... ... @@ -1100,14 +1099,10 @@
1100 1099  
1101 1100 #else /* #ifdef CONFIG_HOTPLUG_CPU */
1102 1101  
1103   -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
  1102 +static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1104 1103 {
1105 1104 }
1106 1105  
1107   -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1108   -{
1109   -}
1110   -
1111 1106 static void rcu_offline_cpu(int cpu)
1112 1107 {
1113 1108 }
1114 1109  
... ... @@ -1440,22 +1435,11 @@
1440 1435 */
1441 1436 local_irq_save(flags);
1442 1437 rdp = this_cpu_ptr(rsp->rda);
1443   - rcu_process_gp_end(rsp, rdp);
1444   - check_for_new_grace_period(rsp, rdp);
1445 1438  
1446 1439 /* Add the callback to our list. */
1447 1440 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1448 1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1449 1442  
1450   - /* Start a new grace period if one not already started. */
1451   - if (!rcu_gp_in_progress(rsp)) {
1452   - unsigned long nestflag;
1453   - struct rcu_node *rnp_root = rcu_get_root(rsp);
1454   -
1455   - raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1456   - rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1457   - }
1458   -
1459 1443 /*
1460 1444 * Force the grace period if too many callbacks or too long waiting.
1461 1445 * Enforce hysteresis, and don't invoke force_quiescent_state()
... ... @@ -1464,12 +1448,27 @@
1464 1448 * is the only one waiting for a grace period to complete.
1465 1449 */
1466 1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1467   - rdp->blimit = LONG_MAX;
1468   - if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1469   - *rdp->nxttail[RCU_DONE_TAIL] != head)
1470   - force_quiescent_state(rsp, 0);
1471   - rdp->n_force_qs_snap = rsp->n_force_qs;
1472   - rdp->qlen_last_fqs_check = rdp->qlen;
  1451 +
  1452 + /* Are we ignoring a completed grace period? */
  1453 + rcu_process_gp_end(rsp, rdp);
  1454 + check_for_new_grace_period(rsp, rdp);
  1455 +
  1456 + /* Start a new grace period if one not already started. */
  1457 + if (!rcu_gp_in_progress(rsp)) {
  1458 + unsigned long nestflag;
  1459 + struct rcu_node *rnp_root = rcu_get_root(rsp);
  1460 +
  1461 + raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
  1462 + rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
  1463 + } else {
  1464 + /* Give the grace period a kick. */
  1465 + rdp->blimit = LONG_MAX;
  1466 + if (rsp->n_force_qs == rdp->n_force_qs_snap &&
  1467 + *rdp->nxttail[RCU_DONE_TAIL] != head)
  1468 + force_quiescent_state(rsp, 0);
  1469 + rdp->n_force_qs_snap = rsp->n_force_qs;
  1470 + rdp->qlen_last_fqs_check = rdp->qlen;
  1471 + }
1473 1472 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1474 1473 force_quiescent_state(rsp, 1);
1475 1474 local_irq_restore(flags);
1476 1475  
1477 1476  
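
The restructured __call_rcu() now consults grace-period state only when the local callback count has grown by more than qhimark since the last check, which is what takes the pressure off the rcu_node locks. A toy sketch of that hysteresis pattern (the threshold and the stand-in "expensive" path are illustrative, not the kernel's):

    #include <stdio.h>

    #define QHIMARK 10000   /* illustrative threshold */

    static long qlen, qlen_last_check;

    /* Stand-in for the rare slow path; in __call_rcu() this is where
     * grace-period bookkeeping and force_quiescent_state() happen. */
    static void expensive_check(void)
    {
            printf("slow path at qlen=%ld\n", qlen);
            qlen_last_check = qlen;
    }

    static void enqueue_callback(void)
    {
            if (++qlen > qlen_last_check + QHIMARK)
                    expensive_check();
            /* Common case: just count the callback and return. */
    }

    int main(void)
    {
            for (int i = 0; i < 35000; i++)
                    enqueue_callback();
            return 0;
    }
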
... ... @@ -1699,13 +1698,12 @@
1699 1698 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1700 1699 * might complete its grace period before all of the other CPUs
1701 1700 * did their increment, causing this function to return too
1702   - * early.
  1701 + * early. Note that on_each_cpu() disables irqs, which prevents
  1702 + * any CPUs from coming online or going offline until each online
  1703 + * CPU has queued its RCU-barrier callback.
1703 1704 */
1704 1705 atomic_set(&rcu_barrier_cpu_count, 1);
1705   - preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1706   - rcu_adopt_orphan_cbs(rsp);
1707 1706 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1708   - preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1709 1707 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1710 1708 complete(&rcu_barrier_completion);
1711 1709 wait_for_completion(&rcu_barrier_completion);
1712 1710  
... ... @@ -1831,18 +1829,13 @@
1831 1829 case CPU_DYING:
1832 1830 case CPU_DYING_FROZEN:
1833 1831 /*
1834   - * preempt_disable() in _rcu_barrier() prevents stop_machine(),
1835   - * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
1836   - * returns, all online cpus have queued rcu_barrier_func().
1837   - * The dying CPU clears its cpu_online_mask bit and
1838   - * moves all of its RCU callbacks to ->orphan_cbs_list
1839   - * in the context of stop_machine(), so subsequent calls
1840   - * to _rcu_barrier() will adopt these callbacks and only
1841   - * then queue rcu_barrier_func() on all remaining CPUs.
  1832 + * The whole machine is "stopped" except this CPU, so we can
  1833 + * touch any data without introducing corruption. We send the
  1834 + * dying CPU's callbacks to an arbitrarily chosen online CPU.
1842 1835 */
1843   - rcu_send_cbs_to_orphanage(&rcu_bh_state);
1844   - rcu_send_cbs_to_orphanage(&rcu_sched_state);
1845   - rcu_preempt_send_cbs_to_orphanage();
  1836 + rcu_send_cbs_to_online(&rcu_bh_state);
  1837 + rcu_send_cbs_to_online(&rcu_sched_state);
  1838 + rcu_preempt_send_cbs_to_online();
1846 1839 break;
1847 1840 case CPU_DEAD:
1848 1841 case CPU_DEAD_FROZEN:
1849 1842  
... ... @@ -1880,8 +1873,9 @@
1880 1873 {
1881 1874 int i;
1882 1875  
1883   - for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
  1876 + for (i = NUM_RCU_LVLS - 1; i > 0; i--)
1884 1877 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
  1878 + rsp->levelspread[0] = RCU_FANOUT_LEAF;
1885 1879 }
1886 1880 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1887 1881 static void __init rcu_init_levelspread(struct rcu_state *rsp)
kernel/rcutree.h
... ... @@ -31,46 +31,51 @@
31 31 /*
32 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 33 * In theory, it should be possible to add more levels straightforwardly.
34   - * In practice, this has not been tested, so there is probably some
35   - * bug somewhere.
  34 + * In practice, this did work well going from three levels to four.
  35 + * Of course, your mileage may vary.
36 36 */
37 37 #define MAX_RCU_LVLS 4
38   -#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39   -#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40   -#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41   -#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT)
  38 +#if CONFIG_RCU_FANOUT > 16
  39 +#define RCU_FANOUT_LEAF 16
  40 +#else /* #if CONFIG_RCU_FANOUT > 16 */
  41 +#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
  42 +#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
  43 +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
  44 +#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
  45 +#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
  46 +#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
42 47  
43   -#if NR_CPUS <= RCU_FANOUT
  48 +#if NR_CPUS <= RCU_FANOUT_1
44 49 # define NUM_RCU_LVLS 1
45 50 # define NUM_RCU_LVL_0 1
46 51 # define NUM_RCU_LVL_1 (NR_CPUS)
47 52 # define NUM_RCU_LVL_2 0
48 53 # define NUM_RCU_LVL_3 0
49 54 # define NUM_RCU_LVL_4 0
50   -#elif NR_CPUS <= RCU_FANOUT_SQ
  55 +#elif NR_CPUS <= RCU_FANOUT_2
51 56 # define NUM_RCU_LVLS 2
52 57 # define NUM_RCU_LVL_0 1
53   -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
  58 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
54 59 # define NUM_RCU_LVL_2 (NR_CPUS)
55 60 # define NUM_RCU_LVL_3 0
56 61 # define NUM_RCU_LVL_4 0
57   -#elif NR_CPUS <= RCU_FANOUT_CUBE
  62 +#elif NR_CPUS <= RCU_FANOUT_3
58 63 # define NUM_RCU_LVLS 3
59 64 # define NUM_RCU_LVL_0 1
60   -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
61   -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
62   -# define NUM_RCU_LVL_3 NR_CPUS
  65 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
  66 +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
  67 +# define NUM_RCU_LVL_3 (NR_CPUS)
63 68 # define NUM_RCU_LVL_4 0
64   -#elif NR_CPUS <= RCU_FANOUT_FOURTH
  69 +#elif NR_CPUS <= RCU_FANOUT_4
65 70 # define NUM_RCU_LVLS 4
66 71 # define NUM_RCU_LVL_0 1
67   -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
68   -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
69   -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
70   -# define NUM_RCU_LVL_4 NR_CPUS
  72 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
  73 +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
  74 +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
  75 +# define NUM_RCU_LVL_4 (NR_CPUS)
71 76 #else
72 77 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
73   -#endif /* #if (NR_CPUS) <= RCU_FANOUT */
  78 +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
74 79  
75 80 #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
76 81 #define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
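
With leaf-level fanout capped at RCU_FANOUT_LEAF, the shape of the hierarchy follows directly from these macros. A standalone sketch evaluating the geometry for one hypothetical configuration (NR_CPUS=4096 and CONFIG_RCU_FANOUT=64, so RCU_FANOUT_LEAF is 16):

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

    int main(void)
    {
            /* Illustrative values, not taken from any particular .config. */
            long nr_cpus = 4096, fanout = 64, leaf = 16;
            long lvl2 = DIV_ROUND_UP(nr_cpus, leaf);                /* leaf rcu_nodes */
            long lvl1 = DIV_ROUND_UP(nr_cpus, leaf * fanout);       /* interior nodes */
            long lvl0 = 1;                                          /* root rcu_node  */

            printf("rcu_node structures: %ld root + %ld interior + %ld leaf = %ld\n",
                   lvl0, lvl1, lvl2, lvl0 + lvl1 + lvl2);
            printf("at most %ld CPUs per leaf rcu_node\n", leaf);
            return 0;
    }

For that configuration this prints 1 + 4 + 256 = 261 rcu_node structures, so each leaf lock is shared by at most 16 CPUs even though the interior fanout is 64.
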
... ... @@ -203,8 +208,8 @@
203 208 long qlen_last_fqs_check;
204 209 /* qlen at last check for QS forcing */
205 210 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206   - unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */
207   - unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */
  211 + unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
  212 + unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
208 213 unsigned long n_force_qs_snap;
209 214 /* did other CPU force QS recently? */
210 215 long blimit; /* Upper limit on a processed batch */
... ... @@ -309,15 +314,7 @@
309 314 /* End of fields guarded by root rcu_node's lock. */
310 315  
311 316 raw_spinlock_t onofflock; /* exclude on/offline and */
312   - /* starting new GP. Also */
313   - /* protects the following */
314   - /* orphan_cbs fields. */
315   - struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
316   - /* orphaned by all CPUs in */
317   - /* a given leaf rcu_node */
318   - /* going offline. */
319   - struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
320   - long orphan_qlen; /* Number of orphaned cbs. */
  317 + /* starting new GP. */
321 318 raw_spinlock_t fqslock; /* Only one task forcing */
322 319 /* quiescent states. */
323 320 unsigned long jiffies_force_qs; /* Time at which to invoke */
... ... @@ -390,7 +387,7 @@
390 387 static int rcu_preempt_pending(int cpu);
391 388 static int rcu_preempt_needs_cpu(int cpu);
392 389 static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
393   -static void rcu_preempt_send_cbs_to_orphanage(void);
  390 +static void rcu_preempt_send_cbs_to_online(void);
394 391 static void __init __rcu_init_preempt(void);
395 392 static void rcu_needs_cpu_flush(void);
396 393  
kernel/rcutree_plugin.h
... ... @@ -25,6 +25,7 @@
25 25 */
26 26  
27 27 #include <linux/delay.h>
  28 +#include <linux/stop_machine.h>
28 29  
29 30 /*
30 31 * Check the RCU kernel configuration parameters and print informative
31 32  
32 33  
... ... @@ -773,11 +774,11 @@
773 774 }
774 775  
775 776 /*
776   - * Move preemptable RCU's callbacks to ->orphan_cbs_list.
  777 + * Move preemptable RCU's callbacks from dying CPU to other online CPU.
777 778 */
778   -static void rcu_preempt_send_cbs_to_orphanage(void)
  779 +static void rcu_preempt_send_cbs_to_online(void)
779 780 {
780   - rcu_send_cbs_to_orphanage(&rcu_preempt_state);
  781 + rcu_send_cbs_to_online(&rcu_preempt_state);
781 782 }
782 783  
783 784 /*
... ... @@ -1001,7 +1002,7 @@
1001 1002 /*
1002 1003 * Because there is no preemptable RCU, there are no callbacks to move.
1003 1004 */
1004   -static void rcu_preempt_send_cbs_to_orphanage(void)
  1005 +static void rcu_preempt_send_cbs_to_online(void)
1005 1006 {
1006 1007 }
1007 1008  
... ... @@ -1013,6 +1014,132 @@
1013 1014 }
1014 1015  
1015 1016 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
  1017 +
  1018 +#ifndef CONFIG_SMP
  1019 +
  1020 +void synchronize_sched_expedited(void)
  1021 +{
  1022 + cond_resched();
  1023 +}
  1024 +EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
  1025 +
  1026 +#else /* #ifndef CONFIG_SMP */
  1027 +
  1028 +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
  1029 +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
  1030 +
  1031 +static int synchronize_sched_expedited_cpu_stop(void *data)
  1032 +{
  1033 + /*
  1034 + * There must be a full memory barrier on each affected CPU
  1035 + * between the time that try_stop_cpus() is called and the
  1036 + * time that it returns.
  1037 + *
  1038 + * In the current initial implementation of cpu_stop, the
  1039 + * above condition is already met when the control reaches
  1040 + * this point and the following smp_mb() is not strictly
  1041 + * necessary. Do smp_mb() anyway for documentation and
  1042 + * robustness against future implementation changes.
  1043 + */
  1044 + smp_mb(); /* See above comment block. */
  1045 + return 0;
  1046 +}
  1047 +
  1048 +/*
  1049 + * Wait for an rcu-sched grace period to elapse, but use "big hammer"
  1050 + * approach to force grace period to end quickly. This consumes
  1051 + * significant time on all CPUs, and is thus not recommended for
  1052 + * any sort of common-case code.
  1053 + *
  1054 + * Note that it is illegal to call this function while holding any
  1055 + * lock that is acquired by a CPU-hotplug notifier. Failing to
  1056 + * observe this restriction will result in deadlock.
  1057 + *
  1058 + * This implementation can be thought of as an application of ticket
  1059 + * locking to RCU, with sync_sched_expedited_started and
  1060 + * sync_sched_expedited_done taking on the roles of the halves
  1061 + * of the ticket-lock word. Each task atomically increments
  1062 + * sync_sched_expedited_started upon entry, snapshotting the old value,
  1063 + * then attempts to stop all the CPUs. If this succeeds, then each
  1064 + * CPU will have executed a context switch, resulting in an RCU-sched
  1065 + * grace period. We are then done, so we use atomic_cmpxchg() to
  1066 + * update sync_sched_expedited_done to match our snapshot -- but
  1067 + * only if someone else has not already advanced past our snapshot.
  1068 + *
  1069 + * On the other hand, if try_stop_cpus() fails, we check the value
  1070 + * of sync_sched_expedited_done. If it has advanced past our
  1071 + * initial snapshot, then someone else must have forced a grace period
  1072 + * some time after we took our snapshot. In this case, our work is
  1073 + * done for us, and we can simply return. Otherwise, we try again,
  1074 + * but keep our initial snapshot for purposes of checking for someone
  1075 + * doing our work for us.
  1076 + *
  1077 + * If we fail too many times in a row, we fall back to synchronize_sched().
  1078 + */
  1079 +void synchronize_sched_expedited(void)
  1080 +{
  1081 + int firstsnap, s, snap, trycount = 0;
  1082 +
  1083 + /* Note that atomic_inc_return() implies full memory barrier. */
  1084 + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
  1085 + get_online_cpus();
  1086 +
  1087 + /*
  1088 + * Each pass through the following loop attempts to force a
  1089 + * context switch on each CPU.
  1090 + */
  1091 + while (try_stop_cpus(cpu_online_mask,
  1092 + synchronize_sched_expedited_cpu_stop,
  1093 + NULL) == -EAGAIN) {
  1094 + put_online_cpus();
  1095 +
  1096 + /* No joy, try again later. Or just synchronize_sched(). */
  1097 + if (trycount++ < 10)
  1098 + udelay(trycount * num_online_cpus());
  1099 + else {
  1100 + synchronize_sched();
  1101 + return;
  1102 + }
  1103 +
  1104 + /* Check to see if someone else did our work for us. */
  1105 + s = atomic_read(&sync_sched_expedited_done);
  1106 + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
  1107 + smp_mb(); /* ensure test happens before caller kfree */
  1108 + return;
  1109 + }
  1110 +
  1111 + /*
  1112 + * Refetching sync_sched_expedited_started allows later
  1113 + * callers to piggyback on our grace period. We subtract
  1114 + * 1 to get the same token that the last incrementer got.
  1115 + * We retry after they started, so our grace period works
  1116 + * for them, and they started after our first try, so their
  1117 + * grace period works for us.
  1118 + */
  1119 + get_online_cpus();
  1120 + snap = atomic_read(&sync_sched_expedited_started) - 1;
  1121 + smp_mb(); /* ensure read is before try_stop_cpus(). */
  1122 + }
  1123 +
  1124 + /*
  1125 + * Everyone up to our most recent fetch is covered by our grace
  1126 + * period. Update the counter, but only if our work is still
  1127 + * relevant -- which it won't be if someone who started later
  1128 + * than we did beat us to the punch.
  1129 + */
  1130 + do {
  1131 + s = atomic_read(&sync_sched_expedited_done);
  1132 + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
  1133 + smp_mb(); /* ensure test happens before caller kfree */
  1134 + break;
  1135 + }
  1136 + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
  1137 +
  1138 + put_online_cpus();
  1139 +}
  1140 +EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
  1141 +
  1142 +#endif /* #else #ifndef CONFIG_SMP */
1016 1143  
1017 1144 #if !defined(CONFIG_RCU_FAST_NO_HZ)
1018 1145  
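
The sync_sched_expedited_started/done pair behaves like the two halves of a ticket lock, as the block comment above explains. The sketch below illustrates only that snapshot-and-cmpxchg bookkeeping, using C11 atomics in userspace; try_stop_cpus() is stubbed out as a random failure, and details such as the kernel's off-by-one refetch of the started counter are omitted:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    static atomic_uint started, done;

    /* Stand-in for try_stop_cpus(): pretend it sometimes fails with -EAGAIN. */
    static int try_force_grace_period(void)
    {
            return rand() % 2 ? 0 : -1;
    }

    static void expedited(void)
    {
            unsigned int firstsnap, snap, s;

            /* Take a ticket: the post-increment value is our token. */
            firstsnap = snap = atomic_fetch_add(&started, 1) + 1;

            while (try_force_grace_period() != 0) {
                    /* Did someone else's grace period already cover us? */
                    s = atomic_load(&done);
                    if ((int)(s - firstsnap) >= 0) {
                            printf("piggybacked on another caller\n");
                            return;
                    }
                    /* Retry with a fresher snapshot so later starters are covered. */
                    snap = atomic_load(&started);
            }

            /* Advance "done" to our snapshot unless someone beat us to it. */
            do {
                    s = atomic_load(&done);
                    if ((int)(s - snap) >= 0)
                            break;
            } while (!atomic_compare_exchange_weak(&done, &s, snap));
            printf("forced a grace period, done=%u\n", atomic_load(&done));
    }

    int main(void)
    {
            for (int i = 0; i < 4; i++)
                    expedited();
            return 0;
    }
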
kernel/rcutree_trace.c
... ... @@ -166,13 +166,13 @@
166 166  
167 167 gpnum = rsp->gpnum;
168 168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
169   - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
  169 + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
170 170 rsp->completed, gpnum, rsp->signaled,
171 171 (long)(rsp->jiffies_force_qs - jiffies),
172 172 (int)(jiffies & 0xffff),
173 173 rsp->n_force_qs, rsp->n_force_qs_ngp,
174 174 rsp->n_force_qs - rsp->n_force_qs_ngp,
175   - rsp->n_force_qs_lh, rsp->orphan_qlen);
  175 + rsp->n_force_qs_lh);
176 176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
177 177 if (rnp->level != level) {
178 178 seq_puts(m, "\n");
... ... @@ -300,7 +300,7 @@
300 300  
301 301 static struct dentry *rcudir;
302 302  
303   -static int __init rcuclassic_trace_init(void)
  303 +static int __init rcutree_trace_init(void)
304 304 {
305 305 struct dentry *retval;
306 306  
307 307  
... ... @@ -337,14 +337,14 @@
337 337 return 1;
338 338 }
339 339  
340   -static void __exit rcuclassic_trace_cleanup(void)
  340 +static void __exit rcutree_trace_cleanup(void)
341 341 {
342 342 debugfs_remove_recursive(rcudir);
343 343 }
344 344  
345 345  
346   -module_init(rcuclassic_trace_init);
347   -module_exit(rcuclassic_trace_cleanup);
  346 +module_init(rcutree_trace_init);
  347 +module_exit(rcutree_trace_cleanup);
348 348  
349 349 MODULE_AUTHOR("Paul E. McKenney");
350 350 MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
kernel/sched.c
... ... @@ -9533,74 +9533,4 @@
9533 9533 .subsys_id = cpuacct_subsys_id,
9534 9534 };
9535 9535 #endif /* CONFIG_CGROUP_CPUACCT */
9536   -
9537   -#ifndef CONFIG_SMP
9538   -
9539   -void synchronize_sched_expedited(void)
9540   -{
9541   - barrier();
9542   -}
9543   -EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9544   -
9545   -#else /* #ifndef CONFIG_SMP */
9546   -
9547   -static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9548   -
9549   -static int synchronize_sched_expedited_cpu_stop(void *data)
9550   -{
9551   - /*
9552   - * There must be a full memory barrier on each affected CPU
9553   - * between the time that try_stop_cpus() is called and the
9554   - * time that it returns.
9555   - *
9556   - * In the current initial implementation of cpu_stop, the
9557   - * above condition is already met when the control reaches
9558   - * this point and the following smp_mb() is not strictly
9559   - * necessary. Do smp_mb() anyway for documentation and
9560   - * robustness against future implementation changes.
9561   - */
9562   - smp_mb(); /* See above comment block. */
9563   - return 0;
9564   -}
9565   -
9566   -/*
9567   - * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9568   - * approach to force grace period to end quickly. This consumes
9569   - * significant time on all CPUs, and is thus not recommended for
9570   - * any sort of common-case code.
9571   - *
9572   - * Note that it is illegal to call this function while holding any
9573   - * lock that is acquired by a CPU-hotplug notifier. Failing to
9574   - * observe this restriction will result in deadlock.
9575   - */
9576   -void synchronize_sched_expedited(void)
9577   -{
9578   - int snap, trycount = 0;
9579   -
9580   - smp_mb(); /* ensure prior mod happens before capturing snap. */
9581   - snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9582   - get_online_cpus();
9583   - while (try_stop_cpus(cpu_online_mask,
9584   - synchronize_sched_expedited_cpu_stop,
9585   - NULL) == -EAGAIN) {
9586   - put_online_cpus();
9587   - if (trycount++ < 10)
9588   - udelay(trycount * num_online_cpus());
9589   - else {
9590   - synchronize_sched();
9591   - return;
9592   - }
9593   - if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9594   - smp_mb(); /* ensure test happens before caller kfree */
9595   - return;
9596   - }
9597   - get_online_cpus();
9598   - }
9599   - atomic_inc(&synchronize_sched_expedited_count);
9600   - smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9601   - put_online_cpus();
9602   -}
9603   -EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9604   -
9605   -#endif /* #else #ifndef CONFIG_SMP */
kernel/srcu.c
... ... @@ -31,6 +31,7 @@
31 31 #include <linux/rcupdate.h>
32 32 #include <linux/sched.h>
33 33 #include <linux/smp.h>
  34 +#include <linux/delay.h>
34 35 #include <linux/srcu.h>
35 36  
36 37 static int init_srcu_struct_fields(struct srcu_struct *sp)
37 38  
... ... @@ -203,9 +204,14 @@
203 204 * all srcu_read_lock() calls using the old counters have completed.
204 205 * Their corresponding critical sections might well be still
205 206 * executing, but the srcu_read_lock() primitives themselves
206   - * will have finished executing.
  207 + * will have finished executing. We initially give readers
  208 + * an arbitrarily chosen 10 microseconds to get out of their
  209 + * SRCU read-side critical sections, then loop waiting 1/HZ
  210 + * seconds per iteration.
207 211 */
208 212  
  213 + if (srcu_readers_active_idx(sp, idx))
  214 + udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
209 215 while (srcu_readers_active_idx(sp, idx))
210 216 schedule_timeout_interruptible(1);
211 217
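
The reworked wait is a two-phase poll: one short delay of CONFIG_SRCU_SYNCHRONIZE_DELAY microseconds for the common case of short read-side critical sections, then a sleep of roughly one jiffy per pass for stragglers. A userspace sketch of that shape (the reader-count check and the delay values are stand-ins, not SRCU's):

    #include <stdio.h>
    #include <unistd.h>

    #define SPIN_DELAY_US   10      /* plays the role of CONFIG_SRCU_SYNCHRONIZE_DELAY */

    static int passes_remaining = 3;

    /* Stand-in for srcu_readers_active_idx(): nonzero while readers remain. */
    static int readers_active(void)
    {
            return passes_remaining-- > 0;
    }

    int main(void)
    {
            /* Phase 1: one brief wait, enough for short readers to drain. */
            if (readers_active())
                    usleep(SPIN_DELAY_US);

            /* Phase 2: slow path, sleep about a scheduler tick per pass. */
            while (readers_active())
                    usleep(10000);  /* ~1/HZ with HZ=100, purely illustrative */

            printf("all readers done\n");
            return 0;
    }
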