Commit 40393f525fe698e2b639cf1851ef0a40e7e158a8

Authored by Paul E. McKenney

Merge branches 'doctorture.2013.01.29a', 'fixes.2013.01.26a', 'tagcb.2013.01.24a' and 'tiny.2013.01.29b' into HEAD

doctorture.2013.01.11a: Changes to rcutorture and to RCU documentation.

fixes.2013.01.26a: Miscellaneous fixes.

tagcb.2013.01.24a: Tag RCU callbacks with grace-period number to
	simplify callback advancement.

tiny.2013.01.29b: Enhancements to uniprocessor handling in tiny RCU.

12 changed files

include/linux/rcupdate.h
... ... @@ -756,7 +756,7 @@
756 756 * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU)
757 757 * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may
758 758 * be preempted, but explicit blocking is illegal. Finally, in preemptible
759   - * RCU implementations in real-time (CONFIG_PREEMPT_RT) kernel builds,
  759 + * RCU implementations in real-time (with -rt patchset) kernel builds,
760 760 * RCU read-side critical sections may be preempted and they may also
761 761 * block, but only when acquiring spinlocks that are subject to priority
762 762 * inheritance.
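
A minimal reader/updater sketch may help ground the reader rules described in the comment above. The toy_cfg structure and helper names below are hypothetical; only the RCU, spinlock, and allocation calls are real kernel APIs, and the fragment is a kernel-context sketch rather than a standalone program.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct toy_cfg {
	int threshold;
};

static struct toy_cfg __rcu *toy_cfg_p;		/* RCU-protected pointer */
static DEFINE_SPINLOCK(toy_cfg_lock);		/* serializes updaters only */

static int toy_read_threshold(void)
{
	struct toy_cfg *cfg;
	int val = -1;

	rcu_read_lock();			/* may be preempted, must not block */
	cfg = rcu_dereference(toy_cfg_p);
	if (cfg)
		val = cfg->threshold;
	rcu_read_unlock();
	return val;
}

static void toy_update_threshold(int new_threshold)
{
	struct toy_cfg *newcfg = kmalloc(sizeof(*newcfg), GFP_KERNEL);
	struct toy_cfg *old;

	if (!newcfg)
		return;
	newcfg->threshold = new_threshold;
	spin_lock(&toy_cfg_lock);
	old = rcu_dereference_protected(toy_cfg_p,
					lockdep_is_held(&toy_cfg_lock));
	rcu_assign_pointer(toy_cfg_p, newcfg);
	spin_unlock(&toy_cfg_lock);
	synchronize_rcu();			/* wait for pre-existing readers */
	kfree(old);
}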
include/trace/events/rcu.h
... ... @@ -44,8 +44,10 @@
44 44 * of a new grace period or the end of an old grace period ("cpustart"
45 45 * and "cpuend", respectively), a CPU passing through a quiescent
46 46 * state ("cpuqs"), a CPU coming online or going offline ("cpuonl"
47   - * and "cpuofl", respectively), and a CPU being kicked for being too
48   - * long in dyntick-idle mode ("kick").
  47 + * and "cpuofl", respectively), a CPU being kicked for being too
  48 + * long in dyntick-idle mode ("kick"), a CPU accelerating its new
  49 + * callbacks to RCU_NEXT_READY_TAIL ("AccReadyCB"), and a CPU
  50 + * accelerating its new callbacks to RCU_WAIT_TAIL ("AccWaitCB").
49 51 */
50 52 TRACE_EVENT(rcu_grace_period,
51 53  
... ... @@ -393,7 +395,7 @@
393 395 */
394 396 TRACE_EVENT(rcu_batch_start,
395 397  
396   - TP_PROTO(char *rcuname, long qlen_lazy, long qlen, int blimit),
  398 + TP_PROTO(char *rcuname, long qlen_lazy, long qlen, long blimit),
397 399  
398 400 TP_ARGS(rcuname, qlen_lazy, qlen, blimit),
399 401  
... ... @@ -401,7 +403,7 @@
401 403 __field(char *, rcuname)
402 404 __field(long, qlen_lazy)
403 405 __field(long, qlen)
404   - __field(int, blimit)
  406 + __field(long, blimit)
405 407 ),
406 408  
407 409 TP_fast_assign(
... ... @@ -411,7 +413,7 @@
411 413 __entry->blimit = blimit;
412 414 ),
413 415  
414   - TP_printk("%s CBs=%ld/%ld bl=%d",
  416 + TP_printk("%s CBs=%ld/%ld bl=%ld",
415 417 __entry->rcuname, __entry->qlen_lazy, __entry->qlen,
416 418 __entry->blimit)
417 419 );
init/Kconfig
... ... @@ -453,7 +453,7 @@
453 453  
454 454 config TREE_PREEMPT_RCU
455 455 bool "Preemptible tree-based hierarchical RCU"
456   - depends on PREEMPT && SMP
  456 + depends on PREEMPT
457 457 help
458 458 This option selects the RCU implementation that is
459 459 designed for very large SMP systems with hundreds or
... ... @@ -461,6 +461,8 @@
461 461 is also required. It also scales down nicely to
462 462 smaller systems.
463 463  
  464 + Select this option if you are unsure.
  465 +
464 466 config TINY_RCU
465 467 bool "UP-only small-memory-footprint RCU"
466 468 depends on !PREEMPT && !SMP
... ... @@ -485,6 +487,14 @@
485 487 help
486 488 This option enables preemptible-RCU code that is common between
487 489 the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
  490 +
  491 +config RCU_STALL_COMMON
  492 + def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE )
  493 + help
  494 + This option enables RCU CPU stall code that is common between
  495 + the TINY and TREE variants of RCU. The purpose is to allow
  496 + the tiny variants to disable RCU CPU stall warnings, while
  497 + making these warnings mandatory for the tree variants.
488 498  
489 499 config CONTEXT_TRACKING
490 500 bool
kernel/context_tracking.c
  1 +/*
  2 + * Context tracking: Probe on high level context boundaries such as kernel
  3 + * and userspace. This includes syscalls and exceptions entry/exit.
  4 + *
  5 + * This is used by RCU to remove its dependency on the timer tick while a CPU
  6 + * runs in userspace.
  7 + *
  8 + * Started by Frederic Weisbecker:
  9 + *
  10 + * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
  11 + *
  12 + * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
  13 + * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
  14 + *
  15 + */
  16 +
1 17 #include <linux/context_tracking.h>
2 18 #include <linux/rcupdate.h>
3 19 #include <linux/sched.h>
... ... @@ -6,8 +22,8 @@
6 22  
7 23 struct context_tracking {
8 24 /*
9   - * When active is false, hooks are not set to
10   - * minimize overhead: TIF flags are cleared
  25 + * When active is false, probes are unset in order
  26 + * to minimize overhead: TIF flags are cleared
11 27 * and calls to user_enter/exit are ignored. This
12 28 * may be further optimized using static keys.
13 29 */
... ... @@ -24,6 +40,15 @@
24 40 #endif
25 41 };
26 42  
  43 +/**
  44 + * user_enter - Inform the context tracking that the CPU is going to
  45 + * enter userspace mode.
  46 + *
  47 + * This function must be called right before we switch from the kernel
  48 + * to userspace, when it's guaranteed the remaining kernel instructions
  49 + * to execute won't use any RCU read side critical section because this
  50 + * function sets RCU in extended quiescent state.
  51 + */
27 52 void user_enter(void)
28 53 {
29 54 unsigned long flags;
30 55  
31 56  
32 57  
33 58  
34 59  
... ... @@ -39,40 +64,70 @@
39 64 if (in_interrupt())
40 65 return;
41 66  
  67 + /* Kernel threads aren't supposed to go to userspace */
42 68 WARN_ON_ONCE(!current->mm);
43 69  
44 70 local_irq_save(flags);
45 71 if (__this_cpu_read(context_tracking.active) &&
46 72 __this_cpu_read(context_tracking.state) != IN_USER) {
47 73 __this_cpu_write(context_tracking.state, IN_USER);
  74 + /*
  75 + * At this stage, only low level arch entry code remains and
  76 + * then we'll run in userspace. We can assume there won't be
  77 + * any RCU read-side critical section until the next call to
  78 + * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
  79 + * on the tick.
  80 + */
48 81 rcu_user_enter();
49 82 }
50 83 local_irq_restore(flags);
51 84 }
52 85  
  86 +
  87 +/**
  88 + * user_exit - Inform the context tracking that the CPU is
  89 + * exiting userspace mode and entering the kernel.
  90 + *
  91 + * This function must be called after we entered the kernel from userspace
  92 + * before any use of RCU read side critical section. This potentially include
  93 + * any high level kernel code like syscalls, exceptions, signal handling, etc...
  94 + *
  95 + * This call supports re-entrancy. This way it can be called from any exception
  96 + * handler without needing to know if we came from userspace or not.
  97 + */
53 98 void user_exit(void)
54 99 {
55 100 unsigned long flags;
56 101  
57   - /*
58   - * Some contexts may involve an exception occuring in an irq,
59   - * leading to that nesting:
60   - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
61   - * This would mess up the dyntick_nesting count though. And rcu_irq_*()
62   - * helpers are enough to protect RCU uses inside the exception. So
63   - * just return immediately if we detect we are in an IRQ.
64   - */
65 102 if (in_interrupt())
66 103 return;
67 104  
68 105 local_irq_save(flags);
69 106 if (__this_cpu_read(context_tracking.state) == IN_USER) {
70 107 __this_cpu_write(context_tracking.state, IN_KERNEL);
  108 + /*
  109 + * We are going to run code that may use RCU. Inform
  110 + * RCU core about that (ie: we may need the tick again).
  111 + */
71 112 rcu_user_exit();
72 113 }
73 114 local_irq_restore(flags);
74 115 }
75 116  
  117 +
  118 +/**
  119 + * context_tracking_task_switch - context switch the syscall callbacks
  120 + * @prev: the task that is being switched out
  121 + * @next: the task that is being switched in
  122 + *
  123 + * The context tracking uses the syscall slow path to implement its user-kernel
  124 + * boundaries probes on syscalls. This way it doesn't impact the syscall fast
  125 + * path on CPUs that don't do context tracking.
  126 + *
  127 + * But we need to clear the flag on the previous task because it may later
  128 + * migrate to some CPU that doesn't do the context tracking. As such the TIF
  129 + * flag may not be desired there.
  130 + */
76 131 void context_tracking_task_switch(struct task_struct *prev,
77 132 struct task_struct *next)
78 133 {
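
For context, a hedged sketch of how an architecture's syscall slow path might use the probes documented above: the function names and surrounding entry code are hypothetical, and only user_exit() and user_enter() come from <linux/context_tracking.h>.

#include <linux/context_tracking.h>
#include <linux/ptrace.h>

/* Hypothetical arch hook run on syscall entry, once on the kernel stack:
 * leave the RCU extended quiescent state before any RCU usage. */
void toy_syscall_trace_enter(struct pt_regs *regs)
{
	user_exit();
	/* ... tracing, seccomp, audit, and other slow-path work ... */
}

/* Hypothetical arch hook run just before returning to userspace: after
 * user_enter(), no RCU read-side critical sections may run until the
 * next user_exit() or rcu_irq_enter(). */
void toy_syscall_trace_leave(struct pt_regs *regs)
{
	/* ... other slow-path work ... */
	user_enter();
}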
kernel/rcu.h
... ... @@ -111,5 +111,12 @@
111 111  
112 112 extern int rcu_expedited;
113 113  
  114 +#ifdef CONFIG_RCU_STALL_COMMON
  115 +
  116 +extern int rcu_cpu_stall_suppress;
  117 +int rcu_jiffies_till_stall_check(void);
  118 +
  119 +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
  120 +
114 121 #endif /* __LINUX_RCU_H */
kernel/rcupdate.c
... ... @@ -415,4 +415,55 @@
415 415 #define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
416 416 do { } while (0)
417 417 #endif
  418 +
  419 +#ifdef CONFIG_RCU_STALL_COMMON
  420 +
  421 +#ifdef CONFIG_PROVE_RCU
  422 +#define RCU_STALL_DELAY_DELTA (5 * HZ)
  423 +#else
  424 +#define RCU_STALL_DELAY_DELTA 0
  425 +#endif
  426 +
  427 +int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
  428 +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
  429 +
  430 +module_param(rcu_cpu_stall_suppress, int, 0644);
  431 +module_param(rcu_cpu_stall_timeout, int, 0644);
  432 +
  433 +int rcu_jiffies_till_stall_check(void)
  434 +{
  435 + int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
  436 +
  437 + /*
  438 + * Limit check must be consistent with the Kconfig limits
  439 + * for CONFIG_RCU_CPU_STALL_TIMEOUT.
  440 + */
  441 + if (till_stall_check < 3) {
  442 + ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
  443 + till_stall_check = 3;
  444 + } else if (till_stall_check > 300) {
  445 + ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
  446 + till_stall_check = 300;
  447 + }
  448 + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
  449 +}
  450 +
  451 +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
  452 +{
  453 + rcu_cpu_stall_suppress = 1;
  454 + return NOTIFY_DONE;
  455 +}
  456 +
  457 +static struct notifier_block rcu_panic_block = {
  458 + .notifier_call = rcu_panic,
  459 +};
  460 +
  461 +static int __init check_cpu_stall_init(void)
  462 +{
  463 + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
  464 + return 0;
  465 +}
  466 +early_initcall(check_cpu_stall_init);
  467 +
  468 +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
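
As a quick illustration of the clamping in rcu_jiffies_till_stall_check() above, here is a standalone userspace sketch of the same logic. TOY_HZ and the test values are assumptions; the 3..300 range mirrors the Kconfig limits for CONFIG_RCU_CPU_STALL_TIMEOUT.

#include <stdio.h>

#define TOY_HZ 1000			/* assumption: a common HZ value */
#define TOY_STALL_DELAY_DELTA 0		/* nonzero only under PROVE_RCU */

static int toy_stall_timeout = 21;	/* mirrors the Kconfig default */

static int toy_jiffies_till_stall_check(void)
{
	int till_stall_check = toy_stall_timeout;

	/* Same clamp as above: keep within the Kconfig range 3..300. */
	if (till_stall_check < 3) {
		toy_stall_timeout = 3;
		till_stall_check = 3;
	} else if (till_stall_check > 300) {
		toy_stall_timeout = 300;
		till_stall_check = 300;
	}
	return till_stall_check * TOY_HZ + TOY_STALL_DELAY_DELTA;
}

int main(void)
{
	int vals[] = { 0, 21, 1000 };
	int i;

	for (i = 0; i < 3; i++) {
		toy_stall_timeout = vals[i];
		printf("timeout=%4d s -> %d jiffies\n",
		       vals[i], toy_jiffies_till_stall_check());
	}
	return 0;
}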
kernel/rcutiny.c
... ... @@ -51,10 +51,10 @@
51 51 void (*func)(struct rcu_head *rcu),
52 52 struct rcu_ctrlblk *rcp);
53 53  
54   -#include "rcutiny_plugin.h"
55   -
56 54 static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
57 55  
  56 +#include "rcutiny_plugin.h"
  57 +
58 58 /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59 59 static void rcu_idle_enter_common(long long newval)
60 60 {
... ... @@ -193,7 +193,7 @@
193 193 * interrupts don't count, we must be running at the first interrupt
194 194 * level.
195 195 */
196   -int rcu_is_cpu_rrupt_from_idle(void)
  196 +static int rcu_is_cpu_rrupt_from_idle(void)
197 197 {
198 198 return rcu_dynticks_nesting <= 1;
199 199 }
... ... @@ -205,6 +205,7 @@
205 205 */
206 206 static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
207 207 {
  208 + reset_cpu_stall_ticks(rcp);
208 209 if (rcp->rcucblist != NULL &&
209 210 rcp->donetail != rcp->curtail) {
210 211 rcp->donetail = rcp->curtail;
... ... @@ -251,6 +252,7 @@
251 252 */
252 253 void rcu_check_callbacks(int cpu, int user)
253 254 {
  255 + check_cpu_stalls();
254 256 if (user || rcu_is_cpu_rrupt_from_idle())
255 257 rcu_sched_qs(cpu);
256 258 else if (!in_softirq())
kernel/rcutiny_plugin.h
... ... @@ -33,6 +33,9 @@
33 33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
34 34 struct rcu_head **curtail; /* ->next pointer of last CB. */
35 35 RCU_TRACE(long qlen); /* Number of pending CBs. */
  36 + RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
  37 + RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
  38 + RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
36 39 RCU_TRACE(char *name); /* Name of RCU type. */
37 40 };
38 41  
... ... @@ -54,6 +57,51 @@
54 57 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
55 58 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
56 59  
  60 +#ifdef CONFIG_RCU_TRACE
  61 +
  62 +static void check_cpu_stall(struct rcu_ctrlblk *rcp)
  63 +{
  64 + unsigned long j;
  65 + unsigned long js;
  66 +
  67 + if (rcu_cpu_stall_suppress)
  68 + return;
  69 + rcp->ticks_this_gp++;
  70 + j = jiffies;
  71 + js = rcp->jiffies_stall;
  72 + if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
  73 + pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
  74 + rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
  75 + jiffies - rcp->gp_start, rcp->qlen);
  76 + dump_stack();
  77 + }
  78 + if (*rcp->curtail && ULONG_CMP_GE(j, js))
  79 + rcp->jiffies_stall = jiffies +
  80 + 3 * rcu_jiffies_till_stall_check() + 3;
  81 + else if (ULONG_CMP_GE(j, js))
  82 + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
  83 +}
  84 +
  85 +static void check_cpu_stall_preempt(void);
  86 +
  87 +#endif /* #ifdef CONFIG_RCU_TRACE */
  88 +
  89 +static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
  90 +{
  91 +#ifdef CONFIG_RCU_TRACE
  92 + rcp->ticks_this_gp = 0;
  93 + rcp->gp_start = jiffies;
  94 + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
  95 +#endif /* #ifdef CONFIG_RCU_TRACE */
  96 +}
  97 +
  98 +static void check_cpu_stalls(void)
  99 +{
  100 + RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
  101 + RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
  102 + RCU_TRACE(check_cpu_stall_preempt());
  103 +}
  104 +
57 105 #ifdef CONFIG_TINY_PREEMPT_RCU
58 106  
59 107 #include <linux/delay.h>
... ... @@ -448,6 +496,7 @@
448 496 /* Official start of GP. */
449 497 rcu_preempt_ctrlblk.gpnum++;
450 498 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
  499 + reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
451 500  
452 501 /* Any blocked RCU readers block new GP. */
453 502 if (rcu_preempt_blocked_readers_any())
... ... @@ -1053,6 +1102,13 @@
1053 1102 MODULE_AUTHOR("Paul E. McKenney");
1054 1103 MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1055 1104 MODULE_LICENSE("GPL");
  1105 +
  1106 +static void check_cpu_stall_preempt(void)
  1107 +{
  1108 +#ifdef CONFIG_TINY_PREEMPT_RCU
  1109 + check_cpu_stall(&rcu_preempt_ctrlblk.rcb);
  1110 +#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */
  1111 +}
1056 1112  
1057 1113 #endif /* #ifdef CONFIG_RCU_TRACE */
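
The stall check above leans on ULONG_CMP_GE() to compare jiffies values that may wrap. A standalone sketch of that comparison follows; the macro body is copied from the kernel's definition, and the jiffies values are made up for illustration.

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long js = ULONG_MAX - 5;	/* stall deadline just before wrap */
	unsigned long j = js + 10;		/* "now", after jiffies wrapped */

	/* A plain >= gives the wrong answer once the counter wraps... */
	printf("naive  j >= js      : %d\n", j >= js);
	/* ...but the modular comparison still sees j as past the deadline. */
	printf("ULONG_CMP_GE(j, js) : %d\n", ULONG_CMP_GE(j, js));
	return 0;
}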
kernel/rcutorture.c
... ... @@ -1782,7 +1782,7 @@
1782 1782 barrier_cbs_wq =
1783 1783 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1784 1784 GFP_KERNEL);
1785   - if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
  1785 + if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
1786 1786 return -ENOMEM;
1787 1787 for (i = 0; i < n_barrier_cbs; i++) {
1788 1788 init_waitqueue_head(&barrier_cbs_wq[i]);
kernel/rcutree.c
... ... @@ -105,7 +105,7 @@
105 105 * The rcu_scheduler_active variable transitions from zero to one just
106 106 * before the first task is spawned. So when this variable is zero, RCU
107 107 * can assume that there is but one task, allowing RCU to (for example)
108   - * optimized synchronize_sched() to a simple barrier(). When this variable
  108 + * optimize synchronize_sched() to a simple barrier(). When this variable
109 109 * is one, RCU must actually do all the hard work required to detect real
110 110 * grace periods. This variable is also used to suppress boot-time false
111 111 * positives from lockdep-RCU error checking.
... ... @@ -217,12 +217,6 @@
217 217 module_param(qhimark, long, 0444);
218 218 module_param(qlowmark, long, 0444);
219 219  
220   -int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
221   -int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
222   -
223   -module_param(rcu_cpu_stall_suppress, int, 0644);
224   -module_param(rcu_cpu_stall_timeout, int, 0644);
225   -
226 220 static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
227 221 static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
228 222  
229 223  
230 224  
... ... @@ -305,17 +299,27 @@
305 299 }
306 300  
307 301 /*
308   - * Does the current CPU require a yet-as-unscheduled grace period?
  302 + * Does the current CPU require a not-yet-started grace period?
  303 + * The caller must have disabled interrupts to prevent races with
  304 + * normal callback registry.
309 305 */
310 306 static int
311 307 cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
312 308 {
313   - struct rcu_head **ntp;
  309 + int i;
314 310  
315   - ntp = rdp->nxttail[RCU_DONE_TAIL +
316   - (ACCESS_ONCE(rsp->completed) != rdp->completed)];
317   - return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp &&
318   - !rcu_gp_in_progress(rsp);
  311 + if (rcu_gp_in_progress(rsp))
  312 + return 0; /* No, a grace period is already in progress. */
  313 + if (!rdp->nxttail[RCU_NEXT_TAIL])
  314 + return 0; /* No, this is a no-CBs (or offline) CPU. */
  315 + if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
  316 + return 1; /* Yes, this CPU has newly registered callbacks. */
  317 + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
  318 + if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
  319 + ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
  320 + rdp->nxtcompleted[i]))
  321 + return 1; /* Yes, CBs for future grace period. */
  322 + return 0; /* No grace period needed. */
319 323 }
320 324  
321 325 /*
... ... @@ -336,7 +340,7 @@
336 340 static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
337 341 bool user)
338 342 {
339   - trace_rcu_dyntick("Start", oldval, 0);
  343 + trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
340 344 if (!user && !is_idle_task(current)) {
341 345 struct task_struct *idle = idle_task(smp_processor_id());
342 346  
... ... @@ -727,7 +731,7 @@
727 731 * interrupt from idle, return true. The caller must have at least
728 732 * disabled preemption.
729 733 */
730   -int rcu_is_cpu_rrupt_from_idle(void)
  734 +static int rcu_is_cpu_rrupt_from_idle(void)
731 735 {
732 736 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
733 737 }
734 738  
... ... @@ -793,28 +797,10 @@
793 797 return 0;
794 798 }
795 799  
796   -static int jiffies_till_stall_check(void)
797   -{
798   - int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
799   -
800   - /*
801   - * Limit check must be consistent with the Kconfig limits
802   - * for CONFIG_RCU_CPU_STALL_TIMEOUT.
803   - */
804   - if (till_stall_check < 3) {
805   - ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
806   - till_stall_check = 3;
807   - } else if (till_stall_check > 300) {
808   - ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
809   - till_stall_check = 300;
810   - }
811   - return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
812   -}
813   -
814 800 static void record_gp_stall_check_time(struct rcu_state *rsp)
815 801 {
816 802 rsp->gp_start = jiffies;
817   - rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
  803 + rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
818 804 }
819 805  
820 806 /*
... ... @@ -857,7 +843,7 @@
857 843 raw_spin_unlock_irqrestore(&rnp->lock, flags);
858 844 return;
859 845 }
860   - rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;
  846 + rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
861 847 raw_spin_unlock_irqrestore(&rnp->lock, flags);
862 848  
863 849 /*
... ... @@ -935,7 +921,7 @@
935 921 raw_spin_lock_irqsave(&rnp->lock, flags);
936 922 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
937 923 rsp->jiffies_stall = jiffies +
938   - 3 * jiffies_till_stall_check() + 3;
  924 + 3 * rcu_jiffies_till_stall_check() + 3;
939 925 raw_spin_unlock_irqrestore(&rnp->lock, flags);
940 926  
941 927 set_need_resched(); /* kick ourselves to get things going. */
... ... @@ -966,12 +952,6 @@
966 952 }
967 953 }
968 954  
969   -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
970   -{
971   - rcu_cpu_stall_suppress = 1;
972   - return NOTIFY_DONE;
973   -}
974   -
975 955 /**
976 956 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
977 957 *
... ... @@ -989,15 +969,6 @@
989 969 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
990 970 }
991 971  
992   -static struct notifier_block rcu_panic_block = {
993   - .notifier_call = rcu_panic,
994   -};
995   -
996   -static void __init check_cpu_stall_init(void)
997   -{
998   - atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
999   -}
1000   -
1001 972 /*
1002 973 * Update CPU-local rcu_data state to record the newly noticed grace period.
1003 974 * This is used both when we started the grace period and when we notice
... ... @@ -1071,6 +1042,145 @@
1071 1042 }
1072 1043  
1073 1044 /*
  1045 + * Determine the value that ->completed will have at the end of the
  1046 + * next subsequent grace period. This is used to tag callbacks so that
  1047 + * a CPU can invoke callbacks in a timely fashion even if that CPU has
  1048 + * been dyntick-idle for an extended period with callbacks under the
  1049 + * influence of RCU_FAST_NO_HZ.
  1050 + *
  1051 + * The caller must hold rnp->lock with interrupts disabled.
  1052 + */
  1053 +static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
  1054 + struct rcu_node *rnp)
  1055 +{
  1056 + /*
  1057 + * If RCU is idle, we just wait for the next grace period.
  1058 + * But we can only be sure that RCU is idle if we are looking
  1059 + * at the root rcu_node structure -- otherwise, a new grace
  1060 + * period might have started, but just not yet gotten around
  1061 + * to initializing the current non-root rcu_node structure.
  1062 + */
  1063 + if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
  1064 + return rnp->completed + 1;
  1065 +
  1066 + /*
  1067 + * Otherwise, wait for a possible partial grace period and
  1068 + * then the subsequent full grace period.
  1069 + */
  1070 + return rnp->completed + 2;
  1071 +}
  1072 +
  1073 +/*
  1074 + * If there is room, assign a ->completed number to any callbacks on
  1075 + * this CPU that have not already been assigned. Also accelerate any
  1076 + * callbacks that were previously assigned a ->completed number that has
  1077 + * since proven to be too conservative, which can happen if callbacks get
  1078 + * assigned a ->completed number while RCU is idle, but with reference to
  1079 + * a non-root rcu_node structure. This function is idempotent, so it does
  1080 + * not hurt to call it repeatedly.
  1081 + *
  1082 + * The caller must hold rnp->lock with interrupts disabled.
  1083 + */
  1084 +static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
  1085 + struct rcu_data *rdp)
  1086 +{
  1087 + unsigned long c;
  1088 + int i;
  1089 +
  1090 + /* If the CPU has no callbacks, nothing to do. */
  1091 + if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
  1092 + return;
  1093 +
  1094 + /*
  1095 + * Starting from the sublist containing the callbacks most
  1096 + * recently assigned a ->completed number and working down, find the
  1097 + * first sublist that is not assignable to an upcoming grace period.
  1098 + * Such a sublist has something in it (first two tests) and has
  1099 + * a ->completed number assigned that will complete sooner than
  1100 + * the ->completed number for newly arrived callbacks (last test).
  1101 + *
  1102 + * The key point is that any later sublist can be assigned the
  1103 + * same ->completed number as the newly arrived callbacks, which
  1104 + * means that the callbacks in any of these later sublist can be
  1105 + * grouped into a single sublist, whether or not they have already
  1106 + * been assigned a ->completed number.
  1107 + */
  1108 + c = rcu_cbs_completed(rsp, rnp);
  1109 + for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
  1110 + if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
  1111 + !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
  1112 + break;
  1113 +
  1114 + /*
  1115 + * If there are no sublist for unassigned callbacks, leave.
  1116 + * At the same time, advance "i" one sublist, so that "i" will
  1117 + * index into the sublist where all the remaining callbacks should
  1118 + * be grouped into.
  1119 + */
  1120 + if (++i >= RCU_NEXT_TAIL)
  1121 + return;
  1122 +
  1123 + /*
  1124 + * Assign all subsequent callbacks' ->completed number to the next
  1125 + * full grace period and group them all in the sublist initially
  1126 + * indexed by "i".
  1127 + */
  1128 + for (; i <= RCU_NEXT_TAIL; i++) {
  1129 + rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
  1130 + rdp->nxtcompleted[i] = c;
  1131 + }
  1132 +
  1133 + /* Trace depending on how much we were able to accelerate. */
  1134 + if (!*rdp->nxttail[RCU_WAIT_TAIL])
  1135 + trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB");
  1136 + else
  1137 + trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB");
  1138 +}
  1139 +
  1140 +/*
  1141 + * Move any callbacks whose grace period has completed to the
  1142 + * RCU_DONE_TAIL sublist, then compact the remaining sublists and
  1143 + * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
  1144 + * sublist. This function is idempotent, so it does not hurt to
  1145 + * invoke it repeatedly. As long as it is not invoked -too- often...
  1146 + *
  1147 + * The caller must hold rnp->lock with interrupts disabled.
  1148 + */
  1149 +static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
  1150 + struct rcu_data *rdp)
  1151 +{
  1152 + int i, j;
  1153 +
  1154 + /* If the CPU has no callbacks, nothing to do. */
  1155 + if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
  1156 + return;
  1157 +
  1158 + /*
  1159 + * Find all callbacks whose ->completed numbers indicate that they
  1160 + * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
  1161 + */
  1162 + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
  1163 + if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
  1164 + break;
  1165 + rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
  1166 + }
  1167 + /* Clean up any sublist tail pointers that were misordered above. */
  1168 + for (j = RCU_WAIT_TAIL; j < i; j++)
  1169 + rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
  1170 +
  1171 + /* Copy down callbacks to fill in empty sublists. */
  1172 + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
  1173 + if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
  1174 + break;
  1175 + rdp->nxttail[j] = rdp->nxttail[i];
  1176 + rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
  1177 + }
  1178 +
  1179 + /* Classify any remaining callbacks. */
  1180 + rcu_accelerate_cbs(rsp, rnp, rdp);
  1181 +}
  1182 +
  1183 +/*
1074 1184 * Advance this CPU's callbacks, but only if the current grace period
1075 1185 * has ended. This may be called only from the CPU to whom the rdp
1076 1186 * belongs. In addition, the corresponding leaf rcu_node structure's
1077 1187  
1078 1188  
... ... @@ -1080,13 +1190,16 @@
1080 1190 __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1081 1191 {
1082 1192 /* Did another grace period end? */
1083   - if (rdp->completed != rnp->completed) {
  1193 + if (rdp->completed == rnp->completed) {
1084 1194  
1085   - /* Advance callbacks. No harm if list empty. */
1086   - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
1087   - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
1088   - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
  1195 + /* No, so just accelerate recent callbacks. */
  1196 + rcu_accelerate_cbs(rsp, rnp, rdp);
1089 1197  
  1198 + } else {
  1199 +
  1200 + /* Advance callbacks. */
  1201 + rcu_advance_cbs(rsp, rnp, rdp);
  1202 +
1090 1203 /* Remember that we saw this grace-period completion. */
1091 1204 rdp->completed = rnp->completed;
1092 1205 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
1093 1206  
... ... @@ -1392,17 +1505,10 @@
1392 1505 /*
1393 1506 * Because there is no grace period in progress right now,
1394 1507 * any callbacks we have up to this point will be satisfied
1395   - * by the next grace period. So promote all callbacks to be
1396   - * handled after the end of the next grace period. If the
1397   - * CPU is not yet aware of the end of the previous grace period,
1398   - * we need to allow for the callback advancement that will
1399   - * occur when it does become aware. Deadlock prevents us from
1400   - * making it aware at this point: We cannot acquire a leaf
1401   - * rcu_node ->lock while holding the root rcu_node ->lock.
  1508 + * by the next grace period. So this is a good place to
  1509 + * assign a grace period number to recently posted callbacks.
1402 1510 */
1403   - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1404   - if (rdp->completed == rsp->completed)
1405   - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
  1511 + rcu_accelerate_cbs(rsp, rnp, rdp);
1406 1512  
1407 1513 rsp->gp_flags = RCU_GP_FLAG_INIT;
1408 1514 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
... ... @@ -1527,7 +1633,7 @@
1527 1633 * This GP can't end until cpu checks in, so all of our
1528 1634 * callbacks can be processed during the next GP.
1529 1635 */
1530   - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
  1636 + rcu_accelerate_cbs(rsp, rnp, rdp);
1531 1637  
1532 1638 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
1533 1639 }
... ... @@ -1779,7 +1885,7 @@
1779 1885 long bl, count, count_lazy;
1780 1886 int i;
1781 1887  
1782   - /* If no callbacks are ready, just return.*/
  1888 + /* If no callbacks are ready, just return. */
1783 1889 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1784 1890 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
1785 1891 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1786 1892  
1787 1893  
1788 1894  
... ... @@ -2008,19 +2114,19 @@
2008 2114  
2009 2115 WARN_ON_ONCE(rdp->beenonline == 0);
2010 2116  
2011   - /*
2012   - * Advance callbacks in response to end of earlier grace
2013   - * period that some other CPU ended.
2014   - */
  2117 + /* Handle the end of a grace period that some other CPU ended. */
2015 2118 rcu_process_gp_end(rsp, rdp);
2016 2119  
2017 2120 /* Update RCU state based on any recent quiescent states. */
2018 2121 rcu_check_quiescent_state(rsp, rdp);
2019 2122  
2020 2123 /* Does this CPU require a not-yet-started grace period? */
  2124 + local_irq_save(flags);
2021 2125 if (cpu_needs_another_gp(rsp, rdp)) {
2022   - raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
  2126 + raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2023 2127 rcu_start_gp(rsp, flags); /* releases above lock */
  2128 + } else {
  2129 + local_irq_restore(flags);
2024 2130 }
2025 2131  
2026 2132 /* If there are callbacks ready, invoke them. */
... ... @@ -2719,9 +2825,6 @@
2719 2825 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2720 2826 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2721 2827 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2722   -#ifdef CONFIG_RCU_USER_QS
2723   - WARN_ON_ONCE(rdp->dynticks->in_user);
2724   -#endif
2725 2828 rdp->cpu = cpu;
2726 2829 rdp->rsp = rsp;
2727 2830 rcu_boot_init_nocb_percpu_data(rdp);
... ... @@ -2938,6 +3041,10 @@
2938 3041  
2939 3042 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
2940 3043  
  3044 + /* Silence gcc 4.8 warning about array index out of range. */
  3045 + if (rcu_num_lvls > RCU_NUM_LVLS)
  3046 + panic("rcu_init_one: rcu_num_lvls overflow");
  3047 +
2941 3048 /* Initialize the level-tracking arrays. */
2942 3049  
2943 3050 for (i = 0; i < rcu_num_lvls; i++)
... ... @@ -3074,7 +3181,6 @@
3074 3181 cpu_notifier(rcu_cpu_notify, 0);
3075 3182 for_each_online_cpu(cpu)
3076 3183 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3077   - check_cpu_stall_init();
3078 3184 }
3079 3185  
3080 3186 #include "rcutree_plugin.h"
kernel/rcutree.h
... ... @@ -102,10 +102,6 @@
102 102 /* idle-period nonlazy_posted snapshot. */
103 103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104 104 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105   -#ifdef CONFIG_RCU_USER_QS
106   - bool ignore_user_qs; /* Treat userspace as extended QS or not */
107   - bool in_user; /* Is the CPU in userland from RCU POV? */
108   -#endif
109 105 };
110 106  
111 107 /* RCU's kthread states for tracing. */
... ... @@ -282,6 +278,8 @@
282 278 */
283 279 struct rcu_head *nxtlist;
284 280 struct rcu_head **nxttail[RCU_NEXT_SIZE];
  281 + unsigned long nxtcompleted[RCU_NEXT_SIZE];
  282 + /* grace periods for sublists. */
285 283 long qlen_lazy; /* # of lazy queued callbacks */
286 284 long qlen; /* # of queued callbacks, incl lazy */
287 285 long qlen_last_fqs_check;
... ... @@ -343,11 +341,6 @@
343 341  
344 342 #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
345 343  
346   -#ifdef CONFIG_PROVE_RCU
347   -#define RCU_STALL_DELAY_DELTA (5 * HZ)
348   -#else
349   -#define RCU_STALL_DELAY_DELTA 0
350   -#endif
351 344 #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
352 345 /* to take at least one */
353 346 /* scheduling clock irq */
lib/Kconfig.debug
... ... @@ -605,61 +605,6 @@
605 605  
606 606 For more details, see Documentation/lockdep-design.txt.
607 607  
608   -config PROVE_RCU
609   - bool "RCU debugging: prove RCU correctness"
610   - depends on PROVE_LOCKING
611   - default n
612   - help
613   - This feature enables lockdep extensions that check for correct
614   - use of RCU APIs. This is currently under development. Say Y
615   - if you want to debug RCU usage or help work on the PROVE_RCU
616   - feature.
617   -
618   - Say N if you are unsure.
619   -
620   -config PROVE_RCU_REPEATEDLY
621   - bool "RCU debugging: don't disable PROVE_RCU on first splat"
622   - depends on PROVE_RCU
623   - default n
624   - help
625   - By itself, PROVE_RCU will disable checking upon issuing the
626   - first warning (or "splat"). This feature prevents such
627   - disabling, allowing multiple RCU-lockdep warnings to be printed
628   - on a single reboot.
629   -
630   - Say Y to allow multiple RCU-lockdep warnings per boot.
631   -
632   - Say N if you are unsure.
633   -
634   -config PROVE_RCU_DELAY
635   - bool "RCU debugging: preemptible RCU race provocation"
636   - depends on DEBUG_KERNEL && PREEMPT_RCU
637   - default n
638   - help
639   - There is a class of races that involve an unlikely preemption
640   - of __rcu_read_unlock() just after ->rcu_read_lock_nesting has
641   - been set to INT_MIN. This feature inserts a delay at that
642   - point to increase the probability of these races.
643   -
644   - Say Y to increase probability of preemption of __rcu_read_unlock().
645   -
646   - Say N if you are unsure.
647   -
648   -config SPARSE_RCU_POINTER
649   - bool "RCU debugging: sparse-based checks for pointer usage"
650   - default n
651   - help
652   - This feature enables the __rcu sparse annotation for
653   - RCU-protected pointers. This annotation will cause sparse
654   - to flag any non-RCU used of annotated pointers. This can be
655   - helpful when debugging RCU usage. Please note that this feature
656   - is not intended to enforce code cleanliness; it is instead merely
657   - a debugging aid.
658   -
659   - Say Y to make sparse flag questionable use of RCU-protected pointers
660   -
661   - Say N if you are unsure.
662   -
663 608 config LOCKDEP
664 609 bool
665 610 depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
... ... @@ -937,6 +882,63 @@
937 882 BOOT_PRINTK_DELAY also may cause LOCKUP_DETECTOR to detect
938 883 what it believes to be lockup conditions.
939 884  
  885 +menu "RCU Debugging"
  886 +
  887 +config PROVE_RCU
  888 + bool "RCU debugging: prove RCU correctness"
  889 + depends on PROVE_LOCKING
  890 + default n
  891 + help
  892 + This feature enables lockdep extensions that check for correct
  893 + use of RCU APIs. This is currently under development. Say Y
  894 + if you want to debug RCU usage or help work on the PROVE_RCU
  895 + feature.
  896 +
  897 + Say N if you are unsure.
  898 +
  899 +config PROVE_RCU_REPEATEDLY
  900 + bool "RCU debugging: don't disable PROVE_RCU on first splat"
  901 + depends on PROVE_RCU
  902 + default n
  903 + help
  904 + By itself, PROVE_RCU will disable checking upon issuing the
  905 + first warning (or "splat"). This feature prevents such
  906 + disabling, allowing multiple RCU-lockdep warnings to be printed
  907 + on a single reboot.
  908 +
  909 + Say Y to allow multiple RCU-lockdep warnings per boot.
  910 +
  911 + Say N if you are unsure.
  912 +
  913 +config PROVE_RCU_DELAY
  914 + bool "RCU debugging: preemptible RCU race provocation"
  915 + depends on DEBUG_KERNEL && PREEMPT_RCU
  916 + default n
  917 + help
  918 + There is a class of races that involve an unlikely preemption
  919 + of __rcu_read_unlock() just after ->rcu_read_lock_nesting has
  920 + been set to INT_MIN. This feature inserts a delay at that
  921 + point to increase the probability of these races.
  922 +
  923 + Say Y to increase probability of preemption of __rcu_read_unlock().
  924 +
  925 + Say N if you are unsure.
  926 +
  927 +config SPARSE_RCU_POINTER
  928 + bool "RCU debugging: sparse-based checks for pointer usage"
  929 + default n
  930 + help
  931 + This feature enables the __rcu sparse annotation for
  932 + RCU-protected pointers. This annotation will cause sparse
  933 + to flag any non-RCU used of annotated pointers. This can be
  934 + helpful when debugging RCU usage. Please note that this feature
  935 + is not intended to enforce code cleanliness; it is instead merely
  936 + a debugging aid.
  937 +
  938 + Say Y to make sparse flag questionable use of RCU-protected pointers
  939 +
  940 + Say N if you are unsure.
  941 +
940 942 config RCU_TORTURE_TEST
941 943 tristate "torture tests for RCU"
942 944 depends on DEBUG_KERNEL
... ... @@ -970,7 +972,7 @@
970 972  
971 973 config RCU_CPU_STALL_TIMEOUT
972 974 int "RCU CPU stall timeout in seconds"
973   - depends on TREE_RCU || TREE_PREEMPT_RCU
  975 + depends on RCU_STALL_COMMON
974 976 range 3 300
975 977 default 21
976 978 help
... ... @@ -1015,6 +1017,8 @@
1015 1017  
1016 1018 Say Y here if you want to enable RCU tracing
1017 1019 Say N if you are unsure.
  1020 +
  1021 +endmenu # "RCU Debugging"
1018 1022  
1019 1023 config KPROBES_SANITY_TEST
1020 1024 bool "Kprobes sanity tests"