Commit 6d87669357936bffa1e8fea7a4e7743e76905736

Authored by Paul E. McKenney

Merge branches 'doc.2013.03.12a', 'fixes.2013.03.13a' and 'idlenocb.2013.03.26b' into HEAD

doc.2013.03.12a: Documentation changes.

fixes.2013.03.13a: Miscellaneous fixes.

idlenocb.2013.03.26b: Remove restrictions on no-CBs CPUs, make
	RCU_FAST_NO_HZ take advantage of numbered callbacks, add
	callback acceleration based on numbered callbacks.

Showing 11 changed files

Documentation/RCU/stallwarn.txt
... ... @@ -92,14 +92,14 @@
92 92 more information is printed with the stall-warning message, for example:
93 93  
94 94 INFO: rcu_preempt detected stall on CPU
95   - 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0
  95 + 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 softirq=82/543
96 96 (t=65000 jiffies)
97 97  
98 98 In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is
99 99 printed:
100 100  
101 101 INFO: rcu_preempt detected stall on CPU
102   - 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 drain=0 . timer not pending
  102 + 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D
103 103 (t=65000 jiffies)
104 104  
105 105 The "(64628 ticks this GP)" indicates that this CPU has taken more
... ... @@ -116,13 +116,28 @@
116 116 be a small positive number if in the idle loop and a very large positive
117 117 number (as shown above) otherwise.
118 118  
119   -For CONFIG_RCU_FAST_NO_HZ kernels, the "drain=0" indicates that the CPU is
120   -not in the process of trying to force itself into dyntick-idle state, the
121   -"." indicates that the CPU has not given up forcing RCU into dyntick-idle
122   -mode (it would be "H" otherwise), and the "timer not pending" indicates
123   -that the CPU has not recently forced RCU into dyntick-idle mode (it
124   -would otherwise indicate the number of microseconds remaining in this
125   -forced state).
  119 +The "softirq=" portion of the message tracks the number of RCU softirq
  120 +handlers that the stalled CPU has executed. The number before the "/"
  121 +is the number that had executed since boot at the time that this CPU
  122 +last noted the beginning of a grace period, which might be the current
  123 +(stalled) grace period, or it might be some earlier grace period (for
  124 +example, if the CPU has been in dyntick-idle mode for an extended
  125 +time period). The number after the "/" is the number that have executed
  126 +since boot until the current time. If this latter number stays constant
  127 +across repeated stall-warning messages, it is possible that RCU's softirq
  128 +handlers are no longer able to execute on this CPU. This can happen if
  129 +the stalled CPU is spinning with interrupts disabled, or, in -rt
  130 +kernels, if a high-priority process is starving RCU's softirq handler.
  131 +
  132 +For CONFIG_RCU_FAST_NO_HZ kernels, the "last_accelerate:" prints the
  133 +low-order 16 bits (in hex) of the jiffies counter when this CPU last
  134 +invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked
  135 +rcu_accelerate_cbs() from rcu_prepare_for_idle(). The "nonlazy_posted:"
  136 +prints the number of non-lazy callbacks posted since the last call to
  137 +rcu_needs_cpu(). Finally, an "L" indicates that there are currently
  138 +no non-lazy callbacks ("." is printed otherwise, as shown above) and
  139 +"D" indicates that dyntick-idle processing is enabled ("." is printed
  140 +otherwise, for example, if disabled via the "nohz=" kernel boot parameter).
126 141  
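A worked reading of the CONFIG_RCU_FAST_NO_HZ example above, using only the
sample values shown (the arithmetic is illustrative): "softirq=82/543" says
that 82 RCU softirq handlers had executed on CPU 0 when it last noticed a
grace period starting and that 543 have executed since boot, so 543 - 82 = 461
handlers ran after that point; "last_accelerate: a345/d342" gives the low-order
16 bits of the jiffies counter at the last callback-acceleration attempt
(0xa345) and at the time the message was printed (0xd342), roughly
0xd342 - 0xa345 = 12285 jiffies apart; and "nonlazy_posted: 25" says that 25
non-lazy callbacks arrived since the last call to rcu_needs_cpu().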
127 142  
128 143 Multiple Warnings From One Stall
Documentation/kernel-parameters.txt
... ... @@ -2461,9 +2461,12 @@
2461 2461 In kernels built with CONFIG_RCU_NOCB_CPU=y, set
2462 2462 the specified list of CPUs to be no-callback CPUs.
2463 2463 Invocation of these CPUs' RCU callbacks will
2464   - be offloaded to "rcuoN" kthreads created for
2465   - that purpose. This reduces OS jitter on the
  2464 + be offloaded to "rcuox/N" kthreads created for
  2465 + that purpose, where "x" is "b" for RCU-bh, "p"
  2466 + for RCU-preempt, and "s" for RCU-sched, and "N"
  2467 + is the CPU number. This reduces OS jitter on the
2466 2468 offloaded CPUs, which can be useful for HPC and
  2469 +
2467 2470 real-time workloads. It can also improve energy
2468 2471 efficiency for asymmetric multiprocessors.
2469 2472  
... ... @@ -2487,6 +2490,17 @@
2487 2490 leaf rcu_node structure. Useful for very large
2488 2491 systems.
2489 2492  
  2493 + rcutree.jiffies_till_first_fqs= [KNL,BOOT]
  2494 + Set delay from grace-period initialization to
  2495 + first attempt to force quiescent states.
  2496 + Units are jiffies, minimum value is zero,
  2497 + and maximum value is HZ.
  2498 +
  2499 + rcutree.jiffies_till_next_fqs= [KNL,BOOT]
  2500 + Set delay between subsequent attempts to force
  2501 + quiescent states. Units are jiffies, minimum
  2502 + value is one, and maximum value is HZ.
  2503 +
2490 2504 rcutree.qhimark= [KNL,BOOT]
2491 2505 Set threshold of queued
2492 2506 RCU callbacks over which batch limiting is disabled.
2493 2507  
... ... @@ -2501,16 +2515,15 @@
2501 2515 rcutree.rcu_cpu_stall_timeout= [KNL,BOOT]
2502 2516 Set timeout for RCU CPU stall warning messages.
2503 2517  
2504   - rcutree.jiffies_till_first_fqs= [KNL,BOOT]
2505   - Set delay from grace-period initialization to
2506   - first attempt to force quiescent states.
2507   - Units are jiffies, minimum value is zero,
2508   - and maximum value is HZ.
  2518 + rcutree.rcu_idle_gp_delay= [KNL,BOOT]
  2519 + Set wakeup interval for idle CPUs that have
  2520 + RCU callbacks (RCU_FAST_NO_HZ=y).
2509 2521  
2510   - rcutree.jiffies_till_next_fqs= [KNL,BOOT]
2511   - Set delay between subsequent attempts to force
2512   - quiescent states. Units are jiffies, minimum
2513   - value is one, and maximum value is HZ.
  2522 + rcutree.rcu_idle_lazy_gp_delay= [KNL,BOOT]
  2523 + Set wakeup interval for idle CPUs that have
  2524 + only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y).
  2525 + Lazy RCU callbacks are those which RCU can
  2526 + prove do nothing more than free memory.
2514 2527  
2515 2528 rcutorture.fqs_duration= [KNL,BOOT]
2516 2529 Set duration of force_quiescent_state bursts.
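For illustration only (the CPU list and delay values are arbitrary, and the
six-second figure assumes HZ=1000), a boot command line combining the
parameters above might look like:

    rcu_nocbs=2-7 rcutree.rcu_idle_gp_delay=4 rcutree.rcu_idle_lazy_gp_delay=6000

This offloads callback invocation for CPUs 2-7 to "rcuox/N" kthreads and keeps
the default four-jiffy and roughly six-second wakeup intervals for idle CPUs
with non-lazy and only-lazy callbacks, respectively.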
include/linux/list_bl.h
... ... @@ -125,6 +125,11 @@
125 125 __bit_spin_unlock(0, (unsigned long *)b);
126 126 }
127 127  
  128 +static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
  129 +{
  130 + return bit_spin_is_locked(0, (unsigned long *)b);
  131 +}
  132 +
128 133 /**
129 134 * hlist_bl_for_each_entry - iterate over list of given type
130 135 * @tpos: the type * to use as a loop cursor.
include/linux/rculist_bl.h
... ... @@ -20,7 +20,7 @@
20 20 static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
21 21 {
22 22 return (struct hlist_bl_node *)
23   - ((unsigned long)rcu_dereference(h->first) & ~LIST_BL_LOCKMASK);
  23 + ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK);
24 24 }
25 25  
26 26 /**
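To see the new lockdep expression in context, here is a minimal sketch of a
bit-locked, RCU-traversed hash bucket; the my_entry type and both functions
are hypothetical and not part of this commit. Readers traverse under
rcu_read_lock(), while writers hold the bucket's bit lock, which is exactly
the condition that hlist_bl_is_locked() now feeds to rcu_dereference_check():

    #include <linux/rculist_bl.h>
    #include <linux/rcupdate.h>

    struct my_entry {                       /* hypothetical element type */
            struct hlist_bl_node node;
            int key;
    };

    /* Reader: may run concurrently with writers holding the bucket bit lock. */
    static bool my_contains(struct hlist_bl_head *head, int key)
    {
            struct my_entry *e;
            struct hlist_bl_node *pos;
            bool found = false;

            rcu_read_lock();
            hlist_bl_for_each_entry_rcu(e, pos, head, node)
                    if (e->key == key) {
                            found = true;
                            break;
                    }
            rcu_read_unlock();
            return found;
    }

    /* Writer: bit 0 of the head pointer serializes updates, and holding it
     * also satisfies the hlist_bl_is_locked() check added above. */
    static void my_insert(struct hlist_bl_head *head, struct my_entry *e)
    {
            hlist_bl_lock(head);
            hlist_bl_add_head_rcu(&e->node, head);
            hlist_bl_unlock(head);
    }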
include/linux/rcupdate.h
... ... @@ -80,6 +80,7 @@
80 80 #define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
81 81 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
82 82 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
  83 +#define ulong2long(a) (*(long *)(&(a)))
83 84  
84 85 /* Exported common interfaces */
85 86  
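A minimal sketch of how ulong2long() is typically used (the
gp_snapshot_delta() helper is hypothetical): grace-period counters are
unsigned and may wrap, so the unsigned difference is reinterpreted as a
signed long, giving a small negative value when the snapshot is ahead of the
current counter:

    #include <linux/rcupdate.h>

    /* Hypothetical helper: signed distance from a snapshot to the current
     * counter value, meaningful even after the unsigned counter wraps. */
    static inline long gp_snapshot_delta(unsigned long cur, unsigned long snap)
    {
            unsigned long delta = cur - snap;

            return ulong2long(delta);       /* negative if snap is ahead of cur */
    }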
include/trace/events/rcu.h
... ... @@ -72,6 +72,58 @@
72 72 );
73 73  
74 74 /*
  75 + * Tracepoint for future grace-period events, including those for no-callbacks
  76 + * CPUs. The caller should pull the data from the rcu_node structure,
  77 + * other than rcuname, which comes from the rcu_state structure, and event,
  78 + * which is one of the following:
  79 + *
  80 + * "Startleaf": Request a nocb grace period based on leaf-node data.
  81 + * "Startedleaf": Leaf-node start proved sufficient.
  82 + * "Startedleafroot": Leaf-node start proved sufficient after checking root.
  83 + * "Startedroot": Requested a nocb grace period based on root-node data.
  84 + * "StartWait": Start waiting for the requested grace period.
  85 + * "ResumeWait": Resume waiting after signal.
  86 + * "EndWait": Complete wait.
  87 + * "Cleanup": Clean up rcu_node structure after previous GP.
  88 + * "CleanupMore": Clean up, and another no-CB GP is needed.
  89 + */
  90 +TRACE_EVENT(rcu_future_grace_period,
  91 +
  92 + TP_PROTO(char *rcuname, unsigned long gpnum, unsigned long completed,
  93 + unsigned long c, u8 level, int grplo, int grphi,
  94 + char *gpevent),
  95 +
  96 + TP_ARGS(rcuname, gpnum, completed, c, level, grplo, grphi, gpevent),
  97 +
  98 + TP_STRUCT__entry(
  99 + __field(char *, rcuname)
  100 + __field(unsigned long, gpnum)
  101 + __field(unsigned long, completed)
  102 + __field(unsigned long, c)
  103 + __field(u8, level)
  104 + __field(int, grplo)
  105 + __field(int, grphi)
  106 + __field(char *, gpevent)
  107 + ),
  108 +
  109 + TP_fast_assign(
  110 + __entry->rcuname = rcuname;
  111 + __entry->gpnum = gpnum;
  112 + __entry->completed = completed;
  113 + __entry->c = c;
  114 + __entry->level = level;
  115 + __entry->grplo = grplo;
  116 + __entry->grphi = grphi;
  117 + __entry->gpevent = gpevent;
  118 + ),
  119 +
  120 + TP_printk("%s %lu %lu %lu %u %d %d %s",
  121 + __entry->rcuname, __entry->gpnum, __entry->completed,
  122 + __entry->c, __entry->level, __entry->grplo, __entry->grphi,
  123 + __entry->gpevent)
  124 +);
  125 +
  126 +/*
75 127 * Tracepoint for grace-period-initialization events. These are
76 128 * distinguished by the type of RCU, the new grace-period number, the
77 129 * rcu_node structure level, the starting and ending CPU covered by the
... ... @@ -601,6 +653,9 @@
601 653 #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0)
602 654 #define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \
603 655 qsmask) do { } while (0)
  656 +#define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \
  657 + level, grplo, grphi, event) \
  658 + do { } while (0)
604 659 #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0)
605 660 #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0)
606 661 #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \
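Based on the TP_printk() format string above ("%s %lu %lu %lu %u %d %d %s"),
a rendered rcu_future_grace_period event shows the flavor name, the rcu_node
structure's gpnum and completed values, the requested grace-period number c,
the node's level, the CPU range it covers, and the event string. With invented
values, one line of trace output might look like:

    rcu_preempt 21 20 21 0 0 15 Startleaf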
init/Kconfig
... ... @@ -582,13 +582,16 @@
582 582 depends on NO_HZ && SMP
583 583 default n
584 584 help
585   - This option causes RCU to attempt to accelerate grace periods in
586   - order to allow CPUs to enter dynticks-idle state more quickly.
587   - On the other hand, this option increases the overhead of the
588   - dynticks-idle checking, thus degrading scheduling latency.
  585 + This option permits CPUs to enter dynticks-idle state even if
  586 + they have RCU callbacks queued, and prevents RCU from waking
  587 + these CPUs up more than roughly once every four jiffies (by
  588 + default, you can adjust this using the rcutree.rcu_idle_gp_delay
  589 + parameter), thus improving energy efficiency. On the other
  590 + hand, this option increases the duration of RCU grace periods,
  591 + for example, slowing down synchronize_rcu().
589 592  
590   - Say Y if energy efficiency is critically important, and you don't
591   - care about real-time response.
  593 + Say Y if energy efficiency is critically important, and you
  594 + don't care about increased grace-period durations.
592 595  
593 596 Say N if you are unsure.
594 597  
... ... @@ -655,7 +658,7 @@
655 658 Accept the default if unsure.
656 659  
657 660 config RCU_NOCB_CPU
658   - bool "Offload RCU callback processing from boot-selected CPUs"
  661 + bool "Offload RCU callback processing from boot-selected CPUs (EXPERIMENTAL)"
659 662 depends on TREE_RCU || TREE_PREEMPT_RCU
660 663 default n
661 664 help
662 665  
663 666  
... ... @@ -666,15 +669,55 @@
666 669  
667 670 This option offloads callback invocation from the set of
668 671 CPUs specified at boot time by the rcu_nocbs parameter.
669   - For each such CPU, a kthread ("rcuoN") will be created to
670   - invoke callbacks, where the "N" is the CPU being offloaded.
671   - Nothing prevents this kthread from running on the specified
672   - CPUs, but (1) the kthreads may be preempted between each
673   - callback, and (2) affinity or cgroups can be used to force
674   - the kthreads to run on whatever set of CPUs is desired.
  672 + For each such CPU, a kthread ("rcuox/N") will be created to
  673 + invoke callbacks, where the "N" is the CPU being offloaded,
  674 + and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
  675 + "s" for RCU-sched. Nothing prevents this kthread from running
  676 + on the specified CPUs, but (1) the kthreads may be preempted
  677 + between each callback, and (2) affinity or cgroups can be used
  678 + to force the kthreads to run on whatever set of CPUs is desired.
675 679  
676   - Say Y here if you want reduced OS jitter on selected CPUs.
  680 + Say Y here if you want to help debug reduced OS jitter.
677 681 Say N here if you are unsure.
  682 +
  683 +choice
  684 + prompt "Build-forced no-CBs CPUs"
  685 + default RCU_NOCB_CPU_NONE
  686 + help
  687 + This option allows no-CBs CPUs to be specified at build time.
  688 + Additional no-CBs CPUs may be specified by the rcu_nocbs=
  689 + boot parameter.
  690 +
  691 +config RCU_NOCB_CPU_NONE
  692 + bool "No build-forced no-CBs CPUs"
  693 + depends on RCU_NOCB_CPU
  694 + help
  695 + This option does not force any of the CPUs to be no-CBs CPUs.
  696 + Only CPUs designated by the rcu_nocbs= boot parameter will be
  697 + no-CBs CPUs.
  698 +
  699 +config RCU_NOCB_CPU_ZERO
  700 + bool "CPU 0 is a build-forced no-CBs CPU"
  701 + depends on RCU_NOCB_CPU
  702 + help
  703 + This option forces CPU 0 to be a no-CBs CPU. Additional CPUs
  704 + may be designated as no-CBs CPUs using the rcu_nocbs= boot
  705 + parameter.
  706 +
  707 + Select this if CPU 0 needs to be a no-CBs CPU for real-time
  708 + or energy-efficiency reasons.
  709 +
  710 +config RCU_NOCB_CPU_ALL
  711 + bool "All CPUs are build-forced no-CBs CPUs"
  712 + depends on RCU_NOCB_CPU
  713 + help
  714 + This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs=
  715 + boot parameter will be ignored.
  716 +
  717 + Select this if all CPUs need to be no-CBs CPUs for real-time
  718 + or energy-efficiency reasons.
  719 +
  720 +endchoice
678 721  
679 722 endmenu # "RCU Subsystem"
680 723  
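As a usage sketch (not part of this commit), a .config fragment that forces
every CPU to be a no-CBs CPU at build time would select the new choice as
follows, after which the rcu_nocbs= boot parameter is ignored:

    CONFIG_RCU_NOCB_CPU=y
    CONFIG_RCU_NOCB_CPU_ALL=y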
kernel/rcutree.c
... ... @@ -64,7 +64,7 @@
64 64 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65 65 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
66 66  
67   -#define RCU_STATE_INITIALIZER(sname, cr) { \
  67 +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
68 68 .level = { &sname##_state.node[0] }, \
69 69 .call = cr, \
70 70 .fqs_state = RCU_GP_IDLE, \
71 71  
72 72  
... ... @@ -76,13 +76,14 @@
76 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
78 78 .name = #sname, \
  79 + .abbr = sabbr, \
79 80 }
80 81  
81 82 struct rcu_state rcu_sched_state =
82   - RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
  83 + RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
83 84 DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
84 85  
85   -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
  86 +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
86 87 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
87 88  
88 89 static struct rcu_state *rcu_state;
... ... @@ -223,6 +224,8 @@
223 224 module_param(jiffies_till_first_fqs, ulong, 0644);
224 225 module_param(jiffies_till_next_fqs, ulong, 0644);
225 226  
  227 +static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
  228 + struct rcu_data *rdp);
226 229 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
227 230 static void force_quiescent_state(struct rcu_state *rsp);
228 231 static int rcu_pending(int cpu);
... ... @@ -310,6 +313,8 @@
310 313  
311 314 if (rcu_gp_in_progress(rsp))
312 315 return 0; /* No, a grace period is already in progress. */
  316 + if (rcu_nocb_needs_gp(rsp))
  317 + return 1; /* Yes, a no-CBs CPU needs one. */
313 318 if (!rdp->nxttail[RCU_NEXT_TAIL])
314 319 return 0; /* No, this is a no-CBs (or offline) CPU. */
315 320 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
316 321  
... ... @@ -1035,10 +1040,11 @@
1035 1040 {
1036 1041 int i;
1037 1042  
  1043 + if (init_nocb_callback_list(rdp))
  1044 + return;
1038 1045 rdp->nxtlist = NULL;
1039 1046 for (i = 0; i < RCU_NEXT_SIZE; i++)
1040 1047 rdp->nxttail[i] = &rdp->nxtlist;
1041   - init_nocb_callback_list(rdp);
1042 1048 }
1043 1049  
1044 1050 /*
... ... @@ -1071,6 +1077,120 @@
1071 1077 }
1072 1078  
1073 1079 /*
  1080 + * Trace-event helper function for rcu_start_future_gp() and
  1081 + * rcu_nocb_wait_gp().
  1082 + */
  1083 +static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
  1084 + unsigned long c, char *s)
  1085 +{
  1086 + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
  1087 + rnp->completed, c, rnp->level,
  1088 + rnp->grplo, rnp->grphi, s);
  1089 +}
  1090 +
  1091 +/*
  1092 + * Start some future grace period, as needed to handle newly arrived
  1093 + * callbacks. The required future grace periods are recorded in each
  1094 + * rcu_node structure's ->need_future_gp field.
  1095 + *
  1096 + * The caller must hold the specified rcu_node structure's ->lock.
  1097 + */
  1098 +static unsigned long __maybe_unused
  1099 +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
  1100 +{
  1101 + unsigned long c;
  1102 + int i;
  1103 + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
  1104 +
  1105 + /*
  1106 + * Pick up grace-period number for new callbacks. If this
  1107 + * grace period is already marked as needed, return to the caller.
  1108 + */
  1109 + c = rcu_cbs_completed(rdp->rsp, rnp);
  1110 + trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
  1111 + if (rnp->need_future_gp[c & 0x1]) {
  1112 + trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
  1113 + return c;
  1114 + }
  1115 +
  1116 + /*
  1117 + * If either this rcu_node structure or the root rcu_node structure
  1118 + * believe that a grace period is in progress, then we must wait
  1119 + * for the one following, which is in "c". Because our request
  1120 + * will be noticed at the end of the current grace period, we don't
  1121 + * need to explicitly start one.
  1122 + */
  1123 + if (rnp->gpnum != rnp->completed ||
  1124 + ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
  1125 + rnp->need_future_gp[c & 0x1]++;
  1126 + trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
  1127 + return c;
  1128 + }
  1129 +
  1130 + /*
  1131 + * There might be no grace period in progress. If we don't already
  1132 + * hold it, acquire the root rcu_node structure's lock in order to
  1133 + * start one (if needed).
  1134 + */
  1135 + if (rnp != rnp_root)
  1136 + raw_spin_lock(&rnp_root->lock);
  1137 +
  1138 + /*
  1139 + * Get a new grace-period number. If there really is no grace
  1140 + * period in progress, it will be smaller than the one we obtained
  1141 + * earlier. Adjust callbacks as needed. Note that even no-CBs
  1142 + * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
  1143 + */
  1144 + c = rcu_cbs_completed(rdp->rsp, rnp_root);
  1145 + for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
  1146 + if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
  1147 + rdp->nxtcompleted[i] = c;
  1148 +
  1149 + /*
  1150 + * If the need for the required grace period is already
  1151 + * recorded, trace and leave.
  1152 + */
  1153 + if (rnp_root->need_future_gp[c & 0x1]) {
  1154 + trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
  1155 + goto unlock_out;
  1156 + }
  1157 +
  1158 + /* Record the need for the future grace period. */
  1159 + rnp_root->need_future_gp[c & 0x1]++;
  1160 +
  1161 + /* If a grace period is not already in progress, start one. */
  1162 + if (rnp_root->gpnum != rnp_root->completed) {
  1163 + trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
  1164 + } else {
  1165 + trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
  1166 + rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
  1167 + }
  1168 +unlock_out:
  1169 + if (rnp != rnp_root)
  1170 + raw_spin_unlock(&rnp_root->lock);
  1171 + return c;
  1172 +}
  1173 +
  1174 +/*
  1175 + * Clean up any old requests for the just-ended grace period. Also return
  1176 + * whether any additional grace periods have been requested. Also invoke
  1177 + * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
  1178 + * waiting for this grace period to complete.
  1179 + */
  1180 +static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
  1181 +{
  1182 + int c = rnp->completed;
  1183 + int needmore;
  1184 + struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
  1185 +
  1186 + rcu_nocb_gp_cleanup(rsp, rnp);
  1187 + rnp->need_future_gp[c & 0x1] = 0;
  1188 + needmore = rnp->need_future_gp[(c + 1) & 0x1];
  1189 + trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
  1190 + return needmore;
  1191 +}
  1192 +
  1193 +/*
1074 1194 * If there is room, assign a ->completed number to any callbacks on
1075 1195 * this CPU that have not already been assigned. Also accelerate any
1076 1196 * callbacks that were previously assigned a ->completed number that has
... ... @@ -1129,6 +1249,8 @@
1129 1249 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
1130 1250 rdp->nxtcompleted[i] = c;
1131 1251 }
  1252 + /* Record any needed additional grace periods. */
  1253 + rcu_start_future_gp(rnp, rdp);
1132 1254  
1133 1255 /* Trace depending on how much we were able to accelerate. */
1134 1256 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1135 1257  
... ... @@ -1308,9 +1430,9 @@
1308 1430 rdp = this_cpu_ptr(rsp->rda);
1309 1431 rcu_preempt_check_blocked_tasks(rnp);
1310 1432 rnp->qsmask = rnp->qsmaskinit;
1311   - rnp->gpnum = rsp->gpnum;
  1433 + ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
1312 1434 WARN_ON_ONCE(rnp->completed != rsp->completed);
1313   - rnp->completed = rsp->completed;
  1435 + ACCESS_ONCE(rnp->completed) = rsp->completed;
1314 1436 if (rnp == rdp->mynode)
1315 1437 rcu_start_gp_per_cpu(rsp, rnp, rdp);
1316 1438 rcu_preempt_boost_start_gp(rnp);
... ... @@ -1319,7 +1441,8 @@
1319 1441 rnp->grphi, rnp->qsmask);
1320 1442 raw_spin_unlock_irq(&rnp->lock);
1321 1443 #ifdef CONFIG_PROVE_RCU_DELAY
1322   - if ((random32() % (rcu_num_nodes * 8)) == 0)
  1444 + if ((random32() % (rcu_num_nodes * 8)) == 0 &&
  1445 + system_state == SYSTEM_RUNNING)
1323 1446 schedule_timeout_uninterruptible(2);
1324 1447 #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1325 1448 cond_resched();
... ... @@ -1361,6 +1484,7 @@
1361 1484 static void rcu_gp_cleanup(struct rcu_state *rsp)
1362 1485 {
1363 1486 unsigned long gp_duration;
  1487 + int nocb = 0;
1364 1488 struct rcu_data *rdp;
1365 1489 struct rcu_node *rnp = rcu_get_root(rsp);
1366 1490  
1367 1491  
1368 1492  
... ... @@ -1390,17 +1514,23 @@
1390 1514 */
1391 1515 rcu_for_each_node_breadth_first(rsp, rnp) {
1392 1516 raw_spin_lock_irq(&rnp->lock);
1393   - rnp->completed = rsp->gpnum;
  1517 + ACCESS_ONCE(rnp->completed) = rsp->gpnum;
  1518 + rdp = this_cpu_ptr(rsp->rda);
  1519 + if (rnp == rdp->mynode)
  1520 + __rcu_process_gp_end(rsp, rnp, rdp);
  1521 + nocb += rcu_future_gp_cleanup(rsp, rnp);
1394 1522 raw_spin_unlock_irq(&rnp->lock);
1395 1523 cond_resched();
1396 1524 }
1397 1525 rnp = rcu_get_root(rsp);
1398 1526 raw_spin_lock_irq(&rnp->lock);
  1527 + rcu_nocb_gp_set(rnp, nocb);
1399 1528  
1400 1529 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1401 1530 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1402 1531 rsp->fqs_state = RCU_GP_IDLE;
1403 1532 rdp = this_cpu_ptr(rsp->rda);
  1533 + rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
1404 1534 if (cpu_needs_another_gp(rsp, rdp))
1405 1535 rsp->gp_flags = 1;
1406 1536 raw_spin_unlock_irq(&rnp->lock);
1407 1537  
1408 1538  
1409 1539  
1410 1540  
1411 1541  
1412 1542  
1413 1543  
1414 1544  
... ... @@ -1476,57 +1606,62 @@
1476 1606 /*
1477 1607 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1478 1608 * in preparation for detecting the next grace period. The caller must hold
1479   - * the root node's ->lock, which is released before return. Hard irqs must
1480   - * be disabled.
  1609 + * the root node's ->lock and hard irqs must be disabled.
1481 1610 *
1482 1611 * Note that it is legal for a dying CPU (which is marked as offline) to
1483 1612 * invoke this function. This can happen when the dying CPU reports its
1484 1613 * quiescent state.
1485 1614 */
1486 1615 static void
1487   -rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1488   - __releases(rcu_get_root(rsp)->lock)
  1616 +rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
  1617 + struct rcu_data *rdp)
1489 1618 {
1490   - struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1491   - struct rcu_node *rnp = rcu_get_root(rsp);
1492   -
1493   - if (!rsp->gp_kthread ||
1494   - !cpu_needs_another_gp(rsp, rdp)) {
  1619 + if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
1495 1620 /*
1496 1621 * Either we have not yet spawned the grace-period
1497 1622 * task, this CPU does not need another grace period,
1498 1623 * or a grace period is already in progress.
1499 1624 * Either way, don't start a new grace period.
1500 1625 */
1501   - raw_spin_unlock_irqrestore(&rnp->lock, flags);
1502 1626 return;
1503 1627 }
1504   -
1505   - /*
1506   - * Because there is no grace period in progress right now,
1507   - * any callbacks we have up to this point will be satisfied
1508   - * by the next grace period. So this is a good place to
1509   - * assign a grace period number to recently posted callbacks.
1510   - */
1511   - rcu_accelerate_cbs(rsp, rnp, rdp);
1512   -
1513 1628 rsp->gp_flags = RCU_GP_FLAG_INIT;
1514   - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1515 1629  
1516   - /* Ensure that CPU is aware of completion of last grace period. */
1517   - rcu_process_gp_end(rsp, rdp);
1518   - local_irq_restore(flags);
1519   -
1520 1630 /* Wake up rcu_gp_kthread() to start the grace period. */
1521 1631 wake_up(&rsp->gp_wq);
1522 1632 }
1523 1633  
1524 1634 /*
  1635 + * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
  1636 + * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
  1637 + * is invoked indirectly from rcu_advance_cbs(), which would result in
  1638 + * endless recursion -- or would do so if it wasn't for the self-deadlock
  1639 + * that is encountered beforehand.
  1640 + */
  1641 +static void
  1642 +rcu_start_gp(struct rcu_state *rsp)
  1643 +{
  1644 + struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
  1645 + struct rcu_node *rnp = rcu_get_root(rsp);
  1646 +
  1647 + /*
  1648 + * If there is no grace period in progress right now, any
  1649 + * callbacks we have up to this point will be satisfied by the
  1650 + * next grace period. Also, advancing the callbacks reduces the
  1651 + * probability of false positives from cpu_needs_another_gp()
  1652 + * resulting in pointless grace periods. So, advance callbacks
  1653 + * then start the grace period!
  1654 + */
  1655 + rcu_advance_cbs(rsp, rnp, rdp);
  1656 + rcu_start_gp_advanced(rsp, rnp, rdp);
  1657 +}
  1658 +
  1659 +/*
1525 1660 * Report a full set of quiescent states to the specified rcu_state
1526 1661 * data structure. This involves cleaning up after the prior grace
1527 1662 * period and letting rcu_start_gp() start up the next grace period
1528   - * if one is needed. Note that the caller must hold rnp->lock, as
1529   - * required by rcu_start_gp(), which will release it.
  1663 + * if one is needed. Note that the caller must hold rnp->lock, which
  1664 + * is released before return.
1530 1665 */
1531 1666 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1532 1667 __releases(rcu_get_root(rsp)->lock)
... ... @@ -2124,7 +2259,8 @@
2124 2259 local_irq_save(flags);
2125 2260 if (cpu_needs_another_gp(rsp, rdp)) {
2126 2261 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2127   - rcu_start_gp(rsp, flags); /* releases above lock */
  2262 + rcu_start_gp(rsp);
  2263 + raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2128 2264 } else {
2129 2265 local_irq_restore(flags);
2130 2266 }
... ... @@ -2169,7 +2305,8 @@
2169 2305  
2170 2306 static void invoke_rcu_core(void)
2171 2307 {
2172   - raise_softirq(RCU_SOFTIRQ);
  2308 + if (cpu_online(smp_processor_id()))
  2309 + raise_softirq(RCU_SOFTIRQ);
2173 2310 }
2174 2311  
2175 2312 /*
2176 2313  
... ... @@ -2204,11 +2341,11 @@
2204 2341  
2205 2342 /* Start a new grace period if one not already started. */
2206 2343 if (!rcu_gp_in_progress(rsp)) {
2207   - unsigned long nestflag;
2208 2344 struct rcu_node *rnp_root = rcu_get_root(rsp);
2209 2345  
2210   - raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
2211   - rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
  2346 + raw_spin_lock(&rnp_root->lock);
  2347 + rcu_start_gp(rsp);
  2348 + raw_spin_unlock(&rnp_root->lock);
2212 2349 } else {
2213 2350 /* Give the grace period a kick. */
2214 2351 rdp->blimit = LONG_MAX;
2215 2352  
2216 2353  
2217 2354  
... ... @@ -2628,19 +2765,27 @@
2628 2765 }
2629 2766  
2630 2767 /*
2631   - * Check to see if any future RCU-related work will need to be done
2632   - * by the current CPU, even if none need be done immediately, returning
2633   - * 1 if so.
  2768 + * Return true if the specified CPU has any callback. If all_lazy is
  2769 + * non-NULL, store an indication of whether all callbacks are lazy.
  2770 + * (If there are no callbacks, all of them are deemed to be lazy.)
2634 2771 */
2635   -static int rcu_cpu_has_callbacks(int cpu)
  2772 +static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2636 2773 {
  2774 + bool al = true;
  2775 + bool hc = false;
  2776 + struct rcu_data *rdp;
2637 2777 struct rcu_state *rsp;
2638 2778  
2639   - /* RCU callbacks either ready or pending? */
2640   - for_each_rcu_flavor(rsp)
2641   - if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
2642   - return 1;
2643   - return 0;
  2779 + for_each_rcu_flavor(rsp) {
  2780 + rdp = per_cpu_ptr(rsp->rda, cpu);
  2781 + if (rdp->qlen != rdp->qlen_lazy)
  2782 + al = false;
  2783 + if (rdp->nxtlist)
  2784 + hc = true;
  2785 + }
  2786 + if (all_lazy)
  2787 + *all_lazy = al;
  2788 + return hc;
2644 2789 }
2645 2790  
2646 2791 /*
... ... @@ -2859,7 +3004,6 @@
2859 3004 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2860 3005 atomic_set(&rdp->dynticks->dynticks,
2861 3006 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2862   - rcu_prepare_for_idle_init(cpu);
2863 3007 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2864 3008  
2865 3009 /* Add CPU to rcu_node bitmasks. */
... ... @@ -2909,7 +3053,6 @@
2909 3053 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2910 3054 struct rcu_node *rnp = rdp->mynode;
2911 3055 struct rcu_state *rsp;
2912   - int ret = NOTIFY_OK;
2913 3056  
2914 3057 trace_rcu_utilization("Start CPU hotplug");
2915 3058 switch (action) {
2916 3059  
2917 3060  
... ... @@ -2923,21 +3066,12 @@
2923 3066 rcu_boost_kthread_setaffinity(rnp, -1);
2924 3067 break;
2925 3068 case CPU_DOWN_PREPARE:
2926   - if (nocb_cpu_expendable(cpu))
2927   - rcu_boost_kthread_setaffinity(rnp, cpu);
2928   - else
2929   - ret = NOTIFY_BAD;
  3069 + rcu_boost_kthread_setaffinity(rnp, cpu);
2930 3070 break;
2931 3071 case CPU_DYING:
2932 3072 case CPU_DYING_FROZEN:
2933   - /*
2934   - * The whole machine is "stopped" except this CPU, so we can
2935   - * touch any data without introducing corruption. We send the
2936   - * dying CPU's callbacks to an arbitrarily chosen online CPU.
2937   - */
2938 3073 for_each_rcu_flavor(rsp)
2939 3074 rcu_cleanup_dying_cpu(rsp);
2940   - rcu_cleanup_after_idle(cpu);
2941 3075 break;
2942 3076 case CPU_DEAD:
2943 3077 case CPU_DEAD_FROZEN:
... ... @@ -2950,7 +3084,7 @@
2950 3084 break;
2951 3085 }
2952 3086 trace_rcu_utilization("End CPU hotplug");
2953   - return ret;
  3087 + return NOTIFY_OK;
2954 3088 }
2955 3089  
2956 3090 /*
... ... @@ -3085,6 +3219,7 @@
3085 3219 }
3086 3220 rnp->level = i;
3087 3221 INIT_LIST_HEAD(&rnp->blkd_tasks);
  3222 + rcu_init_one_nocb(rnp);
3088 3223 }
3089 3224 }
3090 3225  
... ... @@ -3170,8 +3305,7 @@
3170 3305 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3171 3306 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3172 3307 __rcu_init_preempt();
3173   - rcu_init_nocb();
3174   - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  3308 + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3175 3309  
3176 3310 /*
3177 3311 * We don't need protection against CPU-hotplug here because
... ... @@ -88,18 +88,13 @@
88 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
89 89 atomic_t dynticks; /* Even value for idle, else odd. */
90 90 #ifdef CONFIG_RCU_FAST_NO_HZ
91   - int dyntick_drain; /* Prepare-for-idle state variable. */
92   - unsigned long dyntick_holdoff;
93   - /* No retries for the jiffy of failure. */
94   - struct timer_list idle_gp_timer;
95   - /* Wake up CPU sleeping with callbacks. */
96   - unsigned long idle_gp_timer_expires;
97   - /* When to wake up CPU (for repost). */
98   - bool idle_first_pass; /* First pass of attempt to go idle? */
  91 + bool all_lazy; /* Are all CPU's CBs lazy? */
99 92 unsigned long nonlazy_posted;
100 93 /* # times non-lazy CBs posted to CPU. */
101 94 unsigned long nonlazy_posted_snap;
102 95 /* idle-period nonlazy_posted snapshot. */
  96 + unsigned long last_accelerate;
  97 + /* Last jiffy CBs were accelerated. */
103 98 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104 99 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105 100 };
... ... @@ -134,9 +129,6 @@
134 129 /* elements that need to drain to allow the */
135 130 /* current expedited grace period to */
136 131 /* complete (only for TREE_PREEMPT_RCU). */
137   - atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
138   - /* Since this has meaning only for leaf */
139   - /* rcu_node structures, 32 bits suffices. */
140 132 unsigned long qsmaskinit;
141 133 /* Per-GP initial value for qsmask & expmask. */
142 134 unsigned long grpmask; /* Mask to apply to parent qsmask. */
... ... @@ -196,6 +188,12 @@
196 188 /* Refused to boost: not sure why, though. */
197 189 /* This can happen due to race conditions. */
198 190 #endif /* #ifdef CONFIG_RCU_BOOST */
  191 +#ifdef CONFIG_RCU_NOCB_CPU
  192 + wait_queue_head_t nocb_gp_wq[2];
  193 + /* Place for rcu_nocb_kthread() to wait GP. */
  194 +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
  195 + int need_future_gp[2];
  196 + /* Counts of upcoming no-CB GP requests. */
199 197 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
200 198 } ____cacheline_internodealigned_in_smp;
201 199  
... ... @@ -328,6 +326,11 @@
328 326 struct task_struct *nocb_kthread;
329 327 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
330 328  
  329 + /* 8) RCU CPU stall data. */
  330 +#ifdef CONFIG_RCU_CPU_STALL_INFO
  331 + unsigned int softirq_snap; /* Snapshot of softirq activity. */
  332 +#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
  333 +
331 334 int cpu;
332 335 struct rcu_state *rsp;
333 336 };
... ... @@ -375,12 +378,6 @@
375 378 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
376 379 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
377 380 void (*func)(struct rcu_head *head));
378   -#ifdef CONFIG_RCU_NOCB_CPU
379   - void (*call_remote)(struct rcu_head *head,
380   - void (*func)(struct rcu_head *head));
381   - /* call_rcu() flavor, but for */
382   - /* placing on remote CPU. */
383   -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
384 381  
385 382 /* The following fields are guarded by the root rcu_node's lock. */
386 383  
... ... @@ -443,6 +440,7 @@
443 440 unsigned long gp_max; /* Maximum GP duration in */
444 441 /* jiffies. */
445 442 char *name; /* Name of structure. */
  443 + char abbr; /* Abbreviated name. */
446 444 struct list_head flavors; /* List of RCU flavors. */
447 445 };
448 446  
... ... @@ -520,7 +518,6 @@
520 518 struct rcu_node *rnp);
521 519 #endif /* #ifdef CONFIG_RCU_BOOST */
522 520 static void __cpuinit rcu_prepare_kthreads(int cpu);
523   -static void rcu_prepare_for_idle_init(int cpu);
524 521 static void rcu_cleanup_after_idle(int cpu);
525 522 static void rcu_prepare_for_idle(int cpu);
526 523 static void rcu_idle_count_callbacks_posted(void);
527 524  
528 525  
... ... @@ -529,16 +526,18 @@
529 526 static void print_cpu_stall_info_end(void);
530 527 static void zero_cpu_stall_ticks(struct rcu_data *rdp);
531 528 static void increment_cpu_stall_ticks(void);
  529 +static int rcu_nocb_needs_gp(struct rcu_state *rsp);
  530 +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
  531 +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
  532 +static void rcu_init_one_nocb(struct rcu_node *rnp);
532 533 static bool is_nocb_cpu(int cpu);
533 534 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
534 535 bool lazy);
535 536 static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
536 537 struct rcu_data *rdp);
537   -static bool nocb_cpu_expendable(int cpu);
538 538 static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
539 539 static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
540   -static void init_nocb_callback_list(struct rcu_data *rdp);
541   -static void __init rcu_init_nocb(void);
  540 +static bool init_nocb_callback_list(struct rcu_data *rdp);
542 541  
543 542 #endif /* #ifndef RCU_TREE_NONCORE */
544 543  
kernel/rcutree_plugin.h
... ... @@ -85,11 +85,21 @@
85 85 if (nr_cpu_ids != NR_CPUS)
86 86 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
87 87 #ifdef CONFIG_RCU_NOCB_CPU
  88 +#ifndef CONFIG_RCU_NOCB_CPU_NONE
  89 + if (!have_rcu_nocb_mask) {
  90 + alloc_bootmem_cpumask_var(&rcu_nocb_mask);
  91 + have_rcu_nocb_mask = true;
  92 + }
  93 +#ifdef CONFIG_RCU_NOCB_CPU_ZERO
  94 + pr_info("\tExperimental no-CBs CPU 0\n");
  95 + cpumask_set_cpu(0, rcu_nocb_mask);
  96 +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
  97 +#ifdef CONFIG_RCU_NOCB_CPU_ALL
  98 + pr_info("\tExperimental no-CBs for all CPUs\n");
  99 + cpumask_setall(rcu_nocb_mask);
  100 +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
  101 +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
88 102 if (have_rcu_nocb_mask) {
89   - if (cpumask_test_cpu(0, rcu_nocb_mask)) {
90   - cpumask_clear_cpu(0, rcu_nocb_mask);
91   - pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
92   - }
93 103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
94 104 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
95 105 if (rcu_nocb_poll)
... ... @@ -101,7 +111,7 @@
101 111 #ifdef CONFIG_TREE_PREEMPT_RCU
102 112  
103 113 struct rcu_state rcu_preempt_state =
104   - RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
  114 + RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
105 115 DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
106 116 static struct rcu_state *rcu_state = &rcu_preempt_state;
107 117  
108 118  
... ... @@ -1533,17 +1543,10 @@
1533 1543 int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1534 1544 {
1535 1545 *delta_jiffies = ULONG_MAX;
1536   - return rcu_cpu_has_callbacks(cpu);
  1546 + return rcu_cpu_has_callbacks(cpu, NULL);
1537 1547 }
1538 1548  
1539 1549 /*
1540   - * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1541   - */
1542   -static void rcu_prepare_for_idle_init(int cpu)
1543   -{
1544   -}
1545   -
1546   -/*
1547 1550 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1548 1551 * after it.
1549 1552 */
... ... @@ -1577,16 +1580,6 @@
1577 1580 *
1578 1581 * The following three proprocessor symbols control this state machine:
1579 1582 *
1580   - * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
1581   - * to satisfy RCU. Beyond this point, it is better to incur a periodic
1582   - * scheduling-clock interrupt than to loop through the state machine
1583   - * at full power.
1584   - * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
1585   - * optional if RCU does not need anything immediately from this
1586   - * CPU, even if this CPU still has RCU callbacks queued. The first
1587   - * times through the state machine are mandatory: we need to give
1588   - * the state machine a chance to communicate a quiescent state
1589   - * to the RCU core.
1590 1583 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1591 1584 * to sleep in dyntick-idle mode with RCU callbacks pending. This
1592 1585 * is sized to be roughly one RCU grace period. Those energy-efficiency
1593 1586  
1594 1587  
1595 1588  
1596 1589  
1597 1590  
1598 1591  
1599 1592  
1600 1593  
1601 1594  
1602 1595  
1603 1596  
1604 1597  
1605 1598  
1606 1599  
1607 1600  
1608 1601  
1609 1602  
1610 1603  
1611 1604  
1612 1605  
1613 1606  
... ... @@ -1602,186 +1595,108 @@
1602 1595 * adjustment, they can be converted into kernel config parameters, though
1603 1596 * making the state machine smarter might be a better option.
1604 1597 */
1605   -#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
1606   -#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
1607 1598 #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1608 1599 #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1609 1600  
  1601 +static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
  1602 +module_param(rcu_idle_gp_delay, int, 0644);
  1603 +static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
  1604 +module_param(rcu_idle_lazy_gp_delay, int, 0644);
  1605 +
1610 1606 extern int tick_nohz_enabled;
1611 1607  
1612 1608 /*
1613   - * Does the specified flavor of RCU have non-lazy callbacks pending on
1614   - * the specified CPU? Both RCU flavor and CPU are specified by the
1615   - * rcu_data structure.
  1609 + * Try to advance callbacks for all flavors of RCU on the current CPU.
  1610 + * Afterwards, if there are any callbacks ready for immediate invocation,
  1611 + * return true.
1616 1612 */
1617   -static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
  1613 +static bool rcu_try_advance_all_cbs(void)
1618 1614 {
1619   - return rdp->qlen != rdp->qlen_lazy;
1620   -}
  1615 + bool cbs_ready = false;
  1616 + struct rcu_data *rdp;
  1617 + struct rcu_node *rnp;
  1618 + struct rcu_state *rsp;
1621 1619  
1622   -#ifdef CONFIG_TREE_PREEMPT_RCU
  1620 + for_each_rcu_flavor(rsp) {
  1621 + rdp = this_cpu_ptr(rsp->rda);
  1622 + rnp = rdp->mynode;
1623 1623  
1624   -/*
1625   - * Are there non-lazy RCU-preempt callbacks? (There cannot be if there
1626   - * is no RCU-preempt in the kernel.)
1627   - */
1628   -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
1629   -{
1630   - struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
  1624 + /*
  1625 + * Don't bother checking unless a grace period has
  1626 + * completed since we last checked and there are
  1627 + * callbacks not yet ready to invoke.
  1628 + */
  1629 + if (rdp->completed != rnp->completed &&
  1630 + rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
  1631 + rcu_process_gp_end(rsp, rdp);
1631 1632  
1632   - return __rcu_cpu_has_nonlazy_callbacks(rdp);
  1633 + if (cpu_has_callbacks_ready_to_invoke(rdp))
  1634 + cbs_ready = true;
  1635 + }
  1636 + return cbs_ready;
1633 1637 }
1634 1638  
1635   -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1636   -
1637   -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
1638   -{
1639   - return 0;
1640   -}
1641   -
1642   -#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */
1643   -
1644 1639 /*
1645   - * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
1646   - */
1647   -static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
1648   -{
1649   - return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
1650   - __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
1651   - rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
1652   -}
1653   -
1654   -/*
1655   - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1656   - * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1657   - * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1658   - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
1659   - * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1660   - * it is better to incur scheduling-clock interrupts than to spin
1661   - * continuously for the same time duration!
  1640 + * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
  1641 + * to invoke. If the CPU has callbacks, try to advance them. Tell the
  1642 + * caller to set the timeout based on whether or not there are non-lazy
  1643 + * callbacks.
1662 1644 *
1663   - * The delta_jiffies argument is used to store the time when RCU is
1664   - * going to need the CPU again if it still has callbacks. The reason
1665   - * for this is that rcu_prepare_for_idle() might need to post a timer,
1666   - * but if so, it will do so after tick_nohz_stop_sched_tick() has set
1667   - * the wakeup time for this CPU. This means that RCU's timer can be
1668   - * delayed until the wakeup time, which defeats the purpose of posting
1669   - * a timer.
  1645 + * The caller must have disabled interrupts.
1670 1646 */
1671   -int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
  1647 +int rcu_needs_cpu(int cpu, unsigned long *dj)
1672 1648 {
1673 1649 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1674 1650  
1675   - /* Flag a new idle sojourn to the idle-entry state machine. */
1676   - rdtp->idle_first_pass = 1;
  1651 + /* Snapshot to detect later posting of non-lazy callback. */
  1652 + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
  1653 +
1677 1654 /* If no callbacks, RCU doesn't need the CPU. */
1678   - if (!rcu_cpu_has_callbacks(cpu)) {
1679   - *delta_jiffies = ULONG_MAX;
  1655 + if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
  1656 + *dj = ULONG_MAX;
1680 1657 return 0;
1681 1658 }
1682   - if (rdtp->dyntick_holdoff == jiffies) {
1683   - /* RCU recently tried and failed, so don't try again. */
1684   - *delta_jiffies = 1;
  1659 +
  1660 + /* Attempt to advance callbacks. */
  1661 + if (rcu_try_advance_all_cbs()) {
  1662 + /* Some ready to invoke, so initiate later invocation. */
  1663 + invoke_rcu_core();
1685 1664 return 1;
1686 1665 }
1687   - /* Set up for the possibility that RCU will post a timer. */
1688   - if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1689   - *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
1690   - RCU_IDLE_GP_DELAY) - jiffies;
  1666 + rdtp->last_accelerate = jiffies;
  1667 +
  1668 + /* Request timer delay depending on laziness, and round. */
  1669 + if (!rdtp->all_lazy) {
  1670 + *dj = round_up(rcu_idle_gp_delay + jiffies,
  1671 + rcu_idle_gp_delay) - jiffies;
1691 1672 } else {
1692   - *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
1693   - *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
  1673 + *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
1694 1674 }
1695 1675 return 0;
1696 1676 }
1697 1677  
1698 1678 /*
1699   - * Handler for smp_call_function_single(). The only point of this
1700   - * handler is to wake the CPU up, so the handler does only tracing.
1701   - */
1702   -void rcu_idle_demigrate(void *unused)
1703   -{
1704   - trace_rcu_prep_idle("Demigrate");
1705   -}
1706   -
1707   -/*
1708   - * Timer handler used to force CPU to start pushing its remaining RCU
1709   - * callbacks in the case where it entered dyntick-idle mode with callbacks
1710   - * pending. The hander doesn't really need to do anything because the
1711   - * real work is done upon re-entry to idle, or by the next scheduling-clock
1712   - * interrupt should idle not be re-entered.
  1679 + * Prepare a CPU for idle from an RCU perspective. The first major task
  1680 + * is to sense whether nohz mode has been enabled or disabled via sysfs.
  1681 + * The second major task is to check to see if a non-lazy callback has
  1682 + * arrived at a CPU that previously had only lazy callbacks. The third
  1683 + * major task is to accelerate (that is, assign grace-period numbers to)
  1684 + * any recently arrived callbacks.
1713 1685 *
1714   - * One special case: the timer gets migrated without awakening the CPU
1715   - * on which the timer was scheduled on. In this case, we must wake up
1716   - * that CPU. We do so with smp_call_function_single().
1717   - */
1718   -static void rcu_idle_gp_timer_func(unsigned long cpu_in)
1719   -{
1720   - int cpu = (int)cpu_in;
1721   -
1722   - trace_rcu_prep_idle("Timer");
1723   - if (cpu != smp_processor_id())
1724   - smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
1725   - else
1726   - WARN_ON_ONCE(1); /* Getting here can hang the system... */
1727   -}
1728   -
1729   -/*
1730   - * Initialize the timer used to pull CPUs out of dyntick-idle mode.
1731   - */
1732   -static void rcu_prepare_for_idle_init(int cpu)
1733   -{
1734   - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1735   -
1736   - rdtp->dyntick_holdoff = jiffies - 1;
1737   - setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
1738   - rdtp->idle_gp_timer_expires = jiffies - 1;
1739   - rdtp->idle_first_pass = 1;
1740   -}
1741   -
1742   -/*
1743   - * Clean up for exit from idle. Because we are exiting from idle, there
1744   - * is no longer any point to ->idle_gp_timer, so cancel it. This will
1745   - * do nothing if this timer is not active, so just cancel it unconditionally.
1746   - */
1747   -static void rcu_cleanup_after_idle(int cpu)
1748   -{
1749   - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1750   -
1751   - del_timer(&rdtp->idle_gp_timer);
1752   - trace_rcu_prep_idle("Cleanup after idle");
1753   - rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
1754   -}
1755   -
1756   -/*
1757   - * Check to see if any RCU-related work can be done by the current CPU,
1758   - * and if so, schedule a softirq to get it done. This function is part
1759   - * of the RCU implementation; it is -not- an exported member of the RCU API.
1760   - *
1761   - * The idea is for the current CPU to clear out all work required by the
1762   - * RCU core for the current grace period, so that this CPU can be permitted
1763   - * to enter dyntick-idle mode. In some cases, it will need to be awakened
1764   - * at the end of the grace period by whatever CPU ends the grace period.
1765   - * This allows CPUs to go dyntick-idle more quickly, and to reduce the
1766   - * number of wakeups by a modest integer factor.
1767   - *
1768   - * Because it is not legal to invoke rcu_process_callbacks() with irqs
1769   - * disabled, we do one pass of force_quiescent_state(), then do a
1770   - * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1771   - * later. The ->dyntick_drain field controls the sequencing.
1772   - *
1773 1686 * The caller must have disabled interrupts.
1774 1687 */
1775 1688 static void rcu_prepare_for_idle(int cpu)
1776 1689 {
1777   - struct timer_list *tp;
  1690 + struct rcu_data *rdp;
1778 1691 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
  1692 + struct rcu_node *rnp;
  1693 + struct rcu_state *rsp;
1779 1694 int tne;
1780 1695  
1781 1696 /* Handle nohz enablement switches conservatively. */
1782 1697 tne = ACCESS_ONCE(tick_nohz_enabled);
1783 1698 if (tne != rdtp->tick_nohz_enabled_snap) {
1784   - if (rcu_cpu_has_callbacks(cpu))
  1699 + if (rcu_cpu_has_callbacks(cpu, NULL))
1785 1700 invoke_rcu_core(); /* force nohz to see update. */
1786 1701 rdtp->tick_nohz_enabled_snap = tne;
1787 1702 return;
1788 1703  
1789 1704  
1790 1705  
1791 1706  
1792 1707  
1793 1708  
1794 1709  
1795 1710  
1796 1711  
1797 1712  
1798 1713  
1799 1714  
... ... @@ -1789,126 +1704,57 @@
1789 1704 if (!tne)
1790 1705 return;
1791 1706  
1792   - /* Adaptive-tick mode, where usermode execution is idle to RCU. */
1793   - if (!is_idle_task(current)) {
1794   - rdtp->dyntick_holdoff = jiffies - 1;
1795   - if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1796   - trace_rcu_prep_idle("User dyntick with callbacks");
1797   - rdtp->idle_gp_timer_expires =
1798   - round_up(jiffies + RCU_IDLE_GP_DELAY,
1799   - RCU_IDLE_GP_DELAY);
1800   - } else if (rcu_cpu_has_callbacks(cpu)) {
1801   - rdtp->idle_gp_timer_expires =
1802   - round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1803   - trace_rcu_prep_idle("User dyntick with lazy callbacks");
1804   - } else {
1805   - return;
1806   - }
1807   - tp = &rdtp->idle_gp_timer;
1808   - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
  1707 + /* If this is a no-CBs CPU, no callbacks, just return. */
  1708 + if (is_nocb_cpu(cpu))
1809 1709 return;
1810   - }
1811 1710  
1812 1711 /*
1813   - * If this is an idle re-entry, for example, due to use of
1814   - * RCU_NONIDLE() or the new idle-loop tracing API within the idle
1815   - * loop, then don't take any state-machine actions, unless the
1816   - * momentary exit from idle queued additional non-lazy callbacks.
1817   - * Instead, repost the ->idle_gp_timer if this CPU has callbacks
1818   - * pending.
  1712 + * If a non-lazy callback arrived at a CPU having only lazy
  1713 + * callbacks, invoke RCU core for the side-effect of recalculating
  1714 + * idle duration on re-entry to idle.
1819 1715 */
1820   - if (!rdtp->idle_first_pass &&
1821   - (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
1822   - if (rcu_cpu_has_callbacks(cpu)) {
1823   - tp = &rdtp->idle_gp_timer;
1824   - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1825   - }
  1716 + if (rdtp->all_lazy &&
  1717 + rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
  1718 + invoke_rcu_core();
1826 1719 return;
1827 1720 }
1828   - rdtp->idle_first_pass = 0;
1829   - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
1830 1721  
1831 1722 /*
1832   - * If there are no callbacks on this CPU, enter dyntick-idle mode.
1833   - * Also reset state to avoid prejudicing later attempts.
  1723 + * If we have not yet accelerated this jiffy, accelerate all
  1724 + * callbacks on this CPU.
1834 1725 */
1835   - if (!rcu_cpu_has_callbacks(cpu)) {
1836   - rdtp->dyntick_holdoff = jiffies - 1;
1837   - rdtp->dyntick_drain = 0;
1838   - trace_rcu_prep_idle("No callbacks");
  1726 + if (rdtp->last_accelerate == jiffies)
1839 1727 return;
  1728 + rdtp->last_accelerate = jiffies;
  1729 + for_each_rcu_flavor(rsp) {
  1730 + rdp = per_cpu_ptr(rsp->rda, cpu);
  1731 + if (!*rdp->nxttail[RCU_DONE_TAIL])
  1732 + continue;
  1733 + rnp = rdp->mynode;
  1734 + raw_spin_lock(&rnp->lock); /* irqs already disabled. */
  1735 + rcu_accelerate_cbs(rsp, rnp, rdp);
  1736 + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1840 1737 }
  1738 +}
1841 1739  
1842   - /*
1843   - * If in holdoff mode, just return. We will presumably have
1844   - * refrained from disabling the scheduling-clock tick.
1845   - */
1846   - if (rdtp->dyntick_holdoff == jiffies) {
1847   - trace_rcu_prep_idle("In holdoff");
1848   - return;
1849   - }
  1740 +/*
  1741 + * Clean up for exit from idle. Attempt to advance callbacks based on
  1742 + * any grace periods that elapsed while the CPU was idle, and if any
  1743 + * callbacks are now ready to invoke, initiate invocation.
  1744 + */
  1745 +static void rcu_cleanup_after_idle(int cpu)
  1746 +{
  1747 + struct rcu_data *rdp;
  1748 + struct rcu_state *rsp;
1850 1749  
1851   - /* Check and update the ->dyntick_drain sequencing. */
1852   - if (rdtp->dyntick_drain <= 0) {
1853   - /* First time through, initialize the counter. */
1854   - rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
1855   - } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
1856   - !rcu_pending(cpu) &&
1857   - !local_softirq_pending()) {
1858   - /* Can we go dyntick-idle despite still having callbacks? */
1859   - rdtp->dyntick_drain = 0;
1860   - rdtp->dyntick_holdoff = jiffies;
1861   - if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1862   - trace_rcu_prep_idle("Dyntick with callbacks");
1863   - rdtp->idle_gp_timer_expires =
1864   - round_up(jiffies + RCU_IDLE_GP_DELAY,
1865   - RCU_IDLE_GP_DELAY);
1866   - } else {
1867   - rdtp->idle_gp_timer_expires =
1868   - round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1869   - trace_rcu_prep_idle("Dyntick with lazy callbacks");
1870   - }
1871   - tp = &rdtp->idle_gp_timer;
1872   - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1873   - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1874   - return; /* Nothing more to do immediately. */
1875   - } else if (--(rdtp->dyntick_drain) <= 0) {
1876   - /* We have hit the limit, so time to give up. */
1877   - rdtp->dyntick_holdoff = jiffies;
1878   - trace_rcu_prep_idle("Begin holdoff");
1879   - invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
  1750 + if (is_nocb_cpu(cpu))
1880 1751 return;
  1752 + rcu_try_advance_all_cbs();
  1753 + for_each_rcu_flavor(rsp) {
  1754 + rdp = per_cpu_ptr(rsp->rda, cpu);
  1755 + if (cpu_has_callbacks_ready_to_invoke(rdp))
  1756 + invoke_rcu_core();
1881 1757 }
1882   -
1883   - /*
1884   - * Do one step of pushing the remaining RCU callbacks through
1885   - * the RCU core state machine.
1886   - */
1887   -#ifdef CONFIG_TREE_PREEMPT_RCU
1888   - if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
1889   - rcu_preempt_qs(cpu);
1890   - force_quiescent_state(&rcu_preempt_state);
1891   - }
1892   -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1893   - if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1894   - rcu_sched_qs(cpu);
1895   - force_quiescent_state(&rcu_sched_state);
1896   - }
1897   - if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1898   - rcu_bh_qs(cpu);
1899   - force_quiescent_state(&rcu_bh_state);
1900   - }
1901   -
1902   - /*
1903   - * If RCU callbacks are still pending, RCU still needs this CPU.
1904   - * So try forcing the callbacks through the grace period.
1905   - */
1906   - if (rcu_cpu_has_callbacks(cpu)) {
1907   - trace_rcu_prep_idle("More callbacks");
1908   - invoke_rcu_core();
1909   - } else {
1910   - trace_rcu_prep_idle("Callbacks drained");
1911   - }
1912 1758 }
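
On idle exit, rcu_cleanup_after_idle() tries to advance callbacks past any grace periods that ended while the CPU slept, then pokes the RCU core for each flavor with callbacks ready to invoke. A rough userspace model of that scan-and-kick shape, with invented flavor names and a made-up "ready" flag:

#include <stdbool.h>
#include <stdio.h>

/* Minimal stand-in for one flavor's per-CPU callback state. */
struct flavor {
	const char *name;
	bool cbs_ready;		/* callbacks advanced past the done tail? */
};

static struct flavor flavors[] = {
	{ "rcu_sched", false },
	{ "rcu_bh",    true  },
};

/* Model of invoke_rcu_core(): just record that processing was requested. */
static void kick_core(const char *why)
{
	printf("RCU core kicked: %s has ready callbacks\n", why);
}

/* Model of the idle-exit scan: check every flavor, kick for each ready one. */
static void cleanup_after_idle(void)
{
	for (unsigned int i = 0; i < sizeof(flavors) / sizeof(flavors[0]); i++)
		if (flavors[i].cbs_ready)
			kick_core(flavors[i].name);
}

int main(void)
{
	cleanup_after_idle();
	return 0;
}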
1913 1759  
1914 1760 /*
1915 1761  
... ... @@ -2015,16 +1861,13 @@
2015 1861 static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2016 1862 {
2017 1863 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2018   - struct timer_list *tltp = &rdtp->idle_gp_timer;
2019   - char c;
  1864 + unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
2020 1865  
2021   - c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
2022   - if (timer_pending(tltp))
2023   - sprintf(cp, "drain=%d %c timer=%lu",
2024   - rdtp->dyntick_drain, c, tltp->expires - jiffies);
2025   - else
2026   - sprintf(cp, "drain=%d %c timer not pending",
2027   - rdtp->dyntick_drain, c);
  1866 + sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
  1867 + rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
  1868 + ulong2long(nlpd),
  1869 + rdtp->all_lazy ? 'L' : '.',
  1870 + rdtp->tick_nohz_enabled_snap ? '.' : 'D');
2028 1871 }
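
The new stall-warning fragment is built from the low-order 16 bits of the two jiffies stamps, the non-lazy callback delta, and the 'L'/'D' flag characters. A small self-contained sketch that produces a line in the same format from made-up sample values; ulong2long is modeled here as a plain cast, and the variables stand in for the rcu_dynticks fields.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	/* Sample values standing in for the per-CPU rcu_dynticks fields. */
	unsigned long last_accelerate = 0x1b2c0;	/* jiffies of last acceleration */
	unsigned long jiffies_now     = 0x1d9f4;	/* current jiffies */
	unsigned long nonlazy_posted      = 117;
	unsigned long nonlazy_posted_snap = 100;
	bool all_lazy = false;
	bool tick_nohz_enabled = true;
	char buf[80];

	/* Same format as the stall warning: low 16 bits of each stamp in hex. */
	snprintf(buf, sizeof(buf),
		 "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
		 last_accelerate & 0xffff, jiffies_now & 0xffff,
		 (long)(nonlazy_posted - nonlazy_posted_snap),
		 all_lazy ? 'L' : '.',
		 tick_nohz_enabled ? '.' : 'D');
	printf("%s\n", buf);	/* -> "last_accelerate: b2c0/d9f4, nonlazy_posted: 17, .." */
	return 0;
}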
2029 1872  
2030 1873 #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
2031 1874  
... ... @@ -2070,10 +1913,11 @@
2070 1913 ticks_value = rsp->gpnum - rdp->gpnum;
2071 1914 }
2072 1915 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2073   - printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
  1916 + printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
2074 1917 cpu, ticks_value, ticks_title,
2075 1918 atomic_read(&rdtp->dynticks) & 0xfff,
2076 1919 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
  1920 + rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
2077 1921 fast_no_hz);
2078 1922 }

2079 1923  
... ... @@ -2087,6 +1931,7 @@
2087 1931 static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2088 1932 {
2089 1933 rdp->ticks_this_gp = 0;
  1934 + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
2090 1935 }
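
Snapshotting the RCU_SOFTIRQ count when a new grace period is noticed lets the stall warning show whether softirq handlers have run since then. A minimal model of the snapshot-and-compare idea, with illustrative counter names:

#include <stdio.h>

/* Stand-in for kstat_softirqs_cpu(RCU_SOFTIRQ, cpu): handlers run so far. */
static unsigned int softirqs_run;

/* Snapshot taken when this CPU last noticed a new grace period. */
static unsigned int softirq_snap;

static void note_new_gp(void)
{
	softirq_snap = softirqs_run;	/* as zero_cpu_stall_ticks() now does */
}

static void print_stall_info(void)
{
	/* Matches the "softirq=<snap>/<now>" portion of the stall warning. */
	printf("softirq=%u/%u\n", softirq_snap, softirqs_run);
}

int main(void)
{
	softirqs_run = 17;
	note_new_gp();		/* snapshot 17 at grace-period start */
	softirqs_run = 240;	/* handlers kept executing afterward */
	print_stall_info();	/* "softirq=17/240": softirqs still running */
	/* If the second number stopped increasing across repeated warnings,
	 * RCU's softirq handler would no longer be getting to run. */
	return 0;
}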
2091 1936  
2092 1937 /* Increment ->ticks_this_gp for all flavors of RCU. */
... ... @@ -2165,6 +2010,47 @@
2165 2010 }
2166 2011 early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2167 2012  
  2013 +/*
  2014 + * Do any no-CBs CPUs need another grace period?
  2015 + *
  2016 + * Interrupts must be disabled. If the caller does not hold the root
  2017 + * rcu_node structure's ->lock, the results are advisory only.
  2018 + */
  2019 +static int rcu_nocb_needs_gp(struct rcu_state *rsp)
  2020 +{
  2021 + struct rcu_node *rnp = rcu_get_root(rsp);
  2022 +
  2023 + return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
  2024 +}
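
The ->need_future_gp[] bookkeeping is a two-slot array indexed by the low bit of the grace-period number a request applies to, so requests for the next grace period and cleanup of the one that just ended never collide. A compact userspace model of that parity indexing; the helper names below are invented for the sketch.

#include <stdio.h>

/* Model of the root rcu_node fields used by the no-CBs code. */
static unsigned long completed;		/* number of the last completed GP */
static int need_future_gp[2];		/* requests, indexed by GP number & 0x1 */

/* A no-CBs kthread asks for the grace period after the current one. */
static void request_future_gp(void)
{
	need_future_gp[(completed + 1) & 0x1]++;
}

/* Like rcu_nocb_needs_gp(): is anyone waiting on the next grace period? */
static int nocb_needs_gp(void)
{
	return need_future_gp[(completed + 1) & 0x1];
}

/* Like grace-period cleanup: retire the slot for the GP that just ended. */
static void complete_gp(void)
{
	completed++;
	need_future_gp[completed & 0x1] = 0;	/* those requests are now satisfied */
}

int main(void)
{
	request_future_gp();
	printf("needs GP? %d\n", nocb_needs_gp());	/* 1: a request is pending */
	complete_gp();
	printf("needs GP? %d\n", nocb_needs_gp());	/* 0: slot was retired */
	return 0;
}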
  2025 +
  2026 +/*
  2027 + * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
  2028 + * grace period.
  2029 + */
  2030 +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
  2031 +{
  2032 + wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
  2033 +}
  2034 +
  2035 +/*
  2036 + * Set the root rcu_node structure's ->need_future_gp field
  2037 + * based on the sum of those of all rcu_node structures. This does
  2038 + * double-count the root rcu_node structure's requests, but this
  2039 + * is necessary to handle the possibility of a rcu_nocb_kthread()
  2040 + * having awakened during the time that the rcu_node structures
  2041 + * were being updated for the end of the previous grace period.
  2042 + */
  2043 +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
  2044 +{
  2045 + rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
  2046 +}
  2047 +
  2048 +static void rcu_init_one_nocb(struct rcu_node *rnp)
  2049 +{
  2050 + init_waitqueue_head(&rnp->nocb_gp_wq[0]);
  2051 + init_waitqueue_head(&rnp->nocb_gp_wq[1]);
  2052 +}
  2053 +
2168 2054 /* Is the specified CPU a no-CBs CPU? */
2169 2055 static bool is_nocb_cpu(int cpu)
2170 2056 {
... ... @@ -2227,6 +2113,13 @@
2227 2113 if (!is_nocb_cpu(rdp->cpu))
2228 2114 return 0;
2229 2115 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
  2116 + if (__is_kfree_rcu_offset((unsigned long)rhp->func))
  2117 + trace_rcu_kfree_callback(rdp->rsp->name, rhp,
  2118 + (unsigned long)rhp->func,
  2119 + rdp->qlen_lazy, rdp->qlen);
  2120 + else
  2121 + trace_rcu_callback(rdp->rsp->name, rhp,
  2122 + rdp->qlen_lazy, rdp->qlen);
2230 2123 return 1;
2231 2124 }
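
The tracing added here distinguishes kfree_rcu() callbacks, whose ->func field carries the byte offset of the rcu_head within the enclosing object rather than a function address; __is_kfree_rcu_offset() treats sufficiently small values as such offsets. A userspace sketch of that encoding trick, with invented type names and the offset bound assumed (here taken as 4096):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Model of an RCU callback: either a real function or a small encoded offset. */
struct cb {
	void (*func)(struct cb *cb);
};

struct item {
	int payload;
	struct cb head;		/* embedded callback, as with rcu_head */
};

static void item_free_cb(struct cb *cb)
{
	printf("ordinary callback invoked\n");
}

/* Values below this bound cannot be valid function addresses in this model,
 * mirroring the kernel's __is_kfree_rcu_offset() test (assumed bound: 4096). */
#define KFREE_OFFSET_BOUND 4096

static int is_kfree_offset(uintptr_t v)
{
	return v < KFREE_OFFSET_BOUND;
}

int main(void)
{
	struct cb normal = { .func = item_free_cb };
	struct cb byoffset = {
		.func = (void (*)(struct cb *))(uintptr_t)offsetof(struct item, head)
	};

	printf("normal: kfree-style? %d\n",
	       is_kfree_offset((uintptr_t)normal.func));	/* 0: real function */
	printf("byoffset: kfree-style? %d\n",
	       is_kfree_offset((uintptr_t)byoffset.func));	/* 1: encoded offset */
	return 0;
}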
2232 2125  
... ... @@ -2265,98 +2158,39 @@
2265 2158 }
2266 2159  
2267 2160 /*
2268   - * There must be at least one non-no-CBs CPU in operation at any given
2269   - * time, because no-CBs CPUs are not capable of initiating grace periods
2270   - * independently. This function therefore complains if the specified
2271   - * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2272   - * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2273   - * but you have to have a base case!)
  2161 + * If necessary, kick off a new grace period, and either way wait
  2162 + * for a subsequent grace period to complete.
2274 2163 */
2275   -static bool nocb_cpu_expendable(int cpu)
  2164 +static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2276 2165 {
2277   - cpumask_var_t non_nocb_cpus;
2278   - int ret;
  2166 + unsigned long c;
  2167 + bool d;
  2168 + unsigned long flags;
  2169 + struct rcu_node *rnp = rdp->mynode;
2279 2170  
  2171 + raw_spin_lock_irqsave(&rnp->lock, flags);
  2172 + c = rcu_start_future_gp(rnp, rdp);
  2173 + raw_spin_unlock_irqrestore(&rnp->lock, flags);
  2174 +
2280 2175 /*
2281   - * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
2282   - * then offlining this CPU is harmless. Let it happen.
  2176 + * Wait for the grace period. Do so interruptibly to avoid messing
  2177 + * up the load average.
2283 2178 */
2284   - if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
2285   - return 1;
2286   -
2287   - /* If no memory, play it safe and keep the CPU around. */
2288   - if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
2289   - return 0;
2290   - cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
2291   - cpumask_clear_cpu(cpu, non_nocb_cpus);
2292   - ret = !cpumask_empty(non_nocb_cpus);
2293   - free_cpumask_var(non_nocb_cpus);
2294   - return ret;
  2179 + trace_rcu_future_gp(rnp, rdp, c, "StartWait");
  2180 + for (;;) {
  2181 + wait_event_interruptible(
  2182 + rnp->nocb_gp_wq[c & 0x1],
  2183 + (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
  2184 + if (likely(d))
  2185 + break;
  2186 + flush_signals(current);
  2187 + trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
  2188 + }
  2189 + trace_rcu_future_gp(rnp, rdp, c, "EndWait");
  2190 + smp_mb(); /* Ensure that CB invocation happens after GP end. */
2295 2191 }
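
rcu_nocb_wait_gp() sleeps until the root rcu_node's ->completed value catches up with the requested grace-period number, using ULONG_CMP_GE() so the test stays correct if the counter wraps. A standalone sketch of that wraparound-safe comparison; the macro is re-derived here for illustration rather than copied from the kernel headers.

#include <limits.h>
#include <stdio.h>

/* Wraparound-safe "a >= b" for free-running counters, modeled on the
 * kernel's ULONG_CMP_GE(): the difference a - b is small iff a is not
 * behind b. */
#define CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long completed = ULONG_MAX - 1;	/* GP counter about to wrap */
	unsigned long c = completed + 3;		/* requested GP, past the wrap */

	printf("done yet? %d\n", CMP_GE(completed, c));	/* 0: keep waiting */

	completed += 3;					/* three grace periods elapse */
	printf("done yet? %d\n", CMP_GE(completed, c));	/* 1: wait is over */

	/* A naive "completed >= c" would have claimed completion immediately,
	 * because the wrapped target c is numerically tiny. */
	return 0;
}

The half-range test relies on the two counters never drifting more than half the counter space apart, which is the usual assumption for grace-period numbers handed to waiters.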
2296 2192  
2297 2193 /*
2298   - * Helper structure for remote registry of RCU callbacks.
2299   - * This is needed for when a no-CBs CPU needs to start a grace period.
2300   - * If it just invokes call_rcu(), the resulting callback will be queued,
2301   - * which can result in deadlock.
2302   - */
2303   -struct rcu_head_remote {
2304   - struct rcu_head *rhp;
2305   - call_rcu_func_t *crf;
2306   - void (*func)(struct rcu_head *rhp);
2307   -};
2308   -
2309   -/*
2310   - * Register a callback as specified by the rcu_head_remote struct.
2311   - * This function is intended to be invoked via smp_call_function_single().
2312   - */
2313   -static void call_rcu_local(void *arg)
2314   -{
2315   - struct rcu_head_remote *rhrp =
2316   - container_of(arg, struct rcu_head_remote, rhp);
2317   -
2318   - rhrp->crf(rhrp->rhp, rhrp->func);
2319   -}
2320   -
2321   -/*
2322   - * Set up an rcu_head_remote structure and then invoke call_rcu_local()
2323   - * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2324   - * smp_call_function_single().
2325   - */
2326   -static void invoke_crf_remote(struct rcu_head *rhp,
2327   - void (*func)(struct rcu_head *rhp),
2328   - call_rcu_func_t crf)
2329   -{
2330   - struct rcu_head_remote rhr;
2331   -
2332   - rhr.rhp = rhp;
2333   - rhr.crf = crf;
2334   - rhr.func = func;
2335   - smp_call_function_single(0, call_rcu_local, &rhr, 1);
2336   -}
2337   -
2338   -/*
2339   - * Helper functions to be passed to wait_rcu_gp(), each of which
2340   - * invokes invoke_crf_remote() to register a callback appropriately.
2341   - */
2342   -static void __maybe_unused
2343   -call_rcu_preempt_remote(struct rcu_head *rhp,
2344   - void (*func)(struct rcu_head *rhp))
2345   -{
2346   - invoke_crf_remote(rhp, func, call_rcu);
2347   -}
2348   -static void call_rcu_bh_remote(struct rcu_head *rhp,
2349   - void (*func)(struct rcu_head *rhp))
2350   -{
2351   - invoke_crf_remote(rhp, func, call_rcu_bh);
2352   -}
2353   -static void call_rcu_sched_remote(struct rcu_head *rhp,
2354   - void (*func)(struct rcu_head *rhp))
2355   -{
2356   - invoke_crf_remote(rhp, func, call_rcu_sched);
2357   -}
2358   -
2359   -/*
2360 2194 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2361 2195 * callbacks queued by the corresponding no-CBs CPU.
2362 2196 */
... ... @@ -2390,7 +2224,7 @@
2390 2224 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2391 2225 ACCESS_ONCE(rdp->nocb_p_count) += c;
2392 2226 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2393   - wait_rcu_gp(rdp->rsp->call_remote);
  2227 + rcu_nocb_wait_gp(rdp);
2394 2228  
2395 2229 /* Each pass through the following loop invokes a callback. */
2396 2230 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
2397 2231  
... ... @@ -2436,33 +2270,42 @@
2436 2270 return;
2437 2271 for_each_cpu(cpu, rcu_nocb_mask) {
2438 2272 rdp = per_cpu_ptr(rsp->rda, cpu);
2439   - t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
  2273 + t = kthread_run(rcu_nocb_kthread, rdp,
  2274 + "rcuo%c/%d", rsp->abbr, cpu);
2440 2275 BUG_ON(IS_ERR(t));
2441 2276 ACCESS_ONCE(rdp->nocb_kthread) = t;
2442 2277 }
2443 2278 }
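
With the "%c" added to the kthread name, each no-CBs kthread is tagged with its flavor's one-character abbreviation as well as its CPU number. A trivial sketch of the resulting names; the abbreviation characters ('p', 's', 'b') are an assumption here, since this hunk only shows rsp->abbr being passed through.

#include <stdio.h>

int main(void)
{
	/* Assumed flavor abbreviations; the diff only shows rsp->abbr in use. */
	char abbrs[] = { 'p', 's', 'b' };

	for (unsigned int f = 0; f < sizeof(abbrs); f++)
		for (int cpu = 0; cpu < 2; cpu++) {
			char comm[16];

			/* Same format string as the kthread_run() call above. */
			snprintf(comm, sizeof(comm), "rcuo%c/%d", abbrs[f], cpu);
			printf("%s\n", comm);	/* e.g. "rcuop/0", "rcuos/1" */
		}
	return 0;
}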
2444 2279  
2445 2280 /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2446   -static void init_nocb_callback_list(struct rcu_data *rdp)
  2281 +static bool init_nocb_callback_list(struct rcu_data *rdp)
2447 2282 {
2448 2283 if (rcu_nocb_mask == NULL ||
2449 2284 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2450   - return;
  2285 + return false;
2451 2286 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
  2287 + return true;
2452 2288 }
2453 2289  
2454   -/* Initialize the ->call_remote fields in the rcu_state structures. */
2455   -static void __init rcu_init_nocb(void)
  2290 +#else /* #ifdef CONFIG_RCU_NOCB_CPU */
  2291 +
  2292 +static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2456 2293 {
2457   -#ifdef CONFIG_PREEMPT_RCU
2458   - rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2459   -#endif /* #ifdef CONFIG_PREEMPT_RCU */
2460   - rcu_bh_state.call_remote = call_rcu_bh_remote;
2461   - rcu_sched_state.call_remote = call_rcu_sched_remote;
  2294 + return 0;
2462 2295 }
2463 2296  
2464   -#else /* #ifdef CONFIG_RCU_NOCB_CPU */
  2297 +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
  2298 +{
  2299 +}
2465 2300  
  2301 +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
  2302 +{
  2303 +}
  2304 +
  2305 +static void rcu_init_one_nocb(struct rcu_node *rnp)
  2306 +{
  2307 +}
  2308 +
2466 2309 static bool is_nocb_cpu(int cpu)
2467 2310 {
2468 2311 return false;
... ... @@ -2480,11 +2323,6 @@
2480 2323 return 0;
2481 2324 }
2482 2325  
2483   -static bool nocb_cpu_expendable(int cpu)
2484   -{
2485   - return 1;
2486   -}
2487   -
2488 2326 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2489 2327 {
2490 2328 }
2491 2329  
... ... @@ -2493,12 +2331,9 @@
2493 2331 {
2494 2332 }
2495 2333  
2496   -static void init_nocb_callback_list(struct rcu_data *rdp)
  2334 +static bool init_nocb_callback_list(struct rcu_data *rdp)
2497 2335 {
2498   -}
2499   -
2500   -static void __init rcu_init_nocb(void)
2501   -{
  2336 + return false;
2502 2337 }
2503 2338  
2504 2339 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
kernel/rcutree_trace.c
... ... @@ -46,8 +46,6 @@
46 46 #define RCU_TREE_NONCORE
47 47 #include "rcutree.h"
48 48  
49   -#define ulong2long(a) (*(long *)(&(a)))
50   -
51 49 static int r_open(struct inode *inode, struct file *file,
52 50 const struct seq_operations *op)
53 51 {