Commit 7cb92499000e3c86dae653077b1465458a039ef6

Authored by Paul E. McKenney
Committed by Paul E. McKenney
1 parent 3842a0832a

rcu: Permit dyntick-idle with callbacks pending

The current implementation of RCU_FAST_NO_HZ prevents CPUs from entering
dyntick-idle state if they have RCU callbacks pending.  Unfortunately,
this has the side-effect of often preventing them from entering this
state, especially if at least one other CPU is not in dyntick-idle state.
However, the resulting per-tick wakeup is wasteful in many cases: if the
CPU has already fully responded to the current RCU grace period, there
will be nothing for it to do until this grace period ends, which will
frequently take several jiffies.

This commit therefore permits a CPU that has done everything that the
current grace period has asked of it (rcu_pending() == 0) even if it
still as RCU callbacks pending.  However, such a CPU posts a timer to
wake it up several jiffies later (6 jiffies, based on experience with
grace-period lengths).  This wakeup is required to handle situations
that can result in all CPUs being in dyntick-idle mode, thus failing
to ever complete the current grace period.  If a CPU wakes up before
the timer goes off, then it cancels that timer, thus avoiding spurious
wakeups.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Showing 4 changed files with 78 additions and 5 deletions Inline Diff

include/trace/events/rcu.h
1 #undef TRACE_SYSTEM 1 #undef TRACE_SYSTEM
2 #define TRACE_SYSTEM rcu 2 #define TRACE_SYSTEM rcu
3 3
4 #if !defined(_TRACE_RCU_H) || defined(TRACE_HEADER_MULTI_READ) 4 #if !defined(_TRACE_RCU_H) || defined(TRACE_HEADER_MULTI_READ)
5 #define _TRACE_RCU_H 5 #define _TRACE_RCU_H
6 6
7 #include <linux/tracepoint.h> 7 #include <linux/tracepoint.h>
8 8
9 /* 9 /*
10 * Tracepoint for start/end markers used for utilization calculations. 10 * Tracepoint for start/end markers used for utilization calculations.
11 * By convention, the string is of the following forms: 11 * By convention, the string is of the following forms:
12 * 12 *
13 * "Start <activity>" -- Mark the start of the specified activity, 13 * "Start <activity>" -- Mark the start of the specified activity,
14 * such as "context switch". Nesting is permitted. 14 * such as "context switch". Nesting is permitted.
15 * "End <activity>" -- Mark the end of the specified activity. 15 * "End <activity>" -- Mark the end of the specified activity.
16 * 16 *
17 * An "@" character within "<activity>" is a comment character: Data 17 * An "@" character within "<activity>" is a comment character: Data
18 * reduction scripts will ignore the "@" and the remainder of the line. 18 * reduction scripts will ignore the "@" and the remainder of the line.
19 */ 19 */
20 TRACE_EVENT(rcu_utilization, 20 TRACE_EVENT(rcu_utilization,
21 21
22 TP_PROTO(char *s), 22 TP_PROTO(char *s),
23 23
24 TP_ARGS(s), 24 TP_ARGS(s),
25 25
26 TP_STRUCT__entry( 26 TP_STRUCT__entry(
27 __field(char *, s) 27 __field(char *, s)
28 ), 28 ),
29 29
30 TP_fast_assign( 30 TP_fast_assign(
31 __entry->s = s; 31 __entry->s = s;
32 ), 32 ),
33 33
34 TP_printk("%s", __entry->s) 34 TP_printk("%s", __entry->s)
35 ); 35 );
36 36
37 #ifdef CONFIG_RCU_TRACE 37 #ifdef CONFIG_RCU_TRACE
38 38
39 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) 39 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
40 40
41 /* 41 /*
42 * Tracepoint for grace-period events: starting and ending a grace 42 * Tracepoint for grace-period events: starting and ending a grace
43 * period ("start" and "end", respectively), a CPU noting the start 43 * period ("start" and "end", respectively), a CPU noting the start
44 * of a new grace period or the end of an old grace period ("cpustart" 44 * of a new grace period or the end of an old grace period ("cpustart"
45 * and "cpuend", respectively), a CPU passing through a quiescent 45 * and "cpuend", respectively), a CPU passing through a quiescent
46 * state ("cpuqs"), a CPU coming online or going offline ("cpuonl" 46 * state ("cpuqs"), a CPU coming online or going offline ("cpuonl"
47 * and "cpuofl", respectively), and a CPU being kicked for being too 47 * and "cpuofl", respectively), and a CPU being kicked for being too
48 * long in dyntick-idle mode ("kick"). 48 * long in dyntick-idle mode ("kick").
49 */ 49 */
50 TRACE_EVENT(rcu_grace_period, 50 TRACE_EVENT(rcu_grace_period,
51 51
52 TP_PROTO(char *rcuname, unsigned long gpnum, char *gpevent), 52 TP_PROTO(char *rcuname, unsigned long gpnum, char *gpevent),
53 53
54 TP_ARGS(rcuname, gpnum, gpevent), 54 TP_ARGS(rcuname, gpnum, gpevent),
55 55
56 TP_STRUCT__entry( 56 TP_STRUCT__entry(
57 __field(char *, rcuname) 57 __field(char *, rcuname)
58 __field(unsigned long, gpnum) 58 __field(unsigned long, gpnum)
59 __field(char *, gpevent) 59 __field(char *, gpevent)
60 ), 60 ),
61 61
62 TP_fast_assign( 62 TP_fast_assign(
63 __entry->rcuname = rcuname; 63 __entry->rcuname = rcuname;
64 __entry->gpnum = gpnum; 64 __entry->gpnum = gpnum;
65 __entry->gpevent = gpevent; 65 __entry->gpevent = gpevent;
66 ), 66 ),
67 67
68 TP_printk("%s %lu %s", 68 TP_printk("%s %lu %s",
69 __entry->rcuname, __entry->gpnum, __entry->gpevent) 69 __entry->rcuname, __entry->gpnum, __entry->gpevent)
70 ); 70 );
71 71
72 /* 72 /*
73 * Tracepoint for grace-period-initialization events. These are 73 * Tracepoint for grace-period-initialization events. These are
74 * distinguished by the type of RCU, the new grace-period number, the 74 * distinguished by the type of RCU, the new grace-period number, the
75 * rcu_node structure level, the starting and ending CPU covered by the 75 * rcu_node structure level, the starting and ending CPU covered by the
76 * rcu_node structure, and the mask of CPUs that will be waited for. 76 * rcu_node structure, and the mask of CPUs that will be waited for.
77 * All but the type of RCU are extracted from the rcu_node structure. 77 * All but the type of RCU are extracted from the rcu_node structure.
78 */ 78 */
79 TRACE_EVENT(rcu_grace_period_init, 79 TRACE_EVENT(rcu_grace_period_init,
80 80
81 TP_PROTO(char *rcuname, unsigned long gpnum, u8 level, 81 TP_PROTO(char *rcuname, unsigned long gpnum, u8 level,
82 int grplo, int grphi, unsigned long qsmask), 82 int grplo, int grphi, unsigned long qsmask),
83 83
84 TP_ARGS(rcuname, gpnum, level, grplo, grphi, qsmask), 84 TP_ARGS(rcuname, gpnum, level, grplo, grphi, qsmask),
85 85
86 TP_STRUCT__entry( 86 TP_STRUCT__entry(
87 __field(char *, rcuname) 87 __field(char *, rcuname)
88 __field(unsigned long, gpnum) 88 __field(unsigned long, gpnum)
89 __field(u8, level) 89 __field(u8, level)
90 __field(int, grplo) 90 __field(int, grplo)
91 __field(int, grphi) 91 __field(int, grphi)
92 __field(unsigned long, qsmask) 92 __field(unsigned long, qsmask)
93 ), 93 ),
94 94
95 TP_fast_assign( 95 TP_fast_assign(
96 __entry->rcuname = rcuname; 96 __entry->rcuname = rcuname;
97 __entry->gpnum = gpnum; 97 __entry->gpnum = gpnum;
98 __entry->level = level; 98 __entry->level = level;
99 __entry->grplo = grplo; 99 __entry->grplo = grplo;
100 __entry->grphi = grphi; 100 __entry->grphi = grphi;
101 __entry->qsmask = qsmask; 101 __entry->qsmask = qsmask;
102 ), 102 ),
103 103
104 TP_printk("%s %lu %u %d %d %lx", 104 TP_printk("%s %lu %u %d %d %lx",
105 __entry->rcuname, __entry->gpnum, __entry->level, 105 __entry->rcuname, __entry->gpnum, __entry->level,
106 __entry->grplo, __entry->grphi, __entry->qsmask) 106 __entry->grplo, __entry->grphi, __entry->qsmask)
107 ); 107 );
108 108
109 /* 109 /*
110 * Tracepoint for tasks blocking within preemptible-RCU read-side 110 * Tracepoint for tasks blocking within preemptible-RCU read-side
111 * critical sections. Track the type of RCU (which one day might 111 * critical sections. Track the type of RCU (which one day might
112 * include SRCU), the grace-period number that the task is blocking 112 * include SRCU), the grace-period number that the task is blocking
113 * (the current or the next), and the task's PID. 113 * (the current or the next), and the task's PID.
114 */ 114 */
115 TRACE_EVENT(rcu_preempt_task, 115 TRACE_EVENT(rcu_preempt_task,
116 116
117 TP_PROTO(char *rcuname, int pid, unsigned long gpnum), 117 TP_PROTO(char *rcuname, int pid, unsigned long gpnum),
118 118
119 TP_ARGS(rcuname, pid, gpnum), 119 TP_ARGS(rcuname, pid, gpnum),
120 120
121 TP_STRUCT__entry( 121 TP_STRUCT__entry(
122 __field(char *, rcuname) 122 __field(char *, rcuname)
123 __field(unsigned long, gpnum) 123 __field(unsigned long, gpnum)
124 __field(int, pid) 124 __field(int, pid)
125 ), 125 ),
126 126
127 TP_fast_assign( 127 TP_fast_assign(
128 __entry->rcuname = rcuname; 128 __entry->rcuname = rcuname;
129 __entry->gpnum = gpnum; 129 __entry->gpnum = gpnum;
130 __entry->pid = pid; 130 __entry->pid = pid;
131 ), 131 ),
132 132
133 TP_printk("%s %lu %d", 133 TP_printk("%s %lu %d",
134 __entry->rcuname, __entry->gpnum, __entry->pid) 134 __entry->rcuname, __entry->gpnum, __entry->pid)
135 ); 135 );
136 136
137 /* 137 /*
138 * Tracepoint for tasks that blocked within a given preemptible-RCU 138 * Tracepoint for tasks that blocked within a given preemptible-RCU
139 * read-side critical section exiting that critical section. Track the 139 * read-side critical section exiting that critical section. Track the
140 * type of RCU (which one day might include SRCU) and the task's PID. 140 * type of RCU (which one day might include SRCU) and the task's PID.
141 */ 141 */
142 TRACE_EVENT(rcu_unlock_preempted_task, 142 TRACE_EVENT(rcu_unlock_preempted_task,
143 143
144 TP_PROTO(char *rcuname, unsigned long gpnum, int pid), 144 TP_PROTO(char *rcuname, unsigned long gpnum, int pid),
145 145
146 TP_ARGS(rcuname, gpnum, pid), 146 TP_ARGS(rcuname, gpnum, pid),
147 147
148 TP_STRUCT__entry( 148 TP_STRUCT__entry(
149 __field(char *, rcuname) 149 __field(char *, rcuname)
150 __field(unsigned long, gpnum) 150 __field(unsigned long, gpnum)
151 __field(int, pid) 151 __field(int, pid)
152 ), 152 ),
153 153
154 TP_fast_assign( 154 TP_fast_assign(
155 __entry->rcuname = rcuname; 155 __entry->rcuname = rcuname;
156 __entry->gpnum = gpnum; 156 __entry->gpnum = gpnum;
157 __entry->pid = pid; 157 __entry->pid = pid;
158 ), 158 ),
159 159
160 TP_printk("%s %lu %d", __entry->rcuname, __entry->gpnum, __entry->pid) 160 TP_printk("%s %lu %d", __entry->rcuname, __entry->gpnum, __entry->pid)
161 ); 161 );
162 162
163 /* 163 /*
164 * Tracepoint for quiescent-state-reporting events. These are 164 * Tracepoint for quiescent-state-reporting events. These are
165 * distinguished by the type of RCU, the grace-period number, the 165 * distinguished by the type of RCU, the grace-period number, the
166 * mask of quiescent lower-level entities, the rcu_node structure level, 166 * mask of quiescent lower-level entities, the rcu_node structure level,
167 * the starting and ending CPU covered by the rcu_node structure, and 167 * the starting and ending CPU covered by the rcu_node structure, and
168 * whether there are any blocked tasks blocking the current grace period. 168 * whether there are any blocked tasks blocking the current grace period.
169 * All but the type of RCU are extracted from the rcu_node structure. 169 * All but the type of RCU are extracted from the rcu_node structure.
170 */ 170 */
171 TRACE_EVENT(rcu_quiescent_state_report, 171 TRACE_EVENT(rcu_quiescent_state_report,
172 172
173 TP_PROTO(char *rcuname, unsigned long gpnum, 173 TP_PROTO(char *rcuname, unsigned long gpnum,
174 unsigned long mask, unsigned long qsmask, 174 unsigned long mask, unsigned long qsmask,
175 u8 level, int grplo, int grphi, int gp_tasks), 175 u8 level, int grplo, int grphi, int gp_tasks),
176 176
177 TP_ARGS(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks), 177 TP_ARGS(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks),
178 178
179 TP_STRUCT__entry( 179 TP_STRUCT__entry(
180 __field(char *, rcuname) 180 __field(char *, rcuname)
181 __field(unsigned long, gpnum) 181 __field(unsigned long, gpnum)
182 __field(unsigned long, mask) 182 __field(unsigned long, mask)
183 __field(unsigned long, qsmask) 183 __field(unsigned long, qsmask)
184 __field(u8, level) 184 __field(u8, level)
185 __field(int, grplo) 185 __field(int, grplo)
186 __field(int, grphi) 186 __field(int, grphi)
187 __field(u8, gp_tasks) 187 __field(u8, gp_tasks)
188 ), 188 ),
189 189
190 TP_fast_assign( 190 TP_fast_assign(
191 __entry->rcuname = rcuname; 191 __entry->rcuname = rcuname;
192 __entry->gpnum = gpnum; 192 __entry->gpnum = gpnum;
193 __entry->mask = mask; 193 __entry->mask = mask;
194 __entry->qsmask = qsmask; 194 __entry->qsmask = qsmask;
195 __entry->level = level; 195 __entry->level = level;
196 __entry->grplo = grplo; 196 __entry->grplo = grplo;
197 __entry->grphi = grphi; 197 __entry->grphi = grphi;
198 __entry->gp_tasks = gp_tasks; 198 __entry->gp_tasks = gp_tasks;
199 ), 199 ),
200 200
201 TP_printk("%s %lu %lx>%lx %u %d %d %u", 201 TP_printk("%s %lu %lx>%lx %u %d %d %u",
202 __entry->rcuname, __entry->gpnum, 202 __entry->rcuname, __entry->gpnum,
203 __entry->mask, __entry->qsmask, __entry->level, 203 __entry->mask, __entry->qsmask, __entry->level,
204 __entry->grplo, __entry->grphi, __entry->gp_tasks) 204 __entry->grplo, __entry->grphi, __entry->gp_tasks)
205 ); 205 );
206 206
207 /* 207 /*
208 * Tracepoint for quiescent states detected by force_quiescent_state(). 208 * Tracepoint for quiescent states detected by force_quiescent_state().
209 * These trace events include the type of RCU, the grace-period number 209 * These trace events include the type of RCU, the grace-period number
210 * that was blocked by the CPU, the CPU itself, and the type of quiescent 210 * that was blocked by the CPU, the CPU itself, and the type of quiescent
211 * state, which can be "dti" for dyntick-idle mode, "ofl" for CPU offline, 211 * state, which can be "dti" for dyntick-idle mode, "ofl" for CPU offline,
212 * or "kick" when kicking a CPU that has been in dyntick-idle mode for 212 * or "kick" when kicking a CPU that has been in dyntick-idle mode for
213 * too long. 213 * too long.
214 */ 214 */
215 TRACE_EVENT(rcu_fqs, 215 TRACE_EVENT(rcu_fqs,
216 216
217 TP_PROTO(char *rcuname, unsigned long gpnum, int cpu, char *qsevent), 217 TP_PROTO(char *rcuname, unsigned long gpnum, int cpu, char *qsevent),
218 218
219 TP_ARGS(rcuname, gpnum, cpu, qsevent), 219 TP_ARGS(rcuname, gpnum, cpu, qsevent),
220 220
221 TP_STRUCT__entry( 221 TP_STRUCT__entry(
222 __field(char *, rcuname) 222 __field(char *, rcuname)
223 __field(unsigned long, gpnum) 223 __field(unsigned long, gpnum)
224 __field(int, cpu) 224 __field(int, cpu)
225 __field(char *, qsevent) 225 __field(char *, qsevent)
226 ), 226 ),
227 227
228 TP_fast_assign( 228 TP_fast_assign(
229 __entry->rcuname = rcuname; 229 __entry->rcuname = rcuname;
230 __entry->gpnum = gpnum; 230 __entry->gpnum = gpnum;
231 __entry->cpu = cpu; 231 __entry->cpu = cpu;
232 __entry->qsevent = qsevent; 232 __entry->qsevent = qsevent;
233 ), 233 ),
234 234
235 TP_printk("%s %lu %d %s", 235 TP_printk("%s %lu %d %s",
236 __entry->rcuname, __entry->gpnum, 236 __entry->rcuname, __entry->gpnum,
237 __entry->cpu, __entry->qsevent) 237 __entry->cpu, __entry->qsevent)
238 ); 238 );
239 239
240 #endif /* #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 240 #endif /* #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) */
241 241
242 /* 242 /*
243 * Tracepoint for dyntick-idle entry/exit events. These take a string 243 * Tracepoint for dyntick-idle entry/exit events. These take a string
244 * as argument: "Start" for entering dyntick-idle mode, "End" for 244 * as argument: "Start" for entering dyntick-idle mode, "End" for
245 * leaving it, "--=" for events moving towards idle, and "++=" for events 245 * leaving it, "--=" for events moving towards idle, and "++=" for events
246 * moving away from idle. "Error on entry: not idle task" and "Error on 246 * moving away from idle. "Error on entry: not idle task" and "Error on
247 * exit: not idle task" indicate that a non-idle task is erroneously 247 * exit: not idle task" indicate that a non-idle task is erroneously
248 * toying with the idle loop. 248 * toying with the idle loop.
249 * 249 *
250 * These events also take a pair of numbers, which indicate the nesting 250 * These events also take a pair of numbers, which indicate the nesting
251 * depth before and after the event of interest. Note that task-related 251 * depth before and after the event of interest. Note that task-related
252 * events use the upper bits of each number, while interrupt-related 252 * events use the upper bits of each number, while interrupt-related
253 * events use the lower bits. 253 * events use the lower bits.
254 */ 254 */
255 TRACE_EVENT(rcu_dyntick, 255 TRACE_EVENT(rcu_dyntick,
256 256
257 TP_PROTO(char *polarity, long long oldnesting, long long newnesting), 257 TP_PROTO(char *polarity, long long oldnesting, long long newnesting),
258 258
259 TP_ARGS(polarity, oldnesting, newnesting), 259 TP_ARGS(polarity, oldnesting, newnesting),
260 260
261 TP_STRUCT__entry( 261 TP_STRUCT__entry(
262 __field(char *, polarity) 262 __field(char *, polarity)
263 __field(long long, oldnesting) 263 __field(long long, oldnesting)
264 __field(long long, newnesting) 264 __field(long long, newnesting)
265 ), 265 ),
266 266
267 TP_fast_assign( 267 TP_fast_assign(
268 __entry->polarity = polarity; 268 __entry->polarity = polarity;
269 __entry->oldnesting = oldnesting; 269 __entry->oldnesting = oldnesting;
270 __entry->newnesting = newnesting; 270 __entry->newnesting = newnesting;
271 ), 271 ),
272 272
273 TP_printk("%s %llx %llx", __entry->polarity, 273 TP_printk("%s %llx %llx", __entry->polarity,
274 __entry->oldnesting, __entry->newnesting) 274 __entry->oldnesting, __entry->newnesting)
275 ); 275 );
276 276
277 /* 277 /*
278 * Tracepoint for RCU preparation for idle, the goal being to get RCU 278 * Tracepoint for RCU preparation for idle, the goal being to get RCU
279 * processing done so that the current CPU can shut off its scheduling 279 * processing done so that the current CPU can shut off its scheduling
280 * clock and enter dyntick-idle mode. One way to accomplish this is 280 * clock and enter dyntick-idle mode. One way to accomplish this is
281 * to drain all RCU callbacks from this CPU, and the other is to have 281 * to drain all RCU callbacks from this CPU, and the other is to have
282 * done everything RCU requires for the current grace period. In this 282 * done everything RCU requires for the current grace period. In this
283 * latter case, the CPU will be awakened at the end of the current grace 283 * latter case, the CPU will be awakened at the end of the current grace
284 * period in order to process the remainder of its callbacks. 284 * period in order to process the remainder of its callbacks.
285 * 285 *
286 * These tracepoints take a string as argument: 286 * These tracepoints take a string as argument:
287 * 287 *
288 * "No callbacks": Nothing to do, no callbacks on this CPU. 288 * "No callbacks": Nothing to do, no callbacks on this CPU.
289 * "In holdoff": Nothing to do, holding off after unsuccessful attempt. 289 * "In holdoff": Nothing to do, holding off after unsuccessful attempt.
290 * "Begin holdoff": Attempt failed, don't retry until next jiffy. 290 * "Begin holdoff": Attempt failed, don't retry until next jiffy.
291 * "Dyntick with callbacks": Entering dyntick-idle despite callbacks.
291 * "More callbacks": Still more callbacks, try again to clear them out. 292 * "More callbacks": Still more callbacks, try again to clear them out.
292 * "Callbacks drained": All callbacks processed, off to dyntick idle! 293 * "Callbacks drained": All callbacks processed, off to dyntick idle!
293 * "CPU awakened at GP end": 294 * "Timer": Timer fired to cause CPU to continue processing callbacks.
294 */ 295 */
295 TRACE_EVENT(rcu_prep_idle, 296 TRACE_EVENT(rcu_prep_idle,
296 297
297 TP_PROTO(char *reason), 298 TP_PROTO(char *reason),
298 299
299 TP_ARGS(reason), 300 TP_ARGS(reason),
300 301
301 TP_STRUCT__entry( 302 TP_STRUCT__entry(
302 __field(char *, reason) 303 __field(char *, reason)
303 ), 304 ),
304 305
305 TP_fast_assign( 306 TP_fast_assign(
306 __entry->reason = reason; 307 __entry->reason = reason;
307 ), 308 ),
308 309
309 TP_printk("%s", __entry->reason) 310 TP_printk("%s", __entry->reason)
310 ); 311 );
311 312
312 /* 313 /*
313 * Tracepoint for the registration of a single RCU callback function. 314 * Tracepoint for the registration of a single RCU callback function.
314 * The first argument is the type of RCU, the second argument is 315 * The first argument is the type of RCU, the second argument is
315 * a pointer to the RCU callback itself, and the third element is the 316 * a pointer to the RCU callback itself, and the third element is the
316 * new RCU callback queue length for the current CPU. 317 * new RCU callback queue length for the current CPU.
317 */ 318 */
318 TRACE_EVENT(rcu_callback, 319 TRACE_EVENT(rcu_callback,
319 320
320 TP_PROTO(char *rcuname, struct rcu_head *rhp, long qlen), 321 TP_PROTO(char *rcuname, struct rcu_head *rhp, long qlen),
321 322
322 TP_ARGS(rcuname, rhp, qlen), 323 TP_ARGS(rcuname, rhp, qlen),
323 324
324 TP_STRUCT__entry( 325 TP_STRUCT__entry(
325 __field(char *, rcuname) 326 __field(char *, rcuname)
326 __field(void *, rhp) 327 __field(void *, rhp)
327 __field(void *, func) 328 __field(void *, func)
328 __field(long, qlen) 329 __field(long, qlen)
329 ), 330 ),
330 331
331 TP_fast_assign( 332 TP_fast_assign(
332 __entry->rcuname = rcuname; 333 __entry->rcuname = rcuname;
333 __entry->rhp = rhp; 334 __entry->rhp = rhp;
334 __entry->func = rhp->func; 335 __entry->func = rhp->func;
335 __entry->qlen = qlen; 336 __entry->qlen = qlen;
336 ), 337 ),
337 338
338 TP_printk("%s rhp=%p func=%pf %ld", 339 TP_printk("%s rhp=%p func=%pf %ld",
339 __entry->rcuname, __entry->rhp, __entry->func, __entry->qlen) 340 __entry->rcuname, __entry->rhp, __entry->func, __entry->qlen)
340 ); 341 );
341 342
342 /* 343 /*
343 * Tracepoint for the registration of a single RCU callback of the special 344 * Tracepoint for the registration of a single RCU callback of the special
344 * kfree() form. The first argument is the RCU type, the second argument 345 * kfree() form. The first argument is the RCU type, the second argument
345 * is a pointer to the RCU callback, the third argument is the offset 346 * is a pointer to the RCU callback, the third argument is the offset
346 * of the callback within the enclosing RCU-protected data structure, 347 * of the callback within the enclosing RCU-protected data structure,
347 * and the fourth argument is the new RCU callback queue length for the 348 * and the fourth argument is the new RCU callback queue length for the
348 * current CPU. 349 * current CPU.
349 */ 350 */
350 TRACE_EVENT(rcu_kfree_callback, 351 TRACE_EVENT(rcu_kfree_callback,
351 352
352 TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset, 353 TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset,
353 long qlen), 354 long qlen),
354 355
355 TP_ARGS(rcuname, rhp, offset, qlen), 356 TP_ARGS(rcuname, rhp, offset, qlen),
356 357
357 TP_STRUCT__entry( 358 TP_STRUCT__entry(
358 __field(char *, rcuname) 359 __field(char *, rcuname)
359 __field(void *, rhp) 360 __field(void *, rhp)
360 __field(unsigned long, offset) 361 __field(unsigned long, offset)
361 __field(long, qlen) 362 __field(long, qlen)
362 ), 363 ),
363 364
364 TP_fast_assign( 365 TP_fast_assign(
365 __entry->rcuname = rcuname; 366 __entry->rcuname = rcuname;
366 __entry->rhp = rhp; 367 __entry->rhp = rhp;
367 __entry->offset = offset; 368 __entry->offset = offset;
368 __entry->qlen = qlen; 369 __entry->qlen = qlen;
369 ), 370 ),
370 371
371 TP_printk("%s rhp=%p func=%ld %ld", 372 TP_printk("%s rhp=%p func=%ld %ld",
372 __entry->rcuname, __entry->rhp, __entry->offset, 373 __entry->rcuname, __entry->rhp, __entry->offset,
373 __entry->qlen) 374 __entry->qlen)
374 ); 375 );
375 376
376 /* 377 /*
377 * Tracepoint for marking the beginning rcu_do_batch, performed to start 378 * Tracepoint for marking the beginning rcu_do_batch, performed to start
378 * RCU callback invocation. The first argument is the RCU flavor, 379 * RCU callback invocation. The first argument is the RCU flavor,
379 * the second is the total number of callbacks (including those that 380 * the second is the total number of callbacks (including those that
380 * are not yet ready to be invoked), and the third argument is the 381 * are not yet ready to be invoked), and the third argument is the
381 * current RCU-callback batch limit. 382 * current RCU-callback batch limit.
382 */ 383 */
383 TRACE_EVENT(rcu_batch_start, 384 TRACE_EVENT(rcu_batch_start,
384 385
385 TP_PROTO(char *rcuname, long qlen, int blimit), 386 TP_PROTO(char *rcuname, long qlen, int blimit),
386 387
387 TP_ARGS(rcuname, qlen, blimit), 388 TP_ARGS(rcuname, qlen, blimit),
388 389
389 TP_STRUCT__entry( 390 TP_STRUCT__entry(
390 __field(char *, rcuname) 391 __field(char *, rcuname)
391 __field(long, qlen) 392 __field(long, qlen)
392 __field(int, blimit) 393 __field(int, blimit)
393 ), 394 ),
394 395
395 TP_fast_assign( 396 TP_fast_assign(
396 __entry->rcuname = rcuname; 397 __entry->rcuname = rcuname;
397 __entry->qlen = qlen; 398 __entry->qlen = qlen;
398 __entry->blimit = blimit; 399 __entry->blimit = blimit;
399 ), 400 ),
400 401
401 TP_printk("%s CBs=%ld bl=%d", 402 TP_printk("%s CBs=%ld bl=%d",
402 __entry->rcuname, __entry->qlen, __entry->blimit) 403 __entry->rcuname, __entry->qlen, __entry->blimit)
403 ); 404 );
404 405
405 /* 406 /*
406 * Tracepoint for the invocation of a single RCU callback function. 407 * Tracepoint for the invocation of a single RCU callback function.
407 * The first argument is the type of RCU, and the second argument is 408 * The first argument is the type of RCU, and the second argument is
408 * a pointer to the RCU callback itself. 409 * a pointer to the RCU callback itself.
409 */ 410 */
410 TRACE_EVENT(rcu_invoke_callback, 411 TRACE_EVENT(rcu_invoke_callback,
411 412
412 TP_PROTO(char *rcuname, struct rcu_head *rhp), 413 TP_PROTO(char *rcuname, struct rcu_head *rhp),
413 414
414 TP_ARGS(rcuname, rhp), 415 TP_ARGS(rcuname, rhp),
415 416
416 TP_STRUCT__entry( 417 TP_STRUCT__entry(
417 __field(char *, rcuname) 418 __field(char *, rcuname)
418 __field(void *, rhp) 419 __field(void *, rhp)
419 __field(void *, func) 420 __field(void *, func)
420 ), 421 ),
421 422
422 TP_fast_assign( 423 TP_fast_assign(
423 __entry->rcuname = rcuname; 424 __entry->rcuname = rcuname;
424 __entry->rhp = rhp; 425 __entry->rhp = rhp;
425 __entry->func = rhp->func; 426 __entry->func = rhp->func;
426 ), 427 ),
427 428
428 TP_printk("%s rhp=%p func=%pf", 429 TP_printk("%s rhp=%p func=%pf",
429 __entry->rcuname, __entry->rhp, __entry->func) 430 __entry->rcuname, __entry->rhp, __entry->func)
430 ); 431 );
431 432
432 /* 433 /*
433 * Tracepoint for the invocation of a single RCU callback of the special 434 * Tracepoint for the invocation of a single RCU callback of the special
434 * kfree() form. The first argument is the RCU flavor, the second 435 * kfree() form. The first argument is the RCU flavor, the second
435 * argument is a pointer to the RCU callback, and the third argument 436 * argument is a pointer to the RCU callback, and the third argument
436 * is the offset of the callback within the enclosing RCU-protected 437 * is the offset of the callback within the enclosing RCU-protected
437 * data structure. 438 * data structure.
438 */ 439 */
439 TRACE_EVENT(rcu_invoke_kfree_callback, 440 TRACE_EVENT(rcu_invoke_kfree_callback,
440 441
441 TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset), 442 TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset),
442 443
443 TP_ARGS(rcuname, rhp, offset), 444 TP_ARGS(rcuname, rhp, offset),
444 445
445 TP_STRUCT__entry( 446 TP_STRUCT__entry(
446 __field(char *, rcuname) 447 __field(char *, rcuname)
447 __field(void *, rhp) 448 __field(void *, rhp)
448 __field(unsigned long, offset) 449 __field(unsigned long, offset)
449 ), 450 ),
450 451
451 TP_fast_assign( 452 TP_fast_assign(
452 __entry->rcuname = rcuname; 453 __entry->rcuname = rcuname;
453 __entry->rhp = rhp; 454 __entry->rhp = rhp;
454 __entry->offset = offset; 455 __entry->offset = offset;
455 ), 456 ),
456 457
457 TP_printk("%s rhp=%p func=%ld", 458 TP_printk("%s rhp=%p func=%ld",
458 __entry->rcuname, __entry->rhp, __entry->offset) 459 __entry->rcuname, __entry->rhp, __entry->offset)
459 ); 460 );
460 461
461 /* 462 /*
462 * Tracepoint for exiting rcu_do_batch after RCU callbacks have been 463 * Tracepoint for exiting rcu_do_batch after RCU callbacks have been
463 * invoked. The first argument is the name of the RCU flavor and 464 * invoked. The first argument is the name of the RCU flavor and
464 * the second argument is number of callbacks actually invoked. 465 * the second argument is number of callbacks actually invoked.
465 */ 466 */
466 TRACE_EVENT(rcu_batch_end, 467 TRACE_EVENT(rcu_batch_end,
467 468
468 TP_PROTO(char *rcuname, int callbacks_invoked), 469 TP_PROTO(char *rcuname, int callbacks_invoked),
469 470
470 TP_ARGS(rcuname, callbacks_invoked), 471 TP_ARGS(rcuname, callbacks_invoked),
471 472
472 TP_STRUCT__entry( 473 TP_STRUCT__entry(
473 __field(char *, rcuname) 474 __field(char *, rcuname)
474 __field(int, callbacks_invoked) 475 __field(int, callbacks_invoked)
475 ), 476 ),
476 477
477 TP_fast_assign( 478 TP_fast_assign(
478 __entry->rcuname = rcuname; 479 __entry->rcuname = rcuname;
479 __entry->callbacks_invoked = callbacks_invoked; 480 __entry->callbacks_invoked = callbacks_invoked;
480 ), 481 ),
481 482
482 TP_printk("%s CBs-invoked=%d", 483 TP_printk("%s CBs-invoked=%d",
483 __entry->rcuname, __entry->callbacks_invoked) 484 __entry->rcuname, __entry->callbacks_invoked)
484 ); 485 );
485 486
486 /* 487 /*
487 * Tracepoint for rcutorture readers. The first argument is the name 488 * Tracepoint for rcutorture readers. The first argument is the name
488 * of the RCU flavor from rcutorture's viewpoint and the second argument 489 * of the RCU flavor from rcutorture's viewpoint and the second argument
489 * is the callback address. 490 * is the callback address.
490 */ 491 */
491 TRACE_EVENT(rcu_torture_read, 492 TRACE_EVENT(rcu_torture_read,
492 493
493 TP_PROTO(char *rcutorturename, struct rcu_head *rhp), 494 TP_PROTO(char *rcutorturename, struct rcu_head *rhp),
494 495
495 TP_ARGS(rcutorturename, rhp), 496 TP_ARGS(rcutorturename, rhp),
496 497
497 TP_STRUCT__entry( 498 TP_STRUCT__entry(
498 __field(char *, rcutorturename) 499 __field(char *, rcutorturename)
499 __field(struct rcu_head *, rhp) 500 __field(struct rcu_head *, rhp)
500 ), 501 ),
501 502
502 TP_fast_assign( 503 TP_fast_assign(
503 __entry->rcutorturename = rcutorturename; 504 __entry->rcutorturename = rcutorturename;
504 __entry->rhp = rhp; 505 __entry->rhp = rhp;
505 ), 506 ),
506 507
507 TP_printk("%s torture read %p", 508 TP_printk("%s torture read %p",
508 __entry->rcutorturename, __entry->rhp) 509 __entry->rcutorturename, __entry->rhp)
509 ); 510 );
510 511
511 #else /* #ifdef CONFIG_RCU_TRACE */ 512 #else /* #ifdef CONFIG_RCU_TRACE */
512 513
513 #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) 514 #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0)
514 #define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, qsmask) do { } while (0) 515 #define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, qsmask) do { } while (0)
515 #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) 516 #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0)
516 #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) 517 #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0)
517 #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) 518 #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0)
518 #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) 519 #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0)
519 #define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0) 520 #define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0)
520 #define trace_rcu_prep_idle(reason) do { } while (0) 521 #define trace_rcu_prep_idle(reason) do { } while (0)
521 #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) 522 #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0)
522 #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) 523 #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0)
523 #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) 524 #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0)
524 #define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0) 525 #define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0)
525 #define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0) 526 #define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0)
526 #define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0) 527 #define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0)
527 #define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) 528 #define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
528 529
529 #endif /* #else #ifdef CONFIG_RCU_TRACE */ 530 #endif /* #else #ifdef CONFIG_RCU_TRACE */
530 531
531 #endif /* _TRACE_RCU_H */ 532 #endif /* _TRACE_RCU_H */
532 533
533 /* This part must be outside protection */ 534 /* This part must be outside protection */
534 #include <trace/define_trace.h> 535 #include <trace/define_trace.h>
535 536
1 /* 1 /*
2 * Read-Copy Update mechanism for mutual exclusion 2 * Read-Copy Update mechanism for mutual exclusion
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or 6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version. 7 * (at your option) any later version.
8 * 8 *
9 * This program is distributed in the hope that it will be useful, 9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright IBM Corporation, 2008 18 * Copyright IBM Corporation, 2008
19 * 19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com> 21 * Manfred Spraul <manfred@colorfullife.com>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version 22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
23 * 23 *
24 * Based on the original work by Paul McKenney <paulmck@us.ibm.com> 24 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 * 26 *
27 * For detailed explanation of Read-Copy Update mechanism see - 27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU 28 * Documentation/RCU
29 */ 29 */
30 #include <linux/types.h> 30 #include <linux/types.h>
31 #include <linux/kernel.h> 31 #include <linux/kernel.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/spinlock.h> 33 #include <linux/spinlock.h>
34 #include <linux/smp.h> 34 #include <linux/smp.h>
35 #include <linux/rcupdate.h> 35 #include <linux/rcupdate.h>
36 #include <linux/interrupt.h> 36 #include <linux/interrupt.h>
37 #include <linux/sched.h> 37 #include <linux/sched.h>
38 #include <linux/nmi.h> 38 #include <linux/nmi.h>
39 #include <linux/atomic.h> 39 #include <linux/atomic.h>
40 #include <linux/bitops.h> 40 #include <linux/bitops.h>
41 #include <linux/export.h> 41 #include <linux/export.h>
42 #include <linux/completion.h> 42 #include <linux/completion.h>
43 #include <linux/moduleparam.h> 43 #include <linux/moduleparam.h>
44 #include <linux/percpu.h> 44 #include <linux/percpu.h>
45 #include <linux/notifier.h> 45 #include <linux/notifier.h>
46 #include <linux/cpu.h> 46 #include <linux/cpu.h>
47 #include <linux/mutex.h> 47 #include <linux/mutex.h>
48 #include <linux/time.h> 48 #include <linux/time.h>
49 #include <linux/kernel_stat.h> 49 #include <linux/kernel_stat.h>
50 #include <linux/wait.h> 50 #include <linux/wait.h>
51 #include <linux/kthread.h> 51 #include <linux/kthread.h>
52 #include <linux/prefetch.h> 52 #include <linux/prefetch.h>
53 53
54 #include "rcutree.h" 54 #include "rcutree.h"
55 #include <trace/events/rcu.h> 55 #include <trace/events/rcu.h>
56 56
57 #include "rcu.h" 57 #include "rcu.h"
58 58
59 /* Data structures. */ 59 /* Data structures. */
60 60
61 static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 61 static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
62 62
63 #define RCU_STATE_INITIALIZER(structname) { \ 63 #define RCU_STATE_INITIALIZER(structname) { \
64 .level = { &structname##_state.node[0] }, \ 64 .level = { &structname##_state.node[0] }, \
65 .levelcnt = { \ 65 .levelcnt = { \
66 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 66 NUM_RCU_LVL_0, /* root of hierarchy. */ \
67 NUM_RCU_LVL_1, \ 67 NUM_RCU_LVL_1, \
68 NUM_RCU_LVL_2, \ 68 NUM_RCU_LVL_2, \
69 NUM_RCU_LVL_3, \ 69 NUM_RCU_LVL_3, \
70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ 70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
71 }, \ 71 }, \
72 .fqs_state = RCU_GP_IDLE, \ 72 .fqs_state = RCU_GP_IDLE, \
73 .gpnum = -300, \ 73 .gpnum = -300, \
74 .completed = -300, \ 74 .completed = -300, \
75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
76 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ 76 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
77 .n_force_qs = 0, \ 77 .n_force_qs = 0, \
78 .n_force_qs_ngp = 0, \ 78 .n_force_qs_ngp = 0, \
79 .name = #structname, \ 79 .name = #structname, \
80 } 80 }
81 81
82 struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); 82 struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
83 DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 83 DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
84 84
85 struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); 85 struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
86 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 86 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
87 87
88 static struct rcu_state *rcu_state; 88 static struct rcu_state *rcu_state;
89 89
90 /* 90 /*
91 * The rcu_scheduler_active variable transitions from zero to one just 91 * The rcu_scheduler_active variable transitions from zero to one just
92 * before the first task is spawned. So when this variable is zero, RCU 92 * before the first task is spawned. So when this variable is zero, RCU
93 * can assume that there is but one task, allowing RCU to (for example) 93 * can assume that there is but one task, allowing RCU to (for example)
94 * optimized synchronize_sched() to a simple barrier(). When this variable 94 * optimized synchronize_sched() to a simple barrier(). When this variable
95 * is one, RCU must actually do all the hard work required to detect real 95 * is one, RCU must actually do all the hard work required to detect real
96 * grace periods. This variable is also used to suppress boot-time false 96 * grace periods. This variable is also used to suppress boot-time false
97 * positives from lockdep-RCU error checking. 97 * positives from lockdep-RCU error checking.
98 */ 98 */
99 int rcu_scheduler_active __read_mostly; 99 int rcu_scheduler_active __read_mostly;
100 EXPORT_SYMBOL_GPL(rcu_scheduler_active); 100 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
101 101
102 /* 102 /*
103 * The rcu_scheduler_fully_active variable transitions from zero to one 103 * The rcu_scheduler_fully_active variable transitions from zero to one
104 * during the early_initcall() processing, which is after the scheduler 104 * during the early_initcall() processing, which is after the scheduler
105 * is capable of creating new tasks. So RCU processing (for example, 105 * is capable of creating new tasks. So RCU processing (for example,
106 * creating tasks for RCU priority boosting) must be delayed until after 106 * creating tasks for RCU priority boosting) must be delayed until after
107 * rcu_scheduler_fully_active transitions from zero to one. We also 107 * rcu_scheduler_fully_active transitions from zero to one. We also
108 * currently delay invocation of any RCU callbacks until after this point. 108 * currently delay invocation of any RCU callbacks until after this point.
109 * 109 *
110 * It might later prove better for people registering RCU callbacks during 110 * It might later prove better for people registering RCU callbacks during
111 * early boot to take responsibility for these callbacks, but one step at 111 * early boot to take responsibility for these callbacks, but one step at
112 * a time. 112 * a time.
113 */ 113 */
114 static int rcu_scheduler_fully_active __read_mostly; 114 static int rcu_scheduler_fully_active __read_mostly;
115 115
116 #ifdef CONFIG_RCU_BOOST 116 #ifdef CONFIG_RCU_BOOST
117 117
118 /* 118 /*
119 * Control variables for per-CPU and per-rcu_node kthreads. These 119 * Control variables for per-CPU and per-rcu_node kthreads. These
120 * handle all flavors of RCU. 120 * handle all flavors of RCU.
121 */ 121 */
122 static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); 122 static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
123 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 123 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
124 DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); 124 DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
125 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); 125 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
126 DEFINE_PER_CPU(char, rcu_cpu_has_work); 126 DEFINE_PER_CPU(char, rcu_cpu_has_work);
127 127
128 #endif /* #ifdef CONFIG_RCU_BOOST */ 128 #endif /* #ifdef CONFIG_RCU_BOOST */
129 129
130 static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 130 static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
131 static void invoke_rcu_core(void); 131 static void invoke_rcu_core(void);
132 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 132 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
133 133
134 /* 134 /*
135 * Track the rcutorture test sequence number and the update version 135 * Track the rcutorture test sequence number and the update version
136 * number within a given test. The rcutorture_testseq is incremented 136 * number within a given test. The rcutorture_testseq is incremented
137 * on every rcutorture module load and unload, so has an odd value 137 * on every rcutorture module load and unload, so has an odd value
138 * when a test is running. The rcutorture_vernum is set to zero 138 * when a test is running. The rcutorture_vernum is set to zero
139 * when rcutorture starts and is incremented on each rcutorture update. 139 * when rcutorture starts and is incremented on each rcutorture update.
140 * These variables enable correlating rcutorture output with the 140 * These variables enable correlating rcutorture output with the
141 * RCU tracing information. 141 * RCU tracing information.
142 */ 142 */
143 unsigned long rcutorture_testseq; 143 unsigned long rcutorture_testseq;
144 unsigned long rcutorture_vernum; 144 unsigned long rcutorture_vernum;
145 145
146 /* 146 /*
147 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 147 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
148 * permit this function to be invoked without holding the root rcu_node 148 * permit this function to be invoked without holding the root rcu_node
149 * structure's ->lock, but of course results can be subject to change. 149 * structure's ->lock, but of course results can be subject to change.
150 */ 150 */
151 static int rcu_gp_in_progress(struct rcu_state *rsp) 151 static int rcu_gp_in_progress(struct rcu_state *rsp)
152 { 152 {
153 return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); 153 return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
154 } 154 }
155 155
156 /* 156 /*
157 * Note a quiescent state. Because we do not need to know 157 * Note a quiescent state. Because we do not need to know
158 * how many quiescent states passed, just if there was at least 158 * how many quiescent states passed, just if there was at least
159 * one since the start of the grace period, this just sets a flag. 159 * one since the start of the grace period, this just sets a flag.
160 * The caller must have disabled preemption. 160 * The caller must have disabled preemption.
161 */ 161 */
162 void rcu_sched_qs(int cpu) 162 void rcu_sched_qs(int cpu)
163 { 163 {
164 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 164 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
165 165
166 rdp->passed_quiesce_gpnum = rdp->gpnum; 166 rdp->passed_quiesce_gpnum = rdp->gpnum;
167 barrier(); 167 barrier();
168 if (rdp->passed_quiesce == 0) 168 if (rdp->passed_quiesce == 0)
169 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); 169 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
170 rdp->passed_quiesce = 1; 170 rdp->passed_quiesce = 1;
171 } 171 }
172 172
173 void rcu_bh_qs(int cpu) 173 void rcu_bh_qs(int cpu)
174 { 174 {
175 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 175 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
176 176
177 rdp->passed_quiesce_gpnum = rdp->gpnum; 177 rdp->passed_quiesce_gpnum = rdp->gpnum;
178 barrier(); 178 barrier();
179 if (rdp->passed_quiesce == 0) 179 if (rdp->passed_quiesce == 0)
180 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); 180 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
181 rdp->passed_quiesce = 1; 181 rdp->passed_quiesce = 1;
182 } 182 }
183 183
184 /* 184 /*
185 * Note a context switch. This is a quiescent state for RCU-sched, 185 * Note a context switch. This is a quiescent state for RCU-sched,
186 * and requires special handling for preemptible RCU. 186 * and requires special handling for preemptible RCU.
187 * The caller must have disabled preemption. 187 * The caller must have disabled preemption.
188 */ 188 */
189 void rcu_note_context_switch(int cpu) 189 void rcu_note_context_switch(int cpu)
190 { 190 {
191 trace_rcu_utilization("Start context switch"); 191 trace_rcu_utilization("Start context switch");
192 rcu_sched_qs(cpu); 192 rcu_sched_qs(cpu);
193 rcu_preempt_note_context_switch(cpu); 193 rcu_preempt_note_context_switch(cpu);
194 trace_rcu_utilization("End context switch"); 194 trace_rcu_utilization("End context switch");
195 } 195 }
196 EXPORT_SYMBOL_GPL(rcu_note_context_switch); 196 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
197 197
198 DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 198 DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
199 .dynticks_nesting = DYNTICK_TASK_NESTING, 199 .dynticks_nesting = DYNTICK_TASK_NESTING,
200 .dynticks = ATOMIC_INIT(1), 200 .dynticks = ATOMIC_INIT(1),
201 }; 201 };
202 202
203 static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 203 static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
204 static int qhimark = 10000; /* If this many pending, ignore blimit. */ 204 static int qhimark = 10000; /* If this many pending, ignore blimit. */
205 static int qlowmark = 100; /* Once only this many pending, use blimit. */ 205 static int qlowmark = 100; /* Once only this many pending, use blimit. */
206 206
207 module_param(blimit, int, 0); 207 module_param(blimit, int, 0);
208 module_param(qhimark, int, 0); 208 module_param(qhimark, int, 0);
209 module_param(qlowmark, int, 0); 209 module_param(qlowmark, int, 0);
210 210
211 int rcu_cpu_stall_suppress __read_mostly; 211 int rcu_cpu_stall_suppress __read_mostly;
212 module_param(rcu_cpu_stall_suppress, int, 0644); 212 module_param(rcu_cpu_stall_suppress, int, 0644);
213 213
214 static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 214 static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
215 static int rcu_pending(int cpu); 215 static int rcu_pending(int cpu);
216 216
217 /* 217 /*
218 * Return the number of RCU-sched batches processed thus far for debug & stats. 218 * Return the number of RCU-sched batches processed thus far for debug & stats.
219 */ 219 */
220 long rcu_batches_completed_sched(void) 220 long rcu_batches_completed_sched(void)
221 { 221 {
222 return rcu_sched_state.completed; 222 return rcu_sched_state.completed;
223 } 223 }
224 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); 224 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
225 225
226 /* 226 /*
227 * Return the number of RCU BH batches processed thus far for debug & stats. 227 * Return the number of RCU BH batches processed thus far for debug & stats.
228 */ 228 */
229 long rcu_batches_completed_bh(void) 229 long rcu_batches_completed_bh(void)
230 { 230 {
231 return rcu_bh_state.completed; 231 return rcu_bh_state.completed;
232 } 232 }
233 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 233 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
234 234
235 /* 235 /*
236 * Force a quiescent state for RCU BH. 236 * Force a quiescent state for RCU BH.
237 */ 237 */
238 void rcu_bh_force_quiescent_state(void) 238 void rcu_bh_force_quiescent_state(void)
239 { 239 {
240 force_quiescent_state(&rcu_bh_state, 0); 240 force_quiescent_state(&rcu_bh_state, 0);
241 } 241 }
242 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 242 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
243 243
244 /* 244 /*
245 * Record the number of times rcutorture tests have been initiated and 245 * Record the number of times rcutorture tests have been initiated and
246 * terminated. This information allows the debugfs tracing stats to be 246 * terminated. This information allows the debugfs tracing stats to be
247 * correlated to the rcutorture messages, even when the rcutorture module 247 * correlated to the rcutorture messages, even when the rcutorture module
248 * is being repeatedly loaded and unloaded. In other words, we cannot 248 * is being repeatedly loaded and unloaded. In other words, we cannot
249 * store this state in rcutorture itself. 249 * store this state in rcutorture itself.
250 */ 250 */
251 void rcutorture_record_test_transition(void) 251 void rcutorture_record_test_transition(void)
252 { 252 {
253 rcutorture_testseq++; 253 rcutorture_testseq++;
254 rcutorture_vernum = 0; 254 rcutorture_vernum = 0;
255 } 255 }
256 EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); 256 EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
257 257
258 /* 258 /*
259 * Record the number of writer passes through the current rcutorture test. 259 * Record the number of writer passes through the current rcutorture test.
260 * This is also used to correlate debugfs tracing stats with the rcutorture 260 * This is also used to correlate debugfs tracing stats with the rcutorture
261 * messages. 261 * messages.
262 */ 262 */
263 void rcutorture_record_progress(unsigned long vernum) 263 void rcutorture_record_progress(unsigned long vernum)
264 { 264 {
265 rcutorture_vernum++; 265 rcutorture_vernum++;
266 } 266 }
267 EXPORT_SYMBOL_GPL(rcutorture_record_progress); 267 EXPORT_SYMBOL_GPL(rcutorture_record_progress);
268 268
269 /* 269 /*
270 * Force a quiescent state for RCU-sched. 270 * Force a quiescent state for RCU-sched.
271 */ 271 */
272 void rcu_sched_force_quiescent_state(void) 272 void rcu_sched_force_quiescent_state(void)
273 { 273 {
274 force_quiescent_state(&rcu_sched_state, 0); 274 force_quiescent_state(&rcu_sched_state, 0);
275 } 275 }
276 EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); 276 EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
277 277
278 /* 278 /*
279 * Does the CPU have callbacks ready to be invoked? 279 * Does the CPU have callbacks ready to be invoked?
280 */ 280 */
281 static int 281 static int
282 cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) 282 cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
283 { 283 {
284 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; 284 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
285 } 285 }
286 286
287 /* 287 /*
288 * Does the current CPU require a yet-as-unscheduled grace period? 288 * Does the current CPU require a yet-as-unscheduled grace period?
289 */ 289 */
290 static int 290 static int
291 cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 291 cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
292 { 292 {
293 return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); 293 return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp);
294 } 294 }
295 295
296 /* 296 /*
297 * Return the root node of the specified rcu_state structure. 297 * Return the root node of the specified rcu_state structure.
298 */ 298 */
299 static struct rcu_node *rcu_get_root(struct rcu_state *rsp) 299 static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
300 { 300 {
301 return &rsp->node[0]; 301 return &rsp->node[0];
302 } 302 }
303 303
304 #ifdef CONFIG_SMP 304 #ifdef CONFIG_SMP
305 305
306 /* 306 /*
307 * If the specified CPU is offline, tell the caller that it is in 307 * If the specified CPU is offline, tell the caller that it is in
308 * a quiescent state. Otherwise, whack it with a reschedule IPI. 308 * a quiescent state. Otherwise, whack it with a reschedule IPI.
309 * Grace periods can end up waiting on an offline CPU when that 309 * Grace periods can end up waiting on an offline CPU when that
310 * CPU is in the process of coming online -- it will be added to the 310 * CPU is in the process of coming online -- it will be added to the
311 * rcu_node bitmasks before it actually makes it online. The same thing 311 * rcu_node bitmasks before it actually makes it online. The same thing
312 * can happen while a CPU is in the process of coming online. Because this 312 * can happen while a CPU is in the process of coming online. Because this
313 * race is quite rare, we check for it after detecting that the grace 313 * race is quite rare, we check for it after detecting that the grace
314 * period has been delayed rather than checking each and every CPU 314 * period has been delayed rather than checking each and every CPU
315 * each and every time we start a new grace period. 315 * each and every time we start a new grace period.
316 */ 316 */
317 static int rcu_implicit_offline_qs(struct rcu_data *rdp) 317 static int rcu_implicit_offline_qs(struct rcu_data *rdp)
318 { 318 {
319 /* 319 /*
320 * If the CPU is offline, it is in a quiescent state. We can 320 * If the CPU is offline, it is in a quiescent state. We can
321 * trust its state not to change because interrupts are disabled. 321 * trust its state not to change because interrupts are disabled.
322 */ 322 */
323 if (cpu_is_offline(rdp->cpu)) { 323 if (cpu_is_offline(rdp->cpu)) {
324 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); 324 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
325 rdp->offline_fqs++; 325 rdp->offline_fqs++;
326 return 1; 326 return 1;
327 } 327 }
328 328
329 /* 329 /*
330 * The CPU is online, so send it a reschedule IPI. This forces 330 * The CPU is online, so send it a reschedule IPI. This forces
331 * it through the scheduler, and (inefficiently) also handles cases 331 * it through the scheduler, and (inefficiently) also handles cases
332 * where idle loops fail to inform RCU about the CPU being idle. 332 * where idle loops fail to inform RCU about the CPU being idle.
333 */ 333 */
334 if (rdp->cpu != smp_processor_id()) 334 if (rdp->cpu != smp_processor_id())
335 smp_send_reschedule(rdp->cpu); 335 smp_send_reschedule(rdp->cpu);
336 else 336 else
337 set_need_resched(); 337 set_need_resched();
338 rdp->resched_ipi++; 338 rdp->resched_ipi++;
339 return 0; 339 return 0;
340 } 340 }
341 341
342 #endif /* #ifdef CONFIG_SMP */ 342 #endif /* #ifdef CONFIG_SMP */
343 343
344 /* 344 /*
345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle 345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
346 * 346 *
347 * If the new value of the ->dynticks_nesting counter now is zero, 347 * If the new value of the ->dynticks_nesting counter now is zero,
348 * we really have entered idle, and must do the appropriate accounting. 348 * we really have entered idle, and must do the appropriate accounting.
349 * The caller must have disabled interrupts. 349 * The caller must have disabled interrupts.
350 */ 350 */
351 static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) 351 static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
352 { 352 {
353 if (rdtp->dynticks_nesting) { 353 if (rdtp->dynticks_nesting) {
354 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); 354 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
355 return; 355 return;
356 } 356 }
357 trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); 357 trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
358 if (!is_idle_task(current)) { 358 if (!is_idle_task(current)) {
359 struct task_struct *idle = idle_task(smp_processor_id()); 359 struct task_struct *idle = idle_task(smp_processor_id());
360 360
361 trace_rcu_dyntick("Error on entry: not idle task", 361 trace_rcu_dyntick("Error on entry: not idle task",
362 oldval, rdtp->dynticks_nesting); 362 oldval, rdtp->dynticks_nesting);
363 ftrace_dump(DUMP_ALL); 363 ftrace_dump(DUMP_ALL);
364 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 364 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
365 current->pid, current->comm, 365 current->pid, current->comm,
366 idle->pid, idle->comm); /* must be idle task! */ 366 idle->pid, idle->comm); /* must be idle task! */
367 } 367 }
368 rcu_prepare_for_idle(smp_processor_id()); 368 rcu_prepare_for_idle(smp_processor_id());
369 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 369 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
370 smp_mb__before_atomic_inc(); /* See above. */ 370 smp_mb__before_atomic_inc(); /* See above. */
371 atomic_inc(&rdtp->dynticks); 371 atomic_inc(&rdtp->dynticks);
372 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ 372 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
373 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 373 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
374 } 374 }
375 375
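rcu_idle_enter_common() relies on the parity of ->dynticks: the counter is incremented exactly once per transition, so an even value means the CPU is in dyntick-idle (an extended quiescent state) and an odd value means it is not, which is what the WARN_ON_ONCE() above checks. A minimal user-space sketch of that convention follows, using C11 atomics in place of the kernel's atomic_t and smp_mb__before/after_atomic_inc() barriers; the function names are made up for the example.

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint dynticks = 1;	/* start odd: not idle */

static void idle_enter(void)
{
	/* acq_rel ordering stands in for the kernel's explicit barriers. */
	unsigned int v = atomic_fetch_add_explicit(&dynticks, 1,
						   memory_order_acq_rel) + 1;
	if (v & 0x1)
		printf("BUG: counter odd after idle entry\n");
}

static void idle_exit(void)
{
	unsigned int v = atomic_fetch_add_explicit(&dynticks, 1,
						   memory_order_acq_rel) + 1;
	if (!(v & 0x1))
		printf("BUG: counter even after idle exit\n");
}

static int cpu_looks_idle(void)
{
	return (atomic_load(&dynticks) & 0x1) == 0;
}

int main(void)
{
	printf("idle? %d\n", cpu_looks_idle());	/* 0: odd, running */
	idle_enter();
	printf("idle? %d\n", cpu_looks_idle());	/* 1: even, idle   */
	idle_exit();
	printf("idle? %d\n", cpu_looks_idle());	/* 0: odd again    */
	return 0;
}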
376 /** 376 /**
377 * rcu_idle_enter - inform RCU that current CPU is entering idle 377 * rcu_idle_enter - inform RCU that current CPU is entering idle
378 * 378 *
379 * Enter idle mode, in other words, -leave- the mode in which RCU 379 * Enter idle mode, in other words, -leave- the mode in which RCU
380 * read-side critical sections can occur. (Though RCU read-side 380 * read-side critical sections can occur. (Though RCU read-side
381 * critical sections can occur in irq handlers in idle, a possibility 381 * critical sections can occur in irq handlers in idle, a possibility
382 * handled by irq_enter() and irq_exit().) 382 * handled by irq_enter() and irq_exit().)
383 * 383 *
384 * We crowbar the ->dynticks_nesting field to zero to allow for 384 * We crowbar the ->dynticks_nesting field to zero to allow for
385 * the possibility of usermode upcalls having messed up our count 385 * the possibility of usermode upcalls having messed up our count
386 * of interrupt nesting level during the prior busy period. 386 * of interrupt nesting level during the prior busy period.
387 */ 387 */
388 void rcu_idle_enter(void) 388 void rcu_idle_enter(void)
389 { 389 {
390 unsigned long flags; 390 unsigned long flags;
391 long long oldval; 391 long long oldval;
392 struct rcu_dynticks *rdtp; 392 struct rcu_dynticks *rdtp;
393 393
394 local_irq_save(flags); 394 local_irq_save(flags);
395 rdtp = &__get_cpu_var(rcu_dynticks); 395 rdtp = &__get_cpu_var(rcu_dynticks);
396 oldval = rdtp->dynticks_nesting; 396 oldval = rdtp->dynticks_nesting;
397 rdtp->dynticks_nesting = 0; 397 rdtp->dynticks_nesting = 0;
398 rcu_idle_enter_common(rdtp, oldval); 398 rcu_idle_enter_common(rdtp, oldval);
399 local_irq_restore(flags); 399 local_irq_restore(flags);
400 } 400 }
401 401
402 /** 402 /**
403 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 403 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
404 * 404 *
405 * Exit from an interrupt handler, which might possibly result in entering 405 * Exit from an interrupt handler, which might possibly result in entering
406 * idle mode, in other words, leaving the mode in which read-side critical 406 * idle mode, in other words, leaving the mode in which read-side critical
407 * sections can occur. 407 * sections can occur.
408 * 408 *
409 * This code assumes that the idle loop never does anything that might 409 * This code assumes that the idle loop never does anything that might
410 * result in unbalanced calls to irq_enter() and irq_exit(). If your 410 * result in unbalanced calls to irq_enter() and irq_exit(). If your
411 * architecture violates this assumption, RCU will give you what you 411 * architecture violates this assumption, RCU will give you what you
412 * deserve, good and hard. But very infrequently and irreproducibly. 412 * deserve, good and hard. But very infrequently and irreproducibly.
413 * 413 *
414 * Use things like work queues to work around this limitation. 414 * Use things like work queues to work around this limitation.
415 * 415 *
416 * You have been warned. 416 * You have been warned.
417 */ 417 */
418 void rcu_irq_exit(void) 418 void rcu_irq_exit(void)
419 { 419 {
420 unsigned long flags; 420 unsigned long flags;
421 long long oldval; 421 long long oldval;
422 struct rcu_dynticks *rdtp; 422 struct rcu_dynticks *rdtp;
423 423
424 local_irq_save(flags); 424 local_irq_save(flags);
425 rdtp = &__get_cpu_var(rcu_dynticks); 425 rdtp = &__get_cpu_var(rcu_dynticks);
426 oldval = rdtp->dynticks_nesting; 426 oldval = rdtp->dynticks_nesting;
427 rdtp->dynticks_nesting--; 427 rdtp->dynticks_nesting--;
428 WARN_ON_ONCE(rdtp->dynticks_nesting < 0); 428 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
429 rcu_idle_enter_common(rdtp, oldval); 429 rcu_idle_enter_common(rdtp, oldval);
430 local_irq_restore(flags); 430 local_irq_restore(flags);
431 } 431 }
432 432
433 /* 433 /*
434 * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle 434 * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
435 * 435 *
436 * If the new value of the ->dynticks_nesting counter was previously zero, 436 * If the new value of the ->dynticks_nesting counter was previously zero,
437 * we really have exited idle, and must do the appropriate accounting. 437 * we really have exited idle, and must do the appropriate accounting.
438 * The caller must have disabled interrupts. 438 * The caller must have disabled interrupts.
439 */ 439 */
440 static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) 440 static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
441 { 441 {
442 if (oldval) { 442 if (oldval) {
443 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); 443 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
444 return; 444 return;
445 } 445 }
446 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 446 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
447 atomic_inc(&rdtp->dynticks); 447 atomic_inc(&rdtp->dynticks);
448 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 448 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
449 smp_mb__after_atomic_inc(); /* See above. */ 449 smp_mb__after_atomic_inc(); /* See above. */
450 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 450 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
451 rcu_cleanup_after_idle(smp_processor_id());
451 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); 452 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
452 if (!is_idle_task(current)) { 453 if (!is_idle_task(current)) {
453 struct task_struct *idle = idle_task(smp_processor_id()); 454 struct task_struct *idle = idle_task(smp_processor_id());
454 455
455 trace_rcu_dyntick("Error on exit: not idle task", 456 trace_rcu_dyntick("Error on exit: not idle task",
456 oldval, rdtp->dynticks_nesting); 457 oldval, rdtp->dynticks_nesting);
457 ftrace_dump(DUMP_ALL); 458 ftrace_dump(DUMP_ALL);
458 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 459 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
459 current->pid, current->comm, 460 current->pid, current->comm,
460 idle->pid, idle->comm); /* must be idle task! */ 461 idle->pid, idle->comm); /* must be idle task! */
461 } 462 }
462 } 463 }
463 464
464 /** 465 /**
465 * rcu_idle_exit - inform RCU that current CPU is leaving idle 466 * rcu_idle_exit - inform RCU that current CPU is leaving idle
466 * 467 *
467 * Exit idle mode, in other words, -enter- the mode in which RCU 468 * Exit idle mode, in other words, -enter- the mode in which RCU
468 * read-side critical sections can occur. 469 * read-side critical sections can occur.
469 * 470 *
470 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to 471 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to
471 * allow for the possibility of usermode upcalls messing up our count 472 * allow for the possibility of usermode upcalls messing up our count
472 * of interrupt nesting level during the busy period that is just 473 * of interrupt nesting level during the busy period that is just
473 * now starting. 474 * now starting.
474 */ 475 */
475 void rcu_idle_exit(void) 476 void rcu_idle_exit(void)
476 { 477 {
477 unsigned long flags; 478 unsigned long flags;
478 struct rcu_dynticks *rdtp; 479 struct rcu_dynticks *rdtp;
479 long long oldval; 480 long long oldval;
480 481
481 local_irq_save(flags); 482 local_irq_save(flags);
482 rdtp = &__get_cpu_var(rcu_dynticks); 483 rdtp = &__get_cpu_var(rcu_dynticks);
483 oldval = rdtp->dynticks_nesting; 484 oldval = rdtp->dynticks_nesting;
484 WARN_ON_ONCE(oldval != 0); 485 WARN_ON_ONCE(oldval != 0);
485 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; 486 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING;
486 rcu_idle_exit_common(rdtp, oldval); 487 rcu_idle_exit_common(rdtp, oldval);
487 local_irq_restore(flags); 488 local_irq_restore(flags);
488 } 489 }
489 490
490 /** 491 /**
491 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle 492 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
492 * 493 *
493 * Enter an interrupt handler, which might possibly result in exiting 494 * Enter an interrupt handler, which might possibly result in exiting
494 * idle mode, in other words, entering the mode in which read-side critical 495 * idle mode, in other words, entering the mode in which read-side critical
495 * sections can occur. 496 * sections can occur.
496 * 497 *
497 * Note that the Linux kernel is fully capable of entering an interrupt 498 * Note that the Linux kernel is fully capable of entering an interrupt
498 * handler that it never exits, for example when doing upcalls to 499 * handler that it never exits, for example when doing upcalls to
499 * user mode! This code assumes that the idle loop never does upcalls to 500 * user mode! This code assumes that the idle loop never does upcalls to
500 * user mode. If your architecture does do upcalls from the idle loop (or 501 * user mode. If your architecture does do upcalls from the idle loop (or
501 * does anything else that results in unbalanced calls to the irq_enter() 502 * does anything else that results in unbalanced calls to the irq_enter()
502 * and irq_exit() functions), RCU will give you what you deserve, good 503 * and irq_exit() functions), RCU will give you what you deserve, good
503 * and hard. But very infrequently and irreproducibly. 504 * and hard. But very infrequently and irreproducibly.
504 * 505 *
505 * Use things like work queues to work around this limitation. 506 * Use things like work queues to work around this limitation.
506 * 507 *
507 * You have been warned. 508 * You have been warned.
508 */ 509 */
509 void rcu_irq_enter(void) 510 void rcu_irq_enter(void)
510 { 511 {
511 unsigned long flags; 512 unsigned long flags;
512 struct rcu_dynticks *rdtp; 513 struct rcu_dynticks *rdtp;
513 long long oldval; 514 long long oldval;
514 515
515 local_irq_save(flags); 516 local_irq_save(flags);
516 rdtp = &__get_cpu_var(rcu_dynticks); 517 rdtp = &__get_cpu_var(rcu_dynticks);
517 oldval = rdtp->dynticks_nesting; 518 oldval = rdtp->dynticks_nesting;
518 rdtp->dynticks_nesting++; 519 rdtp->dynticks_nesting++;
519 WARN_ON_ONCE(rdtp->dynticks_nesting == 0); 520 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
520 rcu_idle_exit_common(rdtp, oldval); 521 rcu_idle_exit_common(rdtp, oldval);
521 local_irq_restore(flags); 522 local_irq_restore(flags);
522 } 523 }
523 524
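rcu_irq_enter() and rcu_irq_exit(), together with the idle entry/exit paths above, manage ->dynticks_nesting so that only the transition between zero and non-zero actually flips the CPU's idle status; nested interrupts taken from idle merely move the count up and down. A toy single-threaded model of that bookkeeping is sketched below; all names are invented for the illustration.

#include <stdio.h>
#include <assert.h>

static long long nesting;		/* 0 means "really idle" */

static void model_idle_enter(void)  { nesting = 0; }
static void model_irq_enter(void)   { nesting++; assert(nesting != 0); }
static void model_irq_exit(void)    { nesting--; assert(nesting >= 0); }
static int  model_cpu_is_idle(void) { return nesting == 0; }

int main(void)
{
	model_idle_enter();
	printf("idle loop:        idle=%d\n", model_cpu_is_idle()); /* 1 */
	model_irq_enter();
	printf("in irq from idle: idle=%d\n", model_cpu_is_idle()); /* 0 */
	model_irq_exit();
	printf("back to idle:     idle=%d\n", model_cpu_is_idle()); /* 1 */
	return 0;
}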
524 /** 525 /**
525 * rcu_nmi_enter - inform RCU of entry to NMI context 526 * rcu_nmi_enter - inform RCU of entry to NMI context
526 * 527 *
527 * If the CPU was idle with dynamic ticks active, and there is no 528 * If the CPU was idle with dynamic ticks active, and there is no
528 * irq handler running, this updates rdtp->dynticks_nmi to let the 529 * irq handler running, this updates rdtp->dynticks_nmi to let the
529 * RCU grace-period handling know that the CPU is active. 530 * RCU grace-period handling know that the CPU is active.
530 */ 531 */
531 void rcu_nmi_enter(void) 532 void rcu_nmi_enter(void)
532 { 533 {
533 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 534 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
534 535
535 if (rdtp->dynticks_nmi_nesting == 0 && 536 if (rdtp->dynticks_nmi_nesting == 0 &&
536 (atomic_read(&rdtp->dynticks) & 0x1)) 537 (atomic_read(&rdtp->dynticks) & 0x1))
537 return; 538 return;
538 rdtp->dynticks_nmi_nesting++; 539 rdtp->dynticks_nmi_nesting++;
539 smp_mb__before_atomic_inc(); /* Force delay from prior write. */ 540 smp_mb__before_atomic_inc(); /* Force delay from prior write. */
540 atomic_inc(&rdtp->dynticks); 541 atomic_inc(&rdtp->dynticks);
541 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 542 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
542 smp_mb__after_atomic_inc(); /* See above. */ 543 smp_mb__after_atomic_inc(); /* See above. */
543 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 544 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
544 } 545 }
545 546
546 /** 547 /**
547 * rcu_nmi_exit - inform RCU of exit from NMI context 548 * rcu_nmi_exit - inform RCU of exit from NMI context
548 * 549 *
549 * If the CPU was idle with dynamic ticks active, and there is no 550 * If the CPU was idle with dynamic ticks active, and there is no
550 * irq handler running, this updates rdtp->dynticks_nmi to let the 551 * irq handler running, this updates rdtp->dynticks_nmi to let the
551 * RCU grace-period handling know that the CPU is no longer active. 552 * RCU grace-period handling know that the CPU is no longer active.
552 */ 553 */
553 void rcu_nmi_exit(void) 554 void rcu_nmi_exit(void)
554 { 555 {
555 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 556 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
556 557
557 if (rdtp->dynticks_nmi_nesting == 0 || 558 if (rdtp->dynticks_nmi_nesting == 0 ||
558 --rdtp->dynticks_nmi_nesting != 0) 559 --rdtp->dynticks_nmi_nesting != 0)
559 return; 560 return;
560 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 561 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
561 smp_mb__before_atomic_inc(); /* See above. */ 562 smp_mb__before_atomic_inc(); /* See above. */
562 atomic_inc(&rdtp->dynticks); 563 atomic_inc(&rdtp->dynticks);
563 smp_mb__after_atomic_inc(); /* Force delay to next write. */ 564 smp_mb__after_atomic_inc(); /* Force delay to next write. */
564 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 565 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
565 } 566 }
566 567
567 #ifdef CONFIG_PROVE_RCU 568 #ifdef CONFIG_PROVE_RCU
568 569
569 /** 570 /**
570 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle 571 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
571 * 572 *
572 * If the current CPU is in its idle loop and is neither in an interrupt 573 * If the current CPU is in its idle loop and is neither in an interrupt
573 * nor an NMI handler, return true. 574 * nor an NMI handler, return true.
574 */ 575 */
575 int rcu_is_cpu_idle(void) 576 int rcu_is_cpu_idle(void)
576 { 577 {
577 int ret; 578 int ret;
578 579
579 preempt_disable(); 580 preempt_disable();
580 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; 581 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
581 preempt_enable(); 582 preempt_enable();
582 return ret; 583 return ret;
583 } 584 }
584 EXPORT_SYMBOL(rcu_is_cpu_idle); 585 EXPORT_SYMBOL(rcu_is_cpu_idle);
585 586
586 #endif /* #ifdef CONFIG_PROVE_RCU */ 587 #endif /* #ifdef CONFIG_PROVE_RCU */
587 588
588 /** 589 /**
589 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle 590 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
590 * 591 *
591 * If the current CPU is idle or running at a first-level (not nested) 592 * If the current CPU is idle or running at a first-level (not nested)
592 * interrupt from idle, return true. The caller must have at least 593 * interrupt from idle, return true. The caller must have at least
593 * disabled preemption. 594 * disabled preemption.
594 */ 595 */
595 int rcu_is_cpu_rrupt_from_idle(void) 596 int rcu_is_cpu_rrupt_from_idle(void)
596 { 597 {
597 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 598 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
598 } 599 }
599 600
600 #ifdef CONFIG_SMP 601 #ifdef CONFIG_SMP
601 602
602 /* 603 /*
603 * Snapshot the specified CPU's dynticks counter so that we can later 604 * Snapshot the specified CPU's dynticks counter so that we can later
604 * credit them with an implicit quiescent state. Return 1 if this CPU 605 * credit them with an implicit quiescent state. Return 1 if this CPU
605 * is in dynticks idle mode, which is an extended quiescent state. 606 * is in dynticks idle mode, which is an extended quiescent state.
606 */ 607 */
607 static int dyntick_save_progress_counter(struct rcu_data *rdp) 608 static int dyntick_save_progress_counter(struct rcu_data *rdp)
608 { 609 {
609 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 610 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
610 return (rdp->dynticks_snap & 0x1) == 0; 611 return (rdp->dynticks_snap & 0x1) == 0;
611 } 612 }
612 613
613 /* 614 /*
614 * Return true if the specified CPU has passed through a quiescent 615 * Return true if the specified CPU has passed through a quiescent
615 * state by virtue of being in or having passed through a dynticks 616 * state by virtue of being in or having passed through a dynticks
616 * idle state since the last call to dyntick_save_progress_counter() 617 * idle state since the last call to dyntick_save_progress_counter()
617 * for this same CPU. 618 * for this same CPU.
618 */ 619 */
619 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 620 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
620 { 621 {
621 unsigned int curr; 622 unsigned int curr;
622 unsigned int snap; 623 unsigned int snap;
623 624
624 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); 625 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
625 snap = (unsigned int)rdp->dynticks_snap; 626 snap = (unsigned int)rdp->dynticks_snap;
626 627
627 /* 628 /*
628 * If the CPU passed through or entered a dynticks idle phase with 629 * If the CPU passed through or entered a dynticks idle phase with
629 * no active irq/NMI handlers, then we can safely pretend that the CPU 630 * no active irq/NMI handlers, then we can safely pretend that the CPU
630 * already acknowledged the request to pass through a quiescent 631 * already acknowledged the request to pass through a quiescent
631 * state. Either way, that CPU cannot possibly be in an RCU 632 * state. Either way, that CPU cannot possibly be in an RCU
632 * read-side critical section that started before the beginning 633 * read-side critical section that started before the beginning
633 * of the current RCU grace period. 634 * of the current RCU grace period.
634 */ 635 */
635 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { 636 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
636 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); 637 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
637 rdp->dynticks_fqs++; 638 rdp->dynticks_fqs++;
638 return 1; 639 return 1;
639 } 640 }
640 641
641 /* Go check for the CPU being offline. */ 642 /* Go check for the CPU being offline. */
642 return rcu_implicit_offline_qs(rdp); 643 return rcu_implicit_offline_qs(rdp);
643 } 644 }
644 645
645 #endif /* #ifdef CONFIG_SMP */ 646 #endif /* #ifdef CONFIG_SMP */
646 647
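The force-quiescent-state path above samples ->dynticks in dyntick_save_progress_counter() and later re-reads it in rcu_implicit_dynticks_qs(): the CPU gets credit for an implicit quiescent state if the counter is currently even (idle right now) or has advanced by at least two since the snapshot (it passed through idle at some point). The sketch below reproduces just that test; the cast-based wraparound-safe comparison is an assumption standing in for the kernel's UINT_CMP_GE() macro, not a copy of it.

#include <stdio.h>

/* Wraparound-safe "a >= b" on an unsigned counter (an assumption). */
static int uint_cmp_ge(unsigned int a, unsigned int b)
{
	return (int)(a - b) >= 0;
}

/*
 * A CPU counts as having passed a quiescent state if it is idle now
 * (even counter) or has gone through idle since the snapshot
 * (counter advanced by two or more).
 */
static int passed_qs(unsigned int curr, unsigned int snap)
{
	return (curr & 0x1) == 0 || uint_cmp_ge(curr, snap + 2);
}

int main(void)
{
	printf("%d\n", passed_qs(5, 5));          /* 0: still busy, never idled */
	printf("%d\n", passed_qs(6, 5));          /* 1: idle right now          */
	printf("%d\n", passed_qs(7, 5));          /* 1: went idle and came back */
	printf("%d\n", passed_qs(1, 0xffffffff)); /* 1: survives wraparound     */
	return 0;
}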
647 int rcu_cpu_stall_suppress __read_mostly; 648 int rcu_cpu_stall_suppress __read_mostly;
648 649
649 static void record_gp_stall_check_time(struct rcu_state *rsp) 650 static void record_gp_stall_check_time(struct rcu_state *rsp)
650 { 651 {
651 rsp->gp_start = jiffies; 652 rsp->gp_start = jiffies;
652 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; 653 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
653 } 654 }
654 655
655 static void print_other_cpu_stall(struct rcu_state *rsp) 656 static void print_other_cpu_stall(struct rcu_state *rsp)
656 { 657 {
657 int cpu; 658 int cpu;
658 long delta; 659 long delta;
659 unsigned long flags; 660 unsigned long flags;
660 int ndetected; 661 int ndetected;
661 struct rcu_node *rnp = rcu_get_root(rsp); 662 struct rcu_node *rnp = rcu_get_root(rsp);
662 663
663 /* Only let one CPU complain about others per time interval. */ 664 /* Only let one CPU complain about others per time interval. */
664 665
665 raw_spin_lock_irqsave(&rnp->lock, flags); 666 raw_spin_lock_irqsave(&rnp->lock, flags);
666 delta = jiffies - rsp->jiffies_stall; 667 delta = jiffies - rsp->jiffies_stall;
667 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 668 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
668 raw_spin_unlock_irqrestore(&rnp->lock, flags); 669 raw_spin_unlock_irqrestore(&rnp->lock, flags);
669 return; 670 return;
670 } 671 }
671 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 672 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
672 673
673 /* 674 /*
674 * Now rat on any tasks that got kicked up to the root rcu_node 675 * Now rat on any tasks that got kicked up to the root rcu_node
675 * due to CPU offlining. 676 * due to CPU offlining.
676 */ 677 */
677 ndetected = rcu_print_task_stall(rnp); 678 ndetected = rcu_print_task_stall(rnp);
678 raw_spin_unlock_irqrestore(&rnp->lock, flags); 679 raw_spin_unlock_irqrestore(&rnp->lock, flags);
679 680
680 /* 681 /*
681 * OK, time to rat on our buddy... 682 * OK, time to rat on our buddy...
682 * See Documentation/RCU/stallwarn.txt for info on how to debug 683 * See Documentation/RCU/stallwarn.txt for info on how to debug
683 * RCU CPU stall warnings. 684 * RCU CPU stall warnings.
684 */ 685 */
685 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", 686 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
686 rsp->name); 687 rsp->name);
687 rcu_for_each_leaf_node(rsp, rnp) { 688 rcu_for_each_leaf_node(rsp, rnp) {
688 raw_spin_lock_irqsave(&rnp->lock, flags); 689 raw_spin_lock_irqsave(&rnp->lock, flags);
689 ndetected += rcu_print_task_stall(rnp); 690 ndetected += rcu_print_task_stall(rnp);
690 raw_spin_unlock_irqrestore(&rnp->lock, flags); 691 raw_spin_unlock_irqrestore(&rnp->lock, flags);
691 if (rnp->qsmask == 0) 692 if (rnp->qsmask == 0)
692 continue; 693 continue;
693 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 694 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
694 if (rnp->qsmask & (1UL << cpu)) { 695 if (rnp->qsmask & (1UL << cpu)) {
695 printk(" %d", rnp->grplo + cpu); 696 printk(" %d", rnp->grplo + cpu);
696 ndetected++; 697 ndetected++;
697 } 698 }
698 } 699 }
699 printk("} (detected by %d, t=%ld jiffies)\n", 700 printk("} (detected by %d, t=%ld jiffies)\n",
700 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 701 smp_processor_id(), (long)(jiffies - rsp->gp_start));
701 if (ndetected == 0) 702 if (ndetected == 0)
702 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 703 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
703 else if (!trigger_all_cpu_backtrace()) 704 else if (!trigger_all_cpu_backtrace())
704 dump_stack(); 705 dump_stack();
705 706
706 /* If so configured, complain about tasks blocking the grace period. */ 707 /* If so configured, complain about tasks blocking the grace period. */
707 708
708 rcu_print_detail_task_stall(rsp); 709 rcu_print_detail_task_stall(rsp);
709 710
710 force_quiescent_state(rsp, 0); /* Kick them all. */ 711 force_quiescent_state(rsp, 0); /* Kick them all. */
711 } 712 }
712 713
713 static void print_cpu_stall(struct rcu_state *rsp) 714 static void print_cpu_stall(struct rcu_state *rsp)
714 { 715 {
715 unsigned long flags; 716 unsigned long flags;
716 struct rcu_node *rnp = rcu_get_root(rsp); 717 struct rcu_node *rnp = rcu_get_root(rsp);
717 718
718 /* 719 /*
719 * OK, time to rat on ourselves... 720 * OK, time to rat on ourselves...
720 * See Documentation/RCU/stallwarn.txt for info on how to debug 721 * See Documentation/RCU/stallwarn.txt for info on how to debug
721 * RCU CPU stall warnings. 722 * RCU CPU stall warnings.
722 */ 723 */
723 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 724 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
724 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 725 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
725 if (!trigger_all_cpu_backtrace()) 726 if (!trigger_all_cpu_backtrace())
726 dump_stack(); 727 dump_stack();
727 728
728 raw_spin_lock_irqsave(&rnp->lock, flags); 729 raw_spin_lock_irqsave(&rnp->lock, flags);
729 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 730 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
730 rsp->jiffies_stall = 731 rsp->jiffies_stall =
731 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 732 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
732 raw_spin_unlock_irqrestore(&rnp->lock, flags); 733 raw_spin_unlock_irqrestore(&rnp->lock, flags);
733 734
734 set_need_resched(); /* kick ourselves to get things going. */ 735 set_need_resched(); /* kick ourselves to get things going. */
735 } 736 }
736 737
737 static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 738 static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
738 { 739 {
739 unsigned long j; 740 unsigned long j;
740 unsigned long js; 741 unsigned long js;
741 struct rcu_node *rnp; 742 struct rcu_node *rnp;
742 743
743 if (rcu_cpu_stall_suppress) 744 if (rcu_cpu_stall_suppress)
744 return; 745 return;
745 j = ACCESS_ONCE(jiffies); 746 j = ACCESS_ONCE(jiffies);
746 js = ACCESS_ONCE(rsp->jiffies_stall); 747 js = ACCESS_ONCE(rsp->jiffies_stall);
747 rnp = rdp->mynode; 748 rnp = rdp->mynode;
748 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { 749 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
749 750
750 /* We haven't checked in, so go dump stack. */ 751 /* We haven't checked in, so go dump stack. */
751 print_cpu_stall(rsp); 752 print_cpu_stall(rsp);
752 753
753 } else if (rcu_gp_in_progress(rsp) && 754 } else if (rcu_gp_in_progress(rsp) &&
754 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { 755 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
755 756
756 /* They had a few time units to dump stack, so complain. */ 757 /* They had a few time units to dump stack, so complain. */
757 print_other_cpu_stall(rsp); 758 print_other_cpu_stall(rsp);
758 } 759 }
759 } 760 }
760 761
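check_cpu_stall() encodes a two-stage policy: the CPU that still owes a quiescent state complains about itself once jiffies reaches rsp->jiffies_stall, while other CPUs wait an additional RCU_STALL_RAT_DELAY before complaining on its behalf, giving the stalled CPU first crack at dumping its own stack. A compressed sketch of that decision follows, with an illustrative delay value and a stand-in wraparound-safe comparison rather than the kernel's ULONG_CMP_GE().

#include <stdio.h>

#define STALL_RAT_DELAY 2	/* illustrative value, in jiffies */

/* Wraparound-safe "a >= b" for jiffies-style counters (an assumption). */
static int time_after_eq_ul(unsigned long a, unsigned long b)
{
	return (long)(a - b) >= 0;
}

static const char *stall_action(unsigned long j, unsigned long js,
				int self_pending, int gp_in_progress)
{
	if (self_pending && time_after_eq_ul(j, js))
		return "print_cpu_stall";	/* we are the culprit */
	if (gp_in_progress && time_after_eq_ul(j, js + STALL_RAT_DELAY))
		return "print_other_cpu_stall";	/* complain about another CPU */
	return "nothing";
}

int main(void)
{
	unsigned long js = 1000;	/* hypothetical stall deadline */

	printf("%s\n", stall_action(999,  js, 1, 1));	/* nothing        */
	printf("%s\n", stall_action(1000, js, 1, 1));	/* self-report    */
	printf("%s\n", stall_action(1002, js, 0, 1));	/* report another */
	return 0;
}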
761 static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) 762 static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
762 { 763 {
763 rcu_cpu_stall_suppress = 1; 764 rcu_cpu_stall_suppress = 1;
764 return NOTIFY_DONE; 765 return NOTIFY_DONE;
765 } 766 }
766 767
767 /** 768 /**
768 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period 769 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
769 * 770 *
770 * Set the stall-warning timeout way off into the future, thus preventing 771 * Set the stall-warning timeout way off into the future, thus preventing
771 * any RCU CPU stall-warning messages from appearing in the current set of 772 * any RCU CPU stall-warning messages from appearing in the current set of
772 * RCU grace periods. 773 * RCU grace periods.
773 * 774 *
774 * The caller must disable hard irqs. 775 * The caller must disable hard irqs.
775 */ 776 */
776 void rcu_cpu_stall_reset(void) 777 void rcu_cpu_stall_reset(void)
777 { 778 {
778 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; 779 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
779 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; 780 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
780 rcu_preempt_stall_reset(); 781 rcu_preempt_stall_reset();
781 } 782 }
782 783
783 static struct notifier_block rcu_panic_block = { 784 static struct notifier_block rcu_panic_block = {
784 .notifier_call = rcu_panic, 785 .notifier_call = rcu_panic,
785 }; 786 };
786 787
787 static void __init check_cpu_stall_init(void) 788 static void __init check_cpu_stall_init(void)
788 { 789 {
789 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); 790 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
790 } 791 }
791 792
792 /* 793 /*
793 * Update CPU-local rcu_data state to record the newly noticed grace period. 794 * Update CPU-local rcu_data state to record the newly noticed grace period.
794 * This is used both when we started the grace period and when we notice 795 * This is used both when we started the grace period and when we notice
795 * that someone else started the grace period. The caller must hold the 796 * that someone else started the grace period. The caller must hold the
796 * ->lock of the leaf rcu_node structure corresponding to the current CPU, 797 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
797 * and must have irqs disabled. 798 * and must have irqs disabled.
798 */ 799 */
799 static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 800 static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
800 { 801 {
801 if (rdp->gpnum != rnp->gpnum) { 802 if (rdp->gpnum != rnp->gpnum) {
802 /* 803 /*
803 * If the current grace period is waiting for this CPU, 804 * If the current grace period is waiting for this CPU,
804 * set up to detect a quiescent state, otherwise don't 805 * set up to detect a quiescent state, otherwise don't
805 * go looking for one. 806 * go looking for one.
806 */ 807 */
807 rdp->gpnum = rnp->gpnum; 808 rdp->gpnum = rnp->gpnum;
808 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); 809 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
809 if (rnp->qsmask & rdp->grpmask) { 810 if (rnp->qsmask & rdp->grpmask) {
810 rdp->qs_pending = 1; 811 rdp->qs_pending = 1;
811 rdp->passed_quiesce = 0; 812 rdp->passed_quiesce = 0;
812 } else 813 } else
813 rdp->qs_pending = 0; 814 rdp->qs_pending = 0;
814 } 815 }
815 } 816 }
816 817
817 static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) 818 static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
818 { 819 {
819 unsigned long flags; 820 unsigned long flags;
820 struct rcu_node *rnp; 821 struct rcu_node *rnp;
821 822
822 local_irq_save(flags); 823 local_irq_save(flags);
823 rnp = rdp->mynode; 824 rnp = rdp->mynode;
824 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ 825 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
825 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ 826 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
826 local_irq_restore(flags); 827 local_irq_restore(flags);
827 return; 828 return;
828 } 829 }
829 __note_new_gpnum(rsp, rnp, rdp); 830 __note_new_gpnum(rsp, rnp, rdp);
830 raw_spin_unlock_irqrestore(&rnp->lock, flags); 831 raw_spin_unlock_irqrestore(&rnp->lock, flags);
831 } 832 }
832 833
833 /* 834 /*
834 * Did someone else start a new RCU grace period since we last 835 * Did someone else start a new RCU grace period since we last
835 * checked? Update local state appropriately if so. Must be called 836 * checked? Update local state appropriately if so. Must be called
836 * on the CPU corresponding to rdp. 837 * on the CPU corresponding to rdp.
837 */ 838 */
838 static int 839 static int
839 check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) 840 check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
840 { 841 {
841 unsigned long flags; 842 unsigned long flags;
842 int ret = 0; 843 int ret = 0;
843 844
844 local_irq_save(flags); 845 local_irq_save(flags);
845 if (rdp->gpnum != rsp->gpnum) { 846 if (rdp->gpnum != rsp->gpnum) {
846 note_new_gpnum(rsp, rdp); 847 note_new_gpnum(rsp, rdp);
847 ret = 1; 848 ret = 1;
848 } 849 }
849 local_irq_restore(flags); 850 local_irq_restore(flags);
850 return ret; 851 return ret;
851 } 852 }
852 853
853 /* 854 /*
854 * Advance this CPU's callbacks, but only if the current grace period 855 * Advance this CPU's callbacks, but only if the current grace period
855 * has ended. This may be called only from the CPU to whom the rdp 856 * has ended. This may be called only from the CPU to whom the rdp
856 * belongs. In addition, the corresponding leaf rcu_node structure's 857 * belongs. In addition, the corresponding leaf rcu_node structure's
857 * ->lock must be held by the caller, with irqs disabled. 858 * ->lock must be held by the caller, with irqs disabled.
858 */ 859 */
859 static void 860 static void
860 __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 861 __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
861 { 862 {
862 /* Did another grace period end? */ 863 /* Did another grace period end? */
863 if (rdp->completed != rnp->completed) { 864 if (rdp->completed != rnp->completed) {
864 865
865 /* Advance callbacks. No harm if list empty. */ 866 /* Advance callbacks. No harm if list empty. */
866 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; 867 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
867 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; 868 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
868 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 869 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
869 870
870 /* Remember that we saw this grace-period completion. */ 871 /* Remember that we saw this grace-period completion. */
871 rdp->completed = rnp->completed; 872 rdp->completed = rnp->completed;
872 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); 873 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
873 874
874 /* 875 /*
875 * If we were in an extended quiescent state, we may have 876 * If we were in an extended quiescent state, we may have
876 * missed some grace periods that other CPUs handled on 877 * missed some grace periods that other CPUs handled on
877 * our behalf. Catch up with this state to avoid noting 878 * our behalf. Catch up with this state to avoid noting
878 * spurious new grace periods. If another grace period 879 * spurious new grace periods. If another grace period
879 * has started, then rnp->gpnum will have advanced, so 880 * has started, then rnp->gpnum will have advanced, so
880 * we will detect this later on. 881 * we will detect this later on.
881 */ 882 */
882 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) 883 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
883 rdp->gpnum = rdp->completed; 884 rdp->gpnum = rdp->completed;
884 885
885 /* 886 /*
886 * If RCU does not need a quiescent state from this CPU, 887 * If RCU does not need a quiescent state from this CPU,
887 * then make sure that this CPU doesn't go looking for one. 888 * then make sure that this CPU doesn't go looking for one.
888 */ 889 */
889 if ((rnp->qsmask & rdp->grpmask) == 0) 890 if ((rnp->qsmask & rdp->grpmask) == 0)
890 rdp->qs_pending = 0; 891 rdp->qs_pending = 0;
891 } 892 }
892 } 893 }
893 894
894 /* 895 /*
895 * Advance this CPU's callbacks, but only if the current grace period 896 * Advance this CPU's callbacks, but only if the current grace period
896 * has ended. This may be called only from the CPU to whom the rdp 897 * has ended. This may be called only from the CPU to whom the rdp
897 * belongs. 898 * belongs.
898 */ 899 */
899 static void 900 static void
900 rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) 901 rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
901 { 902 {
902 unsigned long flags; 903 unsigned long flags;
903 struct rcu_node *rnp; 904 struct rcu_node *rnp;
904 905
905 local_irq_save(flags); 906 local_irq_save(flags);
906 rnp = rdp->mynode; 907 rnp = rdp->mynode;
907 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 908 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
908 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ 909 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
909 local_irq_restore(flags); 910 local_irq_restore(flags);
910 return; 911 return;
911 } 912 }
912 __rcu_process_gp_end(rsp, rnp, rdp); 913 __rcu_process_gp_end(rsp, rnp, rdp);
913 raw_spin_unlock_irqrestore(&rnp->lock, flags); 914 raw_spin_unlock_irqrestore(&rnp->lock, flags);
914 } 915 }
915 916
916 /* 917 /*
917 * Do per-CPU grace-period initialization for running CPU. The caller 918 * Do per-CPU grace-period initialization for running CPU. The caller
918 * must hold the lock of the leaf rcu_node structure corresponding to 919 * must hold the lock of the leaf rcu_node structure corresponding to
919 * this CPU. 920 * this CPU.
920 */ 921 */
921 static void 922 static void
922 rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 923 rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
923 { 924 {
924 /* Prior grace period ended, so advance callbacks for current CPU. */ 925 /* Prior grace period ended, so advance callbacks for current CPU. */
925 __rcu_process_gp_end(rsp, rnp, rdp); 926 __rcu_process_gp_end(rsp, rnp, rdp);
926 927
927 /* 928 /*
928 * Because this CPU just now started the new grace period, we know 929 * Because this CPU just now started the new grace period, we know
929 * that all of its callbacks will be covered by this upcoming grace 930 * that all of its callbacks will be covered by this upcoming grace
930 * period, even the ones that were registered arbitrarily recently. 931 * period, even the ones that were registered arbitrarily recently.
931 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. 932 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
932 * 933 *
933 * Other CPUs cannot be sure exactly when the grace period started. 934 * Other CPUs cannot be sure exactly when the grace period started.
934 * Therefore, their recently registered callbacks must pass through 935 * Therefore, their recently registered callbacks must pass through
935 * an additional RCU_NEXT_READY stage, so that they will be handled 936 * an additional RCU_NEXT_READY stage, so that they will be handled
936 * by the next RCU grace period. 937 * by the next RCU grace period.
937 */ 938 */
938 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 939 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
939 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 940 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
940 941
941 /* Set state so that this CPU will detect the next quiescent state. */ 942 /* Set state so that this CPU will detect the next quiescent state. */
942 __note_new_gpnum(rsp, rnp, rdp); 943 __note_new_gpnum(rsp, rnp, rdp);
943 } 944 }
944 945
945 /* 946 /*
946 * Start a new RCU grace period if warranted, re-initializing the hierarchy 947 * Start a new RCU grace period if warranted, re-initializing the hierarchy
947 * in preparation for detecting the next grace period. The caller must hold 948 * in preparation for detecting the next grace period. The caller must hold
948 * the root node's ->lock, which is released before return. Hard irqs must 949 * the root node's ->lock, which is released before return. Hard irqs must
949 * be disabled. 950 * be disabled.
950 */ 951 */
951 static void 952 static void
952 rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 953 rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
953 __releases(rcu_get_root(rsp)->lock) 954 __releases(rcu_get_root(rsp)->lock)
954 { 955 {
955 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 956 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
956 struct rcu_node *rnp = rcu_get_root(rsp); 957 struct rcu_node *rnp = rcu_get_root(rsp);
957 958
958 if (!rcu_scheduler_fully_active || 959 if (!rcu_scheduler_fully_active ||
959 !cpu_needs_another_gp(rsp, rdp)) { 960 !cpu_needs_another_gp(rsp, rdp)) {
960 /* 961 /*
961 * Either the scheduler hasn't yet spawned the first 962 * Either the scheduler hasn't yet spawned the first
962 * non-idle task or this CPU does not need another 963 * non-idle task or this CPU does not need another
963 * grace period. Either way, don't start a new grace 964 * grace period. Either way, don't start a new grace
964 * period. 965 * period.
965 */ 966 */
966 raw_spin_unlock_irqrestore(&rnp->lock, flags); 967 raw_spin_unlock_irqrestore(&rnp->lock, flags);
967 return; 968 return;
968 } 969 }
969 970
970 if (rsp->fqs_active) { 971 if (rsp->fqs_active) {
971 /* 972 /*
972 * This CPU needs a grace period, but force_quiescent_state() 973 * This CPU needs a grace period, but force_quiescent_state()
973 * is running. Tell it to start one on this CPU's behalf. 974 * is running. Tell it to start one on this CPU's behalf.
974 */ 975 */
975 rsp->fqs_need_gp = 1; 976 rsp->fqs_need_gp = 1;
976 raw_spin_unlock_irqrestore(&rnp->lock, flags); 977 raw_spin_unlock_irqrestore(&rnp->lock, flags);
977 return; 978 return;
978 } 979 }
979 980
980 /* Advance to a new grace period and initialize state. */ 981 /* Advance to a new grace period and initialize state. */
981 rsp->gpnum++; 982 rsp->gpnum++;
982 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 983 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
983 WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); 984 WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
984 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 985 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
985 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 986 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
986 record_gp_stall_check_time(rsp); 987 record_gp_stall_check_time(rsp);
987 988
988 /* Special-case the common single-level case. */ 989 /* Special-case the common single-level case. */
989 if (NUM_RCU_NODES == 1) { 990 if (NUM_RCU_NODES == 1) {
990 rcu_preempt_check_blocked_tasks(rnp); 991 rcu_preempt_check_blocked_tasks(rnp);
991 rnp->qsmask = rnp->qsmaskinit; 992 rnp->qsmask = rnp->qsmaskinit;
992 rnp->gpnum = rsp->gpnum; 993 rnp->gpnum = rsp->gpnum;
993 rnp->completed = rsp->completed; 994 rnp->completed = rsp->completed;
994 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ 995 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
995 rcu_start_gp_per_cpu(rsp, rnp, rdp); 996 rcu_start_gp_per_cpu(rsp, rnp, rdp);
996 rcu_preempt_boost_start_gp(rnp); 997 rcu_preempt_boost_start_gp(rnp);
997 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 998 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
998 rnp->level, rnp->grplo, 999 rnp->level, rnp->grplo,
999 rnp->grphi, rnp->qsmask); 1000 rnp->grphi, rnp->qsmask);
1000 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1001 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1001 return; 1002 return;
1002 } 1003 }
1003 1004
1004 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ 1005 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
1005 1006
1006 1007
1007 /* Exclude any concurrent CPU-hotplug operations. */ 1008 /* Exclude any concurrent CPU-hotplug operations. */
1008 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1009 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1009 1010
1010 /* 1011 /*
1011 * Set the quiescent-state-needed bits in all the rcu_node 1012 * Set the quiescent-state-needed bits in all the rcu_node
1012 * structures for all currently online CPUs in breadth-first 1013 * structures for all currently online CPUs in breadth-first
1013 * order, starting from the root rcu_node structure. This 1014 * order, starting from the root rcu_node structure. This
1014 * operation relies on the layout of the hierarchy within the 1015 * operation relies on the layout of the hierarchy within the
1015 * rsp->node[] array. Note that other CPUs will access only 1016 * rsp->node[] array. Note that other CPUs will access only
1016 * the leaves of the hierarchy, which still indicate that no 1017 * the leaves of the hierarchy, which still indicate that no
1017 * grace period is in progress, at least until the corresponding 1018 * grace period is in progress, at least until the corresponding
1018 * leaf node has been initialized. In addition, we have excluded 1019 * leaf node has been initialized. In addition, we have excluded
1019 * CPU-hotplug operations. 1020 * CPU-hotplug operations.
1020 * 1021 *
1021 * Note that the grace period cannot complete until we finish 1022 * Note that the grace period cannot complete until we finish
1022 * the initialization process, as there will be at least one 1023 * the initialization process, as there will be at least one
1023 * qsmask bit set in the root node until that time, namely the 1024 * qsmask bit set in the root node until that time, namely the
1024 * one corresponding to this CPU, due to the fact that we have 1025 * one corresponding to this CPU, due to the fact that we have
1025 * irqs disabled. 1026 * irqs disabled.
1026 */ 1027 */
1027 rcu_for_each_node_breadth_first(rsp, rnp) { 1028 rcu_for_each_node_breadth_first(rsp, rnp) {
1028 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1029 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1029 rcu_preempt_check_blocked_tasks(rnp); 1030 rcu_preempt_check_blocked_tasks(rnp);
1030 rnp->qsmask = rnp->qsmaskinit; 1031 rnp->qsmask = rnp->qsmaskinit;
1031 rnp->gpnum = rsp->gpnum; 1032 rnp->gpnum = rsp->gpnum;
1032 rnp->completed = rsp->completed; 1033 rnp->completed = rsp->completed;
1033 if (rnp == rdp->mynode) 1034 if (rnp == rdp->mynode)
1034 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1035 rcu_start_gp_per_cpu(rsp, rnp, rdp);
1035 rcu_preempt_boost_start_gp(rnp); 1036 rcu_preempt_boost_start_gp(rnp);
1036 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1037 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1037 rnp->level, rnp->grplo, 1038 rnp->level, rnp->grplo,
1038 rnp->grphi, rnp->qsmask); 1039 rnp->grphi, rnp->qsmask);
1039 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1040 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1040 } 1041 }
1041 1042
1042 rnp = rcu_get_root(rsp); 1043 rnp = rcu_get_root(rsp);
1043 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1044 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1044 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 1045 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
1045 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1046 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1046 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1047 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1047 } 1048 }
1048 1049
1049 /* 1050 /*
1050 * Report a full set of quiescent states to the specified rcu_state 1051 * Report a full set of quiescent states to the specified rcu_state
1051 * data structure. This involves cleaning up after the prior grace 1052 * data structure. This involves cleaning up after the prior grace
1052 * period and letting rcu_start_gp() start up the next grace period 1053 * period and letting rcu_start_gp() start up the next grace period
1053 * if one is needed. Note that the caller must hold rnp->lock, as 1054 * if one is needed. Note that the caller must hold rnp->lock, as
1054 * required by rcu_start_gp(), which will release it. 1055 * required by rcu_start_gp(), which will release it.
1055 */ 1056 */
1056 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 1057 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1057 __releases(rcu_get_root(rsp)->lock) 1058 __releases(rcu_get_root(rsp)->lock)
1058 { 1059 {
1059 unsigned long gp_duration; 1060 unsigned long gp_duration;
1060 struct rcu_node *rnp = rcu_get_root(rsp); 1061 struct rcu_node *rnp = rcu_get_root(rsp);
1061 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1062 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1062 1063
1063 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 1064 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
1064 1065
1065 /* 1066 /*
1066 * Ensure that all grace-period and pre-grace-period activity 1067 * Ensure that all grace-period and pre-grace-period activity
1067 * is seen before the assignment to rsp->completed. 1068 * is seen before the assignment to rsp->completed.
1068 */ 1069 */
1069 smp_mb(); /* See above block comment. */ 1070 smp_mb(); /* See above block comment. */
1070 gp_duration = jiffies - rsp->gp_start; 1071 gp_duration = jiffies - rsp->gp_start;
1071 if (gp_duration > rsp->gp_max) 1072 if (gp_duration > rsp->gp_max)
1072 rsp->gp_max = gp_duration; 1073 rsp->gp_max = gp_duration;
1073 1074
1074 /* 1075 /*
1075 * We know the grace period is complete, but to everyone else 1076 * We know the grace period is complete, but to everyone else
1076 * it appears to still be ongoing. But it is also the case 1077 * it appears to still be ongoing. But it is also the case
1077 * that to everyone else it looks like there is nothing that 1078 * that to everyone else it looks like there is nothing that
1078 * they can do to advance the grace period. It is therefore 1079 * they can do to advance the grace period. It is therefore
1079 * safe for us to drop the lock in order to mark the grace 1080 * safe for us to drop the lock in order to mark the grace
1080 * period as completed in all of the rcu_node structures. 1081 * period as completed in all of the rcu_node structures.
1081 * 1082 *
1082 * But if this CPU needs another grace period, it will take 1083 * But if this CPU needs another grace period, it will take
1083 * care of this while initializing the next grace period. 1084 * care of this while initializing the next grace period.
1084 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL 1085 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
1085 * because the callbacks have not yet been advanced: Those 1086 * because the callbacks have not yet been advanced: Those
1086 * callbacks are waiting on the grace period that just now 1087 * callbacks are waiting on the grace period that just now
1087 * completed. 1088 * completed.
1088 */ 1089 */
1089 if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { 1090 if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
1090 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1091 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1091 1092
1092 /* 1093 /*
1093 * Propagate new ->completed value to rcu_node structures 1094 * Propagate new ->completed value to rcu_node structures
1094 * so that other CPUs don't have to wait until the start 1095 * so that other CPUs don't have to wait until the start
1095 * of the next grace period to process their callbacks. 1096 * of the next grace period to process their callbacks.
1096 */ 1097 */
1097 rcu_for_each_node_breadth_first(rsp, rnp) { 1098 rcu_for_each_node_breadth_first(rsp, rnp) {
1098 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1099 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1099 rnp->completed = rsp->gpnum; 1100 rnp->completed = rsp->gpnum;
1100 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1101 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1101 } 1102 }
1102 rnp = rcu_get_root(rsp); 1103 rnp = rcu_get_root(rsp);
1103 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1104 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1104 } 1105 }
1105 1106
1106 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ 1107 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
1107 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1108 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1108 rsp->fqs_state = RCU_GP_IDLE; 1109 rsp->fqs_state = RCU_GP_IDLE;
1109 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 1110 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
1110 } 1111 }
1111 1112
1112 /* 1113 /*
1113 * Similar to rcu_report_qs_rdp(), for which it is a helper function. 1114 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
1114 * Allows quiescent states for a group of CPUs to be reported at one go 1115 * Allows quiescent states for a group of CPUs to be reported at one go
1115 * to the specified rcu_node structure, though all the CPUs in the group 1116 * to the specified rcu_node structure, though all the CPUs in the group
1116 * must be represented by the same rcu_node structure (which need not be 1117 * must be represented by the same rcu_node structure (which need not be
1117 * a leaf rcu_node structure, though it often will be). That structure's 1118 * a leaf rcu_node structure, though it often will be). That structure's
1118 * lock must be held upon entry, and it is released before return. 1119 * lock must be held upon entry, and it is released before return.
1119 */ 1120 */
1120 static void 1121 static void
1121 rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, 1122 rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1122 struct rcu_node *rnp, unsigned long flags) 1123 struct rcu_node *rnp, unsigned long flags)
1123 __releases(rnp->lock) 1124 __releases(rnp->lock)
1124 { 1125 {
1125 struct rcu_node *rnp_c; 1126 struct rcu_node *rnp_c;
1126 1127
1127 /* Walk up the rcu_node hierarchy. */ 1128 /* Walk up the rcu_node hierarchy. */
1128 for (;;) { 1129 for (;;) {
1129 if (!(rnp->qsmask & mask)) { 1130 if (!(rnp->qsmask & mask)) {
1130 1131
1131 /* Our bit has already been cleared, so done. */ 1132 /* Our bit has already been cleared, so done. */
1132 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1133 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1133 return; 1134 return;
1134 } 1135 }
1135 rnp->qsmask &= ~mask; 1136 rnp->qsmask &= ~mask;
1136 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, 1137 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
1137 mask, rnp->qsmask, rnp->level, 1138 mask, rnp->qsmask, rnp->level,
1138 rnp->grplo, rnp->grphi, 1139 rnp->grplo, rnp->grphi,
1139 !!rnp->gp_tasks); 1140 !!rnp->gp_tasks);
1140 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 1141 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
1141 1142
1142 /* Other bits still set at this level, so done. */ 1143 /* Other bits still set at this level, so done. */
1143 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1144 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1144 return; 1145 return;
1145 } 1146 }
1146 mask = rnp->grpmask; 1147 mask = rnp->grpmask;
1147 if (rnp->parent == NULL) { 1148 if (rnp->parent == NULL) {
1148 1149
1149 /* No more levels. Exit loop holding root lock. */ 1150 /* No more levels. Exit loop holding root lock. */
1150 1151
1151 break; 1152 break;
1152 } 1153 }
1153 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1154 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1154 rnp_c = rnp; 1155 rnp_c = rnp;
1155 rnp = rnp->parent; 1156 rnp = rnp->parent;
1156 raw_spin_lock_irqsave(&rnp->lock, flags); 1157 raw_spin_lock_irqsave(&rnp->lock, flags);
1157 WARN_ON_ONCE(rnp_c->qsmask); 1158 WARN_ON_ONCE(rnp_c->qsmask);
1158 } 1159 }
1159 1160
1160 /* 1161 /*
1161 * Get here if we are the last CPU to pass through a quiescent 1162 * Get here if we are the last CPU to pass through a quiescent
1162 * state for this grace period. Invoke rcu_report_qs_rsp() 1163 * state for this grace period. Invoke rcu_report_qs_rsp()
1163 * to clean up and start the next grace period if one is needed. 1164 * to clean up and start the next grace period if one is needed.
1164 */ 1165 */
1165 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ 1166 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
1166 } 1167 }
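
The upward walk above can be distilled into a few lines. The sketch below uses hypothetical user-space types (struct node, report_up) and omits the locking, the tracing, the blocked-readers check, and the final rcu_report_qs_rsp() call at the root; it only shows how clearing the last pending bit at one level propagates this node's own bit up to its parent.

struct node {
	struct node *parent;
	unsigned long qsmask;		/* children still owing a report */
	unsigned long grpmask;		/* this node's bit in parent->qsmask */
};

static void report_up(struct node *np, unsigned long mask)
{
	for (;;) {
		np->qsmask &= ~mask;		/* clear the reporting child's bit */
		if (np->qsmask || !np->parent)
			return;			/* siblings still pending, or at root */
		mask = np->grpmask;		/* propagate: this whole subtree is done */
		np = np->parent;
	}
}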
1167 1168
1168 /* 1169 /*
1169 * Record a quiescent state for the specified CPU to that CPU's rcu_data 1170 * Record a quiescent state for the specified CPU to that CPU's rcu_data
1170 * structure. This must be either called from the specified CPU, or 1171 * structure. This must be either called from the specified CPU, or
1171 * called when the specified CPU is known to be offline (and when it is 1172 * called when the specified CPU is known to be offline (and when it is
1172 * also known that no other CPU is concurrently trying to help the offline 1173 * also known that no other CPU is concurrently trying to help the offline
1173 * CPU). The lastgp argument is used to make sure we are still in the 1174 * CPU). The lastgp argument is used to make sure we are still in the
1174 * grace period of interest. We don't want to end the current grace period 1175 * grace period of interest. We don't want to end the current grace period
1175 * based on quiescent states detected in an earlier grace period! 1176 * based on quiescent states detected in an earlier grace period!
1176 */ 1177 */
1177 static void 1178 static void
1178 rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) 1179 rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp)
1179 { 1180 {
1180 unsigned long flags; 1181 unsigned long flags;
1181 unsigned long mask; 1182 unsigned long mask;
1182 struct rcu_node *rnp; 1183 struct rcu_node *rnp;
1183 1184
1184 rnp = rdp->mynode; 1185 rnp = rdp->mynode;
1185 raw_spin_lock_irqsave(&rnp->lock, flags); 1186 raw_spin_lock_irqsave(&rnp->lock, flags);
1186 if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { 1187 if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) {
1187 1188
1188 /* 1189 /*
1189 * The grace period in which this quiescent state was 1190 * The grace period in which this quiescent state was
1190 * recorded has ended, so don't report it upwards. 1191 * recorded has ended, so don't report it upwards.
1191 * We will instead need a new quiescent state that lies 1192 * We will instead need a new quiescent state that lies
1192 * within the current grace period. 1193 * within the current grace period.
1193 */ 1194 */
1194 rdp->passed_quiesce = 0; /* need qs for new gp. */ 1195 rdp->passed_quiesce = 0; /* need qs for new gp. */
1195 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1196 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1196 return; 1197 return;
1197 } 1198 }
1198 mask = rdp->grpmask; 1199 mask = rdp->grpmask;
1199 if ((rnp->qsmask & mask) == 0) { 1200 if ((rnp->qsmask & mask) == 0) {
1200 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1201 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1201 } else { 1202 } else {
1202 rdp->qs_pending = 0; 1203 rdp->qs_pending = 0;
1203 1204
1204 /* 1205 /*
1205 * This GP can't end until cpu checks in, so all of our 1206 * This GP can't end until cpu checks in, so all of our
1206 * callbacks can be processed during the next GP. 1207 * callbacks can be processed during the next GP.
1207 */ 1208 */
1208 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1209 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1209 1210
1210 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 1211 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
1211 } 1212 }
1212 } 1213 }
1213 1214
1214 /* 1215 /*
1215 * Check to see if there is a new grace period of which this CPU 1216 * Check to see if there is a new grace period of which this CPU
1216 * is not yet aware, and if so, set up local rcu_data state for it. 1217 * is not yet aware, and if so, set up local rcu_data state for it.
1217 * Otherwise, see if this CPU has just passed through its first 1218 * Otherwise, see if this CPU has just passed through its first
1218 * quiescent state for this grace period, and record that fact if so. 1219 * quiescent state for this grace period, and record that fact if so.
1219 */ 1220 */
1220 static void 1221 static void
1221 rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) 1222 rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1222 { 1223 {
1223 /* If there is now a new grace period, record and return. */ 1224 /* If there is now a new grace period, record and return. */
1224 if (check_for_new_grace_period(rsp, rdp)) 1225 if (check_for_new_grace_period(rsp, rdp))
1225 return; 1226 return;
1226 1227
1227 /* 1228 /*
1228 * Does this CPU still need to do its part for current grace period? 1229 * Does this CPU still need to do its part for current grace period?
1229 * If no, return and let the other CPUs do their part as well. 1230 * If no, return and let the other CPUs do their part as well.
1230 */ 1231 */
1231 if (!rdp->qs_pending) 1232 if (!rdp->qs_pending)
1232 return; 1233 return;
1233 1234
1234 /* 1235 /*
1235 * Was there a quiescent state since the beginning of the grace 1236 * Was there a quiescent state since the beginning of the grace
1236 * period? If no, then exit and wait for the next call. 1237 * period? If no, then exit and wait for the next call.
1237 */ 1238 */
1238 if (!rdp->passed_quiesce) 1239 if (!rdp->passed_quiesce)
1239 return; 1240 return;
1240 1241
1241 /* 1242 /*
1242 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 1243 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
1243 * judge of that). 1244 * judge of that).
1244 */ 1245 */
1245 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); 1246 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum);
1246 } 1247 }
1247 1248
1248 #ifdef CONFIG_HOTPLUG_CPU 1249 #ifdef CONFIG_HOTPLUG_CPU
1249 1250
1250 /* 1251 /*
1251 * Move a dying CPU's RCU callbacks to online CPU's callback list. 1252 * Move a dying CPU's RCU callbacks to online CPU's callback list.
1252 * Synchronization is not required because this function executes 1253 * Synchronization is not required because this function executes
1253 * in stop_machine() context. 1254 * in stop_machine() context.
1254 */ 1255 */
1255 static void rcu_send_cbs_to_online(struct rcu_state *rsp) 1256 static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1256 { 1257 {
1257 int i; 1258 int i;
1258 /* current DYING CPU is cleared in the cpu_online_mask */ 1259 /* current DYING CPU is cleared in the cpu_online_mask */
1259 int receive_cpu = cpumask_any(cpu_online_mask); 1260 int receive_cpu = cpumask_any(cpu_online_mask);
1260 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1261 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1261 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); 1262 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
1262 1263
1263 if (rdp->nxtlist == NULL) 1264 if (rdp->nxtlist == NULL)
1264 return; /* irqs disabled, so comparison is stable. */ 1265 return; /* irqs disabled, so comparison is stable. */
1265 1266
1266 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 1267 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1267 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1268 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1268 receive_rdp->qlen += rdp->qlen; 1269 receive_rdp->qlen += rdp->qlen;
1269 receive_rdp->n_cbs_adopted += rdp->qlen; 1270 receive_rdp->n_cbs_adopted += rdp->qlen;
1270 rdp->n_cbs_orphaned += rdp->qlen; 1271 rdp->n_cbs_orphaned += rdp->qlen;
1271 1272
1272 rdp->nxtlist = NULL; 1273 rdp->nxtlist = NULL;
1273 for (i = 0; i < RCU_NEXT_SIZE; i++) 1274 for (i = 0; i < RCU_NEXT_SIZE; i++)
1274 rdp->nxttail[i] = &rdp->nxtlist; 1275 rdp->nxttail[i] = &rdp->nxtlist;
1275 rdp->qlen = 0; 1276 rdp->qlen = 0;
1276 } 1277 }
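
The two-assignment splice above is easy to miss on a first reading. Here is a minimal user-space sketch, with hypothetical types (struct cb, struct cblist, cblist_adopt), of the same tail-pointer idiom: because ->tail always points at the final ->next field (or at &head when the list is empty), an entire donor list can be adopted in O(1), much as the dying CPU's nxtlist is appended at the receiving CPU's RCU_NEXT_TAIL above.

#include <stddef.h>

struct cb {
	struct cb *next;
};

struct cblist {
	struct cb *head;
	struct cb **tail;	/* points at the last ->next, or at &head when empty */
};

static void cblist_adopt(struct cblist *dst, struct cblist *src)
{
	if (src->head == NULL)
		return;			/* nothing to adopt */
	*dst->tail = src->head;		/* hook the donor list onto the adopter's tail */
	dst->tail = src->tail;		/* the adopter's tail is now the donor's tail */
	src->head = NULL;		/* and the donor list is reset to empty */
	src->tail = &src->head;
}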
1277 1278
1278 /* 1279 /*
1279 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1280 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1280 * and move all callbacks from the outgoing CPU to the current one. 1281 * and move all callbacks from the outgoing CPU to the current one.
1281 * There can only be one CPU hotplug operation at a time, so no other 1282 * There can only be one CPU hotplug operation at a time, so no other
1282 * CPU can be attempting to update rcu_cpu_kthread_task. 1283 * CPU can be attempting to update rcu_cpu_kthread_task.
1283 */ 1284 */
1284 static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1285 static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1285 { 1286 {
1286 unsigned long flags; 1287 unsigned long flags;
1287 unsigned long mask; 1288 unsigned long mask;
1288 int need_report = 0; 1289 int need_report = 0;
1289 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1290 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1290 struct rcu_node *rnp; 1291 struct rcu_node *rnp;
1291 1292
1292 rcu_stop_cpu_kthread(cpu); 1293 rcu_stop_cpu_kthread(cpu);
1293 1294
1294 /* Exclude any attempts to start a new grace period. */ 1295 /* Exclude any attempts to start a new grace period. */
1295 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1296 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1296 1297
1297 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1298 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1298 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ 1299 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
1299 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1300 mask = rdp->grpmask; /* rnp->grplo is constant. */
1300 do { 1301 do {
1301 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1302 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1302 rnp->qsmaskinit &= ~mask; 1303 rnp->qsmaskinit &= ~mask;
1303 if (rnp->qsmaskinit != 0) { 1304 if (rnp->qsmaskinit != 0) {
1304 if (rnp != rdp->mynode) 1305 if (rnp != rdp->mynode)
1305 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1306 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1306 else 1307 else
1307 trace_rcu_grace_period(rsp->name, 1308 trace_rcu_grace_period(rsp->name,
1308 rnp->gpnum + 1 - 1309 rnp->gpnum + 1 -
1309 !!(rnp->qsmask & mask), 1310 !!(rnp->qsmask & mask),
1310 "cpuofl"); 1311 "cpuofl");
1311 break; 1312 break;
1312 } 1313 }
1313 if (rnp == rdp->mynode) { 1314 if (rnp == rdp->mynode) {
1314 trace_rcu_grace_period(rsp->name, 1315 trace_rcu_grace_period(rsp->name,
1315 rnp->gpnum + 1 - 1316 rnp->gpnum + 1 -
1316 !!(rnp->qsmask & mask), 1317 !!(rnp->qsmask & mask),
1317 "cpuofl"); 1318 "cpuofl");
1318 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 1319 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
1319 } else 1320 } else
1320 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1321 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1321 mask = rnp->grpmask; 1322 mask = rnp->grpmask;
1322 rnp = rnp->parent; 1323 rnp = rnp->parent;
1323 } while (rnp != NULL); 1324 } while (rnp != NULL);
1324 1325
1325 /* 1326 /*
1326 * We still hold the leaf rcu_node structure lock here, and 1327 * We still hold the leaf rcu_node structure lock here, and
1327 * irqs are still disabled. The reason for this subterfuge is 1328 * irqs are still disabled. The reason for this subterfuge is
1328 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1329 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
1329 * held leads to deadlock. 1330 * held leads to deadlock.
1330 */ 1331 */
1331 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1332 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1332 rnp = rdp->mynode; 1333 rnp = rdp->mynode;
1333 if (need_report & RCU_OFL_TASKS_NORM_GP) 1334 if (need_report & RCU_OFL_TASKS_NORM_GP)
1334 rcu_report_unblock_qs_rnp(rnp, flags); 1335 rcu_report_unblock_qs_rnp(rnp, flags);
1335 else 1336 else
1336 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1337 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1337 if (need_report & RCU_OFL_TASKS_EXP_GP) 1338 if (need_report & RCU_OFL_TASKS_EXP_GP)
1338 rcu_report_exp_rnp(rsp, rnp, true); 1339 rcu_report_exp_rnp(rsp, rnp, true);
1339 rcu_node_kthread_setaffinity(rnp, -1); 1340 rcu_node_kthread_setaffinity(rnp, -1);
1340 } 1341 }
1341 1342
1342 /* 1343 /*
1343 * Remove the specified CPU from the RCU hierarchy and move any pending 1344 * Remove the specified CPU from the RCU hierarchy and move any pending
1344 * callbacks that it might have to the current CPU. This code assumes 1345 * callbacks that it might have to the current CPU. This code assumes
1345 * that at least one CPU in the system will remain running at all times. 1346 * that at least one CPU in the system will remain running at all times.
1346 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks. 1347 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
1347 */ 1348 */
1348 static void rcu_offline_cpu(int cpu) 1349 static void rcu_offline_cpu(int cpu)
1349 { 1350 {
1350 __rcu_offline_cpu(cpu, &rcu_sched_state); 1351 __rcu_offline_cpu(cpu, &rcu_sched_state);
1351 __rcu_offline_cpu(cpu, &rcu_bh_state); 1352 __rcu_offline_cpu(cpu, &rcu_bh_state);
1352 rcu_preempt_offline_cpu(cpu); 1353 rcu_preempt_offline_cpu(cpu);
1353 } 1354 }
1354 1355
1355 #else /* #ifdef CONFIG_HOTPLUG_CPU */ 1356 #else /* #ifdef CONFIG_HOTPLUG_CPU */
1356 1357
1357 static void rcu_send_cbs_to_online(struct rcu_state *rsp) 1358 static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1358 { 1359 {
1359 } 1360 }
1360 1361
1361 static void rcu_offline_cpu(int cpu) 1362 static void rcu_offline_cpu(int cpu)
1362 { 1363 {
1363 } 1364 }
1364 1365
1365 #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ 1366 #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1366 1367
1367 /* 1368 /*
1368 * Invoke any RCU callbacks that have made it to the end of their grace 1369 * Invoke any RCU callbacks that have made it to the end of their grace
1369 * period. Throttle as specified by rdp->blimit. 1370 * period. Throttle as specified by rdp->blimit.
1370 */ 1371 */
1371 static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) 1372 static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1372 { 1373 {
1373 unsigned long flags; 1374 unsigned long flags;
1374 struct rcu_head *next, *list, **tail; 1375 struct rcu_head *next, *list, **tail;
1375 int bl, count; 1376 int bl, count;
1376 1377
1377 /* If no callbacks are ready, just return.*/ 1378 /* If no callbacks are ready, just return.*/
1378 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1379 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1379 trace_rcu_batch_start(rsp->name, 0, 0); 1380 trace_rcu_batch_start(rsp->name, 0, 0);
1380 trace_rcu_batch_end(rsp->name, 0); 1381 trace_rcu_batch_end(rsp->name, 0);
1381 return; 1382 return;
1382 } 1383 }
1383 1384
1384 /* 1385 /*
1385 * Extract the list of ready callbacks, disabling to prevent 1386 * Extract the list of ready callbacks, disabling to prevent
1386 * races with call_rcu() from interrupt handlers. 1387 * races with call_rcu() from interrupt handlers.
1387 */ 1388 */
1388 local_irq_save(flags); 1389 local_irq_save(flags);
1389 bl = rdp->blimit; 1390 bl = rdp->blimit;
1390 trace_rcu_batch_start(rsp->name, rdp->qlen, bl); 1391 trace_rcu_batch_start(rsp->name, rdp->qlen, bl);
1391 list = rdp->nxtlist; 1392 list = rdp->nxtlist;
1392 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1393 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1393 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1394 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1394 tail = rdp->nxttail[RCU_DONE_TAIL]; 1395 tail = rdp->nxttail[RCU_DONE_TAIL];
1395 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) 1396 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
1396 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) 1397 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
1397 rdp->nxttail[count] = &rdp->nxtlist; 1398 rdp->nxttail[count] = &rdp->nxtlist;
1398 local_irq_restore(flags); 1399 local_irq_restore(flags);
1399 1400
1400 /* Invoke callbacks. */ 1401 /* Invoke callbacks. */
1401 count = 0; 1402 count = 0;
1402 while (list) { 1403 while (list) {
1403 next = list->next; 1404 next = list->next;
1404 prefetch(next); 1405 prefetch(next);
1405 debug_rcu_head_unqueue(list); 1406 debug_rcu_head_unqueue(list);
1406 __rcu_reclaim(rsp->name, list); 1407 __rcu_reclaim(rsp->name, list);
1407 list = next; 1408 list = next;
1408 if (++count >= bl) 1409 if (++count >= bl)
1409 break; 1410 break;
1410 } 1411 }
1411 1412
1412 local_irq_save(flags); 1413 local_irq_save(flags);
1413 trace_rcu_batch_end(rsp->name, count); 1414 trace_rcu_batch_end(rsp->name, count);
1414 1415
1415 /* Update count, and requeue any remaining callbacks. */ 1416 /* Update count, and requeue any remaining callbacks. */
1416 rdp->qlen -= count; 1417 rdp->qlen -= count;
1417 rdp->n_cbs_invoked += count; 1418 rdp->n_cbs_invoked += count;
1418 if (list != NULL) { 1419 if (list != NULL) {
1419 *tail = rdp->nxtlist; 1420 *tail = rdp->nxtlist;
1420 rdp->nxtlist = list; 1421 rdp->nxtlist = list;
1421 for (count = 0; count < RCU_NEXT_SIZE; count++) 1422 for (count = 0; count < RCU_NEXT_SIZE; count++)
1422 if (&rdp->nxtlist == rdp->nxttail[count]) 1423 if (&rdp->nxtlist == rdp->nxttail[count])
1423 rdp->nxttail[count] = tail; 1424 rdp->nxttail[count] = tail;
1424 else 1425 else
1425 break; 1426 break;
1426 } 1427 }
1427 1428
1428 /* Reinstate batch limit if we have worked down the excess. */ 1429 /* Reinstate batch limit if we have worked down the excess. */
1429 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 1430 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
1430 rdp->blimit = blimit; 1431 rdp->blimit = blimit;
1431 1432
1432 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ 1433 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
1433 if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { 1434 if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
1434 rdp->qlen_last_fqs_check = 0; 1435 rdp->qlen_last_fqs_check = 0;
1435 rdp->n_force_qs_snap = rsp->n_force_qs; 1436 rdp->n_force_qs_snap = rsp->n_force_qs;
1436 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 1437 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
1437 rdp->qlen_last_fqs_check = rdp->qlen; 1438 rdp->qlen_last_fqs_check = rdp->qlen;
1438 1439
1439 local_irq_restore(flags); 1440 local_irq_restore(flags);
1440 1441
1441 /* Re-invoke RCU core processing if there are callbacks remaining. */ 1442 /* Re-invoke RCU core processing if there are callbacks remaining. */
1442 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1443 if (cpu_has_callbacks_ready_to_invoke(rdp))
1443 invoke_rcu_core(); 1444 invoke_rcu_core();
1444 } 1445 }
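
The throttling logic in rcu_do_batch() boils down to: detach the ready callbacks, invoke at most rdp->blimit of them, and put whatever is left back at the front of the queue. A stripped-down, single-threaded sketch with hypothetical names (struct cb, ready, do_batch) follows; the real function additionally disables interrupts around the list surgery and keeps the segmented nxttail[] pointers consistent.

#include <stddef.h>

struct cb {
	struct cb *next;
	void (*func)(struct cb *);
};

static struct cb *ready;	/* callbacks whose grace period has ended */

static void do_batch(int bl)
{
	struct cb *list = ready, *next;
	int count = 0;

	ready = NULL;				/* detach the whole ready list */
	while (list) {
		next = list->next;
		list->func(list);		/* invoke one callback */
		list = next;
		if (++count >= bl)
			break;			/* throttle: stop after bl callbacks */
	}
	if (list) {				/* requeue leftovers at the front, */
		struct cb **tail = &list;	/* ahead of anything queued meanwhile */

		while (*tail)
			tail = &(*tail)->next;
		*tail = ready;
		ready = list;
	}
}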
1445 1446
1446 /* 1447 /*
1447 * Check to see if this CPU is in a non-context-switch quiescent state 1448 * Check to see if this CPU is in a non-context-switch quiescent state
1448 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1449 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1449 * Also schedule RCU core processing. 1450 * Also schedule RCU core processing.
1450 * 1451 *
1451 * This function must be called from hardirq context. It is normally 1452 * This function must be called from hardirq context. It is normally
1452 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1453 * invoked from the scheduling-clock interrupt. If rcu_pending returns
1453 * false, there is no point in invoking rcu_check_callbacks(). 1454 * false, there is no point in invoking rcu_check_callbacks().
1454 */ 1455 */
1455 void rcu_check_callbacks(int cpu, int user) 1456 void rcu_check_callbacks(int cpu, int user)
1456 { 1457 {
1457 trace_rcu_utilization("Start scheduler-tick"); 1458 trace_rcu_utilization("Start scheduler-tick");
1458 if (user || rcu_is_cpu_rrupt_from_idle()) { 1459 if (user || rcu_is_cpu_rrupt_from_idle()) {
1459 1460
1460 /* 1461 /*
1461 * Get here if this CPU took its interrupt from user 1462 * Get here if this CPU took its interrupt from user
1462 * mode or from the idle loop, and if this is not a 1463 * mode or from the idle loop, and if this is not a
1463 * nested interrupt. In this case, the CPU is in 1464 * nested interrupt. In this case, the CPU is in
1464 * a quiescent state, so note it. 1465 * a quiescent state, so note it.
1465 * 1466 *
1466 * No memory barrier is required here because both 1467 * No memory barrier is required here because both
1467 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local 1468 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
1468 * variables that other CPUs neither access nor modify, 1469 * variables that other CPUs neither access nor modify,
1469 * at least not while the corresponding CPU is online. 1470 * at least not while the corresponding CPU is online.
1470 */ 1471 */
1471 1472
1472 rcu_sched_qs(cpu); 1473 rcu_sched_qs(cpu);
1473 rcu_bh_qs(cpu); 1474 rcu_bh_qs(cpu);
1474 1475
1475 } else if (!in_softirq()) { 1476 } else if (!in_softirq()) {
1476 1477
1477 /* 1478 /*
1478 * Get here if this CPU did not take its interrupt from 1479 * Get here if this CPU did not take its interrupt from
1479 * softirq, in other words, if it is not interrupting 1480 * softirq, in other words, if it is not interrupting
1480 * a rcu_bh read-side critical section. This is an _bh 1481 * a rcu_bh read-side critical section. This is an _bh
1481 * critical section, so note it. 1482 * critical section, so note it.
1482 */ 1483 */
1483 1484
1484 rcu_bh_qs(cpu); 1485 rcu_bh_qs(cpu);
1485 } 1486 }
1486 rcu_preempt_check_callbacks(cpu); 1487 rcu_preempt_check_callbacks(cpu);
1487 if (rcu_pending(cpu)) 1488 if (rcu_pending(cpu))
1488 invoke_rcu_core(); 1489 invoke_rcu_core();
1489 trace_rcu_utilization("End scheduler-tick"); 1490 trace_rcu_utilization("End scheduler-tick");
1490 } 1491 }
1491 1492
1492 #ifdef CONFIG_SMP 1493 #ifdef CONFIG_SMP
1493 1494
1494 /* 1495 /*
1495 * Scan the leaf rcu_node structures, processing dyntick state for any that 1496 * Scan the leaf rcu_node structures, processing dyntick state for any that
1496 * have not yet encountered a quiescent state, using the function specified. 1497 * have not yet encountered a quiescent state, using the function specified.
1497 * Also initiate boosting for any threads blocked on the root rcu_node. 1498 * Also initiate boosting for any threads blocked on the root rcu_node.
1498 * 1499 *
1499 * The caller must have suppressed start of new grace periods. 1500 * The caller must have suppressed start of new grace periods.
1500 */ 1501 */
1501 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 1502 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1502 { 1503 {
1503 unsigned long bit; 1504 unsigned long bit;
1504 int cpu; 1505 int cpu;
1505 unsigned long flags; 1506 unsigned long flags;
1506 unsigned long mask; 1507 unsigned long mask;
1507 struct rcu_node *rnp; 1508 struct rcu_node *rnp;
1508 1509
1509 rcu_for_each_leaf_node(rsp, rnp) { 1510 rcu_for_each_leaf_node(rsp, rnp) {
1510 mask = 0; 1511 mask = 0;
1511 raw_spin_lock_irqsave(&rnp->lock, flags); 1512 raw_spin_lock_irqsave(&rnp->lock, flags);
1512 if (!rcu_gp_in_progress(rsp)) { 1513 if (!rcu_gp_in_progress(rsp)) {
1513 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1514 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1514 return; 1515 return;
1515 } 1516 }
1516 if (rnp->qsmask == 0) { 1517 if (rnp->qsmask == 0) {
1517 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 1518 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
1518 continue; 1519 continue;
1519 } 1520 }
1520 cpu = rnp->grplo; 1521 cpu = rnp->grplo;
1521 bit = 1; 1522 bit = 1;
1522 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 1523 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1523 if ((rnp->qsmask & bit) != 0 && 1524 if ((rnp->qsmask & bit) != 0 &&
1524 f(per_cpu_ptr(rsp->rda, cpu))) 1525 f(per_cpu_ptr(rsp->rda, cpu)))
1525 mask |= bit; 1526 mask |= bit;
1526 } 1527 }
1527 if (mask != 0) { 1528 if (mask != 0) {
1528 1529
1529 /* rcu_report_qs_rnp() releases rnp->lock. */ 1530 /* rcu_report_qs_rnp() releases rnp->lock. */
1530 rcu_report_qs_rnp(mask, rsp, rnp, flags); 1531 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1531 continue; 1532 continue;
1532 } 1533 }
1533 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1534 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1534 } 1535 }
1535 rnp = rcu_get_root(rsp); 1536 rnp = rcu_get_root(rsp);
1536 if (rnp->qsmask == 0) { 1537 if (rnp->qsmask == 0) {
1537 raw_spin_lock_irqsave(&rnp->lock, flags); 1538 raw_spin_lock_irqsave(&rnp->lock, flags);
1538 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ 1539 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1539 } 1540 }
1540 } 1541 }
1541 1542
1542 /* 1543 /*
1543 * Force quiescent states on reluctant CPUs, and also detect which 1544 * Force quiescent states on reluctant CPUs, and also detect which
1544 * CPUs are in dyntick-idle mode. 1545 * CPUs are in dyntick-idle mode.
1545 */ 1546 */
1546 static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1547 static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1547 { 1548 {
1548 unsigned long flags; 1549 unsigned long flags;
1549 struct rcu_node *rnp = rcu_get_root(rsp); 1550 struct rcu_node *rnp = rcu_get_root(rsp);
1550 1551
1551 trace_rcu_utilization("Start fqs"); 1552 trace_rcu_utilization("Start fqs");
1552 if (!rcu_gp_in_progress(rsp)) { 1553 if (!rcu_gp_in_progress(rsp)) {
1553 trace_rcu_utilization("End fqs"); 1554 trace_rcu_utilization("End fqs");
1554 return; /* No grace period in progress, nothing to force. */ 1555 return; /* No grace period in progress, nothing to force. */
1555 } 1556 }
1556 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { 1557 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1557 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1558 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1558 trace_rcu_utilization("End fqs"); 1559 trace_rcu_utilization("End fqs");
1559 return; /* Someone else is already on the job. */ 1560 return; /* Someone else is already on the job. */
1560 } 1561 }
1561 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) 1562 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
1562 goto unlock_fqs_ret; /* no emergency and done recently. */ 1563 goto unlock_fqs_ret; /* no emergency and done recently. */
1563 rsp->n_force_qs++; 1564 rsp->n_force_qs++;
1564 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1565 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1565 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1566 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1566 if (!rcu_gp_in_progress(rsp)) { 1567 if (!rcu_gp_in_progress(rsp)) {
1567 rsp->n_force_qs_ngp++; 1568 rsp->n_force_qs_ngp++;
1568 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1569 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1569 goto unlock_fqs_ret; /* no GP in progress, time updated. */ 1570 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1570 } 1571 }
1571 rsp->fqs_active = 1; 1572 rsp->fqs_active = 1;
1572 switch (rsp->fqs_state) { 1573 switch (rsp->fqs_state) {
1573 case RCU_GP_IDLE: 1574 case RCU_GP_IDLE:
1574 case RCU_GP_INIT: 1575 case RCU_GP_INIT:
1575 1576
1576 break; /* grace period idle or initializing, ignore. */ 1577 break; /* grace period idle or initializing, ignore. */
1577 1578
1578 case RCU_SAVE_DYNTICK: 1579 case RCU_SAVE_DYNTICK:
1579 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1580 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1580 break; /* So gcc recognizes the dead code. */ 1581 break; /* So gcc recognizes the dead code. */
1581 1582
1582 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1583 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1583 1584
1584 /* Record dyntick-idle state. */ 1585 /* Record dyntick-idle state. */
1585 force_qs_rnp(rsp, dyntick_save_progress_counter); 1586 force_qs_rnp(rsp, dyntick_save_progress_counter);
1586 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1587 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1587 if (rcu_gp_in_progress(rsp)) 1588 if (rcu_gp_in_progress(rsp))
1588 rsp->fqs_state = RCU_FORCE_QS; 1589 rsp->fqs_state = RCU_FORCE_QS;
1589 break; 1590 break;
1590 1591
1591 case RCU_FORCE_QS: 1592 case RCU_FORCE_QS:
1592 1593
1593 /* Check dyntick-idle state, send IPI to laggards. */ 1594 /* Check dyntick-idle state, send IPI to laggards. */
1594 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1595 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1595 force_qs_rnp(rsp, rcu_implicit_dynticks_qs); 1596 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1596 1597
1597 /* Leave state in case more forcing is required. */ 1598 /* Leave state in case more forcing is required. */
1598 1599
1599 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1600 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1600 break; 1601 break;
1601 } 1602 }
1602 rsp->fqs_active = 0; 1603 rsp->fqs_active = 0;
1603 if (rsp->fqs_need_gp) { 1604 if (rsp->fqs_need_gp) {
1604 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ 1605 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1605 rsp->fqs_need_gp = 0; 1606 rsp->fqs_need_gp = 0;
1606 rcu_start_gp(rsp, flags); /* releases rnp->lock */ 1607 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1607 trace_rcu_utilization("End fqs"); 1608 trace_rcu_utilization("End fqs");
1608 return; 1609 return;
1609 } 1610 }
1610 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1611 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1611 unlock_fqs_ret: 1612 unlock_fqs_ret:
1612 raw_spin_unlock_irqrestore(&rsp->fqslock, flags); 1613 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1613 trace_rcu_utilization("End fqs"); 1614 trace_rcu_utilization("End fqs");
1614 } 1615 }
1615 1616
1616 #else /* #ifdef CONFIG_SMP */ 1617 #else /* #ifdef CONFIG_SMP */
1617 1618
1618 static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1619 static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1619 { 1620 {
1620 set_need_resched(); 1621 set_need_resched();
1621 } 1622 }
1622 1623
1623 #endif /* #else #ifdef CONFIG_SMP */ 1624 #endif /* #else #ifdef CONFIG_SMP */
1624 1625
1625 /* 1626 /*
1626 * This does the RCU core processing work for the specified rcu_state 1627 * This does the RCU core processing work for the specified rcu_state
1627 * and rcu_data structures. This may be called only from the CPU to 1628 * and rcu_data structures. This may be called only from the CPU to
1628 * whom the rdp belongs. 1629 * whom the rdp belongs.
1629 */ 1630 */
1630 static void 1631 static void
1631 __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1632 __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1632 { 1633 {
1633 unsigned long flags; 1634 unsigned long flags;
1634 1635
1635 WARN_ON_ONCE(rdp->beenonline == 0); 1636 WARN_ON_ONCE(rdp->beenonline == 0);
1636 1637
1637 /* 1638 /*
1638 * If an RCU GP has gone long enough, go check for dyntick 1639 * If an RCU GP has gone long enough, go check for dyntick
1639 * idle CPUs and, if needed, send resched IPIs. 1640 * idle CPUs and, if needed, send resched IPIs.
1640 */ 1641 */
1641 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 1642 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1642 force_quiescent_state(rsp, 1); 1643 force_quiescent_state(rsp, 1);
1643 1644
1644 /* 1645 /*
1645 * Advance callbacks in response to end of earlier grace 1646 * Advance callbacks in response to end of earlier grace
1646 * period that some other CPU ended. 1647 * period that some other CPU ended.
1647 */ 1648 */
1648 rcu_process_gp_end(rsp, rdp); 1649 rcu_process_gp_end(rsp, rdp);
1649 1650
1650 /* Update RCU state based on any recent quiescent states. */ 1651 /* Update RCU state based on any recent quiescent states. */
1651 rcu_check_quiescent_state(rsp, rdp); 1652 rcu_check_quiescent_state(rsp, rdp);
1652 1653
1653 /* Does this CPU require a not-yet-started grace period? */ 1654 /* Does this CPU require a not-yet-started grace period? */
1654 if (cpu_needs_another_gp(rsp, rdp)) { 1655 if (cpu_needs_another_gp(rsp, rdp)) {
1655 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 1656 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1656 rcu_start_gp(rsp, flags); /* releases above lock */ 1657 rcu_start_gp(rsp, flags); /* releases above lock */
1657 } 1658 }
1658 1659
1659 /* If there are callbacks ready, invoke them. */ 1660 /* If there are callbacks ready, invoke them. */
1660 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1661 if (cpu_has_callbacks_ready_to_invoke(rdp))
1661 invoke_rcu_callbacks(rsp, rdp); 1662 invoke_rcu_callbacks(rsp, rdp);
1662 } 1663 }
1663 1664
1664 /* 1665 /*
1665 * Do RCU core processing for the current CPU. 1666 * Do RCU core processing for the current CPU.
1666 */ 1667 */
1667 static void rcu_process_callbacks(struct softirq_action *unused) 1668 static void rcu_process_callbacks(struct softirq_action *unused)
1668 { 1669 {
1669 trace_rcu_utilization("Start RCU core"); 1670 trace_rcu_utilization("Start RCU core");
1670 __rcu_process_callbacks(&rcu_sched_state, 1671 __rcu_process_callbacks(&rcu_sched_state,
1671 &__get_cpu_var(rcu_sched_data)); 1672 &__get_cpu_var(rcu_sched_data));
1672 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1673 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1673 rcu_preempt_process_callbacks(); 1674 rcu_preempt_process_callbacks();
1674 trace_rcu_utilization("End RCU core"); 1675 trace_rcu_utilization("End RCU core");
1675 } 1676 }
1676 1677
1677 /* 1678 /*
1678 * Schedule RCU callback invocation. If the specified type of RCU 1679 * Schedule RCU callback invocation. If the specified type of RCU
1679 * does not support RCU priority boosting, just do a direct call, 1680 * does not support RCU priority boosting, just do a direct call,
1680 * otherwise wake up the per-CPU kernel kthread. Note that because we 1681 * otherwise wake up the per-CPU kernel kthread. Note that because we
1681 * are running on the current CPU with interrupts disabled, the 1682 * are running on the current CPU with interrupts disabled, the
1682 * rcu_cpu_kthread_task cannot disappear out from under us. 1683 * rcu_cpu_kthread_task cannot disappear out from under us.
1683 */ 1684 */
1684 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1685 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1685 { 1686 {
1686 if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) 1687 if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
1687 return; 1688 return;
1688 if (likely(!rsp->boost)) { 1689 if (likely(!rsp->boost)) {
1689 rcu_do_batch(rsp, rdp); 1690 rcu_do_batch(rsp, rdp);
1690 return; 1691 return;
1691 } 1692 }
1692 invoke_rcu_callbacks_kthread(); 1693 invoke_rcu_callbacks_kthread();
1693 } 1694 }
1694 1695
1695 static void invoke_rcu_core(void) 1696 static void invoke_rcu_core(void)
1696 { 1697 {
1697 raise_softirq(RCU_SOFTIRQ); 1698 raise_softirq(RCU_SOFTIRQ);
1698 } 1699 }
1699 1700
1700 static void 1701 static void
1701 __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1702 __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1702 struct rcu_state *rsp) 1703 struct rcu_state *rsp)
1703 { 1704 {
1704 unsigned long flags; 1705 unsigned long flags;
1705 struct rcu_data *rdp; 1706 struct rcu_data *rdp;
1706 1707
1707 debug_rcu_head_queue(head); 1708 debug_rcu_head_queue(head);
1708 head->func = func; 1709 head->func = func;
1709 head->next = NULL; 1710 head->next = NULL;
1710 1711
1711 smp_mb(); /* Ensure RCU update seen before callback registry. */ 1712 smp_mb(); /* Ensure RCU update seen before callback registry. */
1712 1713
1713 /* 1714 /*
1714 * Opportunistically note grace-period endings and beginnings. 1715 * Opportunistically note grace-period endings and beginnings.
1715 * Note that we might see a beginning right after we see an 1716 * Note that we might see a beginning right after we see an
1716 * end, but never vice versa, since this CPU has to pass through 1717 * end, but never vice versa, since this CPU has to pass through
1717 * a quiescent state betweentimes. 1718 * a quiescent state betweentimes.
1718 */ 1719 */
1719 local_irq_save(flags); 1720 local_irq_save(flags);
1720 rdp = this_cpu_ptr(rsp->rda); 1721 rdp = this_cpu_ptr(rsp->rda);
1721 1722
1722 /* Add the callback to our list. */ 1723 /* Add the callback to our list. */
1723 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1724 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1724 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1725 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1725 rdp->qlen++; 1726 rdp->qlen++;
1726 1727
1727 if (__is_kfree_rcu_offset((unsigned long)func)) 1728 if (__is_kfree_rcu_offset((unsigned long)func))
1728 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 1729 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
1729 rdp->qlen); 1730 rdp->qlen);
1730 else 1731 else
1731 trace_rcu_callback(rsp->name, head, rdp->qlen); 1732 trace_rcu_callback(rsp->name, head, rdp->qlen);
1732 1733
1733 /* If interrupts were disabled, don't dive into RCU core. */ 1734 /* If interrupts were disabled, don't dive into RCU core. */
1734 if (irqs_disabled_flags(flags)) { 1735 if (irqs_disabled_flags(flags)) {
1735 local_irq_restore(flags); 1736 local_irq_restore(flags);
1736 return; 1737 return;
1737 } 1738 }
1738 1739
1739 /* 1740 /*
1740 * Force the grace period if too many callbacks or too long waiting. 1741 * Force the grace period if too many callbacks or too long waiting.
1741 * Enforce hysteresis, and don't invoke force_quiescent_state() 1742 * Enforce hysteresis, and don't invoke force_quiescent_state()
1742 * if some other CPU has recently done so. Also, don't bother 1743 * if some other CPU has recently done so. Also, don't bother
1743 * invoking force_quiescent_state() if the newly enqueued callback 1744 * invoking force_quiescent_state() if the newly enqueued callback
1744 * is the only one waiting for a grace period to complete. 1745 * is the only one waiting for a grace period to complete.
1745 */ 1746 */
1746 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1747 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1747 1748
1748 /* Are we ignoring a completed grace period? */ 1749 /* Are we ignoring a completed grace period? */
1749 rcu_process_gp_end(rsp, rdp); 1750 rcu_process_gp_end(rsp, rdp);
1750 check_for_new_grace_period(rsp, rdp); 1751 check_for_new_grace_period(rsp, rdp);
1751 1752
1752 /* Start a new grace period if one not already started. */ 1753 /* Start a new grace period if one not already started. */
1753 if (!rcu_gp_in_progress(rsp)) { 1754 if (!rcu_gp_in_progress(rsp)) {
1754 unsigned long nestflag; 1755 unsigned long nestflag;
1755 struct rcu_node *rnp_root = rcu_get_root(rsp); 1756 struct rcu_node *rnp_root = rcu_get_root(rsp);
1756 1757
1757 raw_spin_lock_irqsave(&rnp_root->lock, nestflag); 1758 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1758 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ 1759 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
1759 } else { 1760 } else {
1760 /* Give the grace period a kick. */ 1761 /* Give the grace period a kick. */
1761 rdp->blimit = LONG_MAX; 1762 rdp->blimit = LONG_MAX;
1762 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1763 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1763 *rdp->nxttail[RCU_DONE_TAIL] != head) 1764 *rdp->nxttail[RCU_DONE_TAIL] != head)
1764 force_quiescent_state(rsp, 0); 1765 force_quiescent_state(rsp, 0);
1765 rdp->n_force_qs_snap = rsp->n_force_qs; 1766 rdp->n_force_qs_snap = rsp->n_force_qs;
1766 rdp->qlen_last_fqs_check = rdp->qlen; 1767 rdp->qlen_last_fqs_check = rdp->qlen;
1767 } 1768 }
1768 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 1769 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1769 force_quiescent_state(rsp, 1); 1770 force_quiescent_state(rsp, 1);
1770 local_irq_restore(flags); 1771 local_irq_restore(flags);
1771 } 1772 }
1772 1773
1773 /* 1774 /*
1774 * Queue an RCU-sched callback for invocation after a grace period. 1775 * Queue an RCU-sched callback for invocation after a grace period.
1775 */ 1776 */
1776 void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1777 void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1777 { 1778 {
1778 __call_rcu(head, func, &rcu_sched_state); 1779 __call_rcu(head, func, &rcu_sched_state);
1779 } 1780 }
1780 EXPORT_SYMBOL_GPL(call_rcu_sched); 1781 EXPORT_SYMBOL_GPL(call_rcu_sched);
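
For reference, a typical (hypothetical) caller of call_rcu_sched() embeds the rcu_head in its own structure and uses container_of() in the callback to recover and free the object once the grace period has elapsed:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct bar {
	int data;
	struct rcu_head rcu;	/* embedded so the callback can find us */
};

static void bar_free_rcu(struct rcu_head *head)
{
	struct bar *b = container_of(head, struct bar, rcu);

	kfree(b);		/* safe: a full grace period has elapsed */
}

static void bar_release(struct bar *b)
{
	call_rcu_sched(&b->rcu, bar_free_rcu);
}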
1781 1782
1782 /* 1783 /*
1783 * Queue an RCU callback for invocation after a quicker grace period. 1784 * Queue an RCU callback for invocation after a quicker grace period.
1784 */ 1785 */
1785 void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1786 void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1786 { 1787 {
1787 __call_rcu(head, func, &rcu_bh_state); 1788 __call_rcu(head, func, &rcu_bh_state);
1788 } 1789 }
1789 EXPORT_SYMBOL_GPL(call_rcu_bh); 1790 EXPORT_SYMBOL_GPL(call_rcu_bh);
1790 1791
1791 /** 1792 /**
1792 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 1793 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1793 * 1794 *
1794 * Control will return to the caller some time after a full rcu-sched 1795 * Control will return to the caller some time after a full rcu-sched
1795 * grace period has elapsed, in other words after all currently executing 1796 * grace period has elapsed, in other words after all currently executing
1796 * rcu-sched read-side critical sections have completed. These read-side 1797 * rcu-sched read-side critical sections have completed. These read-side
1797 * critical sections are delimited by rcu_read_lock_sched() and 1798 * critical sections are delimited by rcu_read_lock_sched() and
1798 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), 1799 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
1799 * local_irq_disable(), and so on may be used in place of 1800 * local_irq_disable(), and so on may be used in place of
1800 * rcu_read_lock_sched(). 1801 * rcu_read_lock_sched().
1801 * 1802 *
1802 * This means that all preempt_disable code sequences, including NMI and 1803 * This means that all preempt_disable code sequences, including NMI and
1803 * hardware-interrupt handlers, in progress on entry will have completed 1804 * hardware-interrupt handlers, in progress on entry will have completed
1804 * before this primitive returns. However, this does not guarantee that 1805 * before this primitive returns. However, this does not guarantee that
1805 * softirq handlers will have completed, since in some kernels, these 1806 * softirq handlers will have completed, since in some kernels, these
1806 * handlers can run in process context, and can block. 1807 * handlers can run in process context, and can block.
1807 * 1808 *
1808 * This primitive provides the guarantees made by the (now removed) 1809 * This primitive provides the guarantees made by the (now removed)
1809 * synchronize_kernel() API. In contrast, synchronize_rcu() only 1810 * synchronize_kernel() API. In contrast, synchronize_rcu() only
1810 * guarantees that rcu_read_lock() sections will have completed. 1811 * guarantees that rcu_read_lock() sections will have completed.
1811 * In "classic RCU", these two guarantees happen to be one and 1812 * In "classic RCU", these two guarantees happen to be one and
1812 * the same, but can differ in realtime RCU implementations. 1813 * the same, but can differ in realtime RCU implementations.
1813 */ 1814 */
1814 void synchronize_sched(void) 1815 void synchronize_sched(void)
1815 { 1816 {
1816 if (rcu_blocking_is_gp()) 1817 if (rcu_blocking_is_gp())
1817 return; 1818 return;
1818 wait_rcu_gp(call_rcu_sched); 1819 wait_rcu_gp(call_rcu_sched);
1819 } 1820 }
1820 EXPORT_SYMBOL_GPL(synchronize_sched); 1821 EXPORT_SYMBOL_GPL(synchronize_sched);
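
A minimal (hypothetical) update-side usage of synchronize_sched() looks like the sketch below: readers rely only on preemption being disabled (rcu_read_lock_sched()), while the updater unpublishes the old object, waits for all such readers, and only then frees it.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	int data;
};

static struct foo __rcu *global_foo;
static DEFINE_SPINLOCK(foo_lock);

static int read_foo(void)
{
	struct foo *p;
	int val = -1;

	rcu_read_lock_sched();			/* disables preemption */
	p = rcu_dereference_sched(global_foo);
	if (p)
		val = p->data;
	rcu_read_unlock_sched();
	return val;
}

static void update_foo(struct foo *newp)
{
	struct foo *oldp;

	spin_lock(&foo_lock);
	oldp = rcu_dereference_protected(global_foo,
					 lockdep_is_held(&foo_lock));
	rcu_assign_pointer(global_foo, newp);
	spin_unlock(&foo_lock);

	synchronize_sched();			/* wait out all sched readers */
	kfree(oldp);				/* now nobody can still see oldp */
}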
1821 1822
1822 /** 1823 /**
1823 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. 1824 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
1824 * 1825 *
1825 * Control will return to the caller some time after a full rcu_bh grace 1826 * Control will return to the caller some time after a full rcu_bh grace
1826 * period has elapsed, in other words after all currently executing rcu_bh 1827 * period has elapsed, in other words after all currently executing rcu_bh
1827 * read-side critical sections have completed. RCU read-side critical 1828 * read-side critical sections have completed. RCU read-side critical
1828 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 1829 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
1829 * and may be nested. 1830 * and may be nested.
1830 */ 1831 */
1831 void synchronize_rcu_bh(void) 1832 void synchronize_rcu_bh(void)
1832 { 1833 {
1833 if (rcu_blocking_is_gp()) 1834 if (rcu_blocking_is_gp())
1834 return; 1835 return;
1835 wait_rcu_gp(call_rcu_bh); 1836 wait_rcu_gp(call_rcu_bh);
1836 } 1837 }
1837 EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1838 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1838 1839
1839 /* 1840 /*
1840 * Check to see if there is any immediate RCU-related work to be done 1841 * Check to see if there is any immediate RCU-related work to be done
1841 * by the current CPU, for the specified type of RCU, returning 1 if so. 1842 * by the current CPU, for the specified type of RCU, returning 1 if so.
1842 * The checks are in order of increasing expense: checks that can be 1843 * The checks are in order of increasing expense: checks that can be
1843 * carried out against CPU-local state are performed first. However, 1844 * carried out against CPU-local state are performed first. However,
1844 * we must check for CPU stalls first, else we might not get a chance. 1845 * we must check for CPU stalls first, else we might not get a chance.
1845 */ 1846 */
1846 static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) 1847 static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1847 { 1848 {
1848 struct rcu_node *rnp = rdp->mynode; 1849 struct rcu_node *rnp = rdp->mynode;
1849 1850
1850 rdp->n_rcu_pending++; 1851 rdp->n_rcu_pending++;
1851 1852
1852 /* Check for CPU stalls, if enabled. */ 1853 /* Check for CPU stalls, if enabled. */
1853 check_cpu_stall(rsp, rdp); 1854 check_cpu_stall(rsp, rdp);
1854 1855
1855 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1856 /* Is the RCU core waiting for a quiescent state from this CPU? */
1856 if (rcu_scheduler_fully_active && 1857 if (rcu_scheduler_fully_active &&
1857 rdp->qs_pending && !rdp->passed_quiesce) { 1858 rdp->qs_pending && !rdp->passed_quiesce) {
1858 1859
1859 /* 1860 /*
1860 * If force_quiescent_state() coming soon and this CPU 1861 * If force_quiescent_state() coming soon and this CPU
1861 * needs a quiescent state, and this is either RCU-sched 1862 * needs a quiescent state, and this is either RCU-sched
1862 * or RCU-bh, force a local reschedule. 1863 * or RCU-bh, force a local reschedule.
1863 */ 1864 */
1864 rdp->n_rp_qs_pending++; 1865 rdp->n_rp_qs_pending++;
1865 if (!rdp->preemptible && 1866 if (!rdp->preemptible &&
1866 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1867 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1867 jiffies)) 1868 jiffies))
1868 set_need_resched(); 1869 set_need_resched();
1869 } else if (rdp->qs_pending && rdp->passed_quiesce) { 1870 } else if (rdp->qs_pending && rdp->passed_quiesce) {
1870 rdp->n_rp_report_qs++; 1871 rdp->n_rp_report_qs++;
1871 return 1; 1872 return 1;
1872 } 1873 }
1873 1874
1874 /* Does this CPU have callbacks ready to invoke? */ 1875 /* Does this CPU have callbacks ready to invoke? */
1875 if (cpu_has_callbacks_ready_to_invoke(rdp)) { 1876 if (cpu_has_callbacks_ready_to_invoke(rdp)) {
1876 rdp->n_rp_cb_ready++; 1877 rdp->n_rp_cb_ready++;
1877 return 1; 1878 return 1;
1878 } 1879 }
1879 1880
1880 /* Has RCU gone idle with this CPU needing another grace period? */ 1881 /* Has RCU gone idle with this CPU needing another grace period? */
1881 if (cpu_needs_another_gp(rsp, rdp)) { 1882 if (cpu_needs_another_gp(rsp, rdp)) {
1882 rdp->n_rp_cpu_needs_gp++; 1883 rdp->n_rp_cpu_needs_gp++;
1883 return 1; 1884 return 1;
1884 } 1885 }
1885 1886
1886 /* Has another RCU grace period completed? */ 1887 /* Has another RCU grace period completed? */
1887 if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ 1888 if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
1888 rdp->n_rp_gp_completed++; 1889 rdp->n_rp_gp_completed++;
1889 return 1; 1890 return 1;
1890 } 1891 }
1891 1892
1892 /* Has a new RCU grace period started? */ 1893 /* Has a new RCU grace period started? */
1893 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ 1894 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
1894 rdp->n_rp_gp_started++; 1895 rdp->n_rp_gp_started++;
1895 return 1; 1896 return 1;
1896 } 1897 }
1897 1898
1898 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1899 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1899 if (rcu_gp_in_progress(rsp) && 1900 if (rcu_gp_in_progress(rsp) &&
1900 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { 1901 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
1901 rdp->n_rp_need_fqs++; 1902 rdp->n_rp_need_fqs++;
1902 return 1; 1903 return 1;
1903 } 1904 }
1904 1905
1905 /* nothing to do */ 1906 /* nothing to do */
1906 rdp->n_rp_need_nothing++; 1907 rdp->n_rp_need_nothing++;
1907 return 0; 1908 return 0;
1908 } 1909 }
1909 1910
1910 /* 1911 /*
1911 * Check to see if there is any immediate RCU-related work to be done 1912 * Check to see if there is any immediate RCU-related work to be done
1912 * by the current CPU, returning 1 if so. This function is part of the 1913 * by the current CPU, returning 1 if so. This function is part of the
1913 * RCU implementation; it is -not- an exported member of the RCU API. 1914 * RCU implementation; it is -not- an exported member of the RCU API.
1914 */ 1915 */
1915 static int rcu_pending(int cpu) 1916 static int rcu_pending(int cpu)
1916 { 1917 {
1917 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || 1918 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
1918 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || 1919 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
1919 rcu_preempt_pending(cpu); 1920 rcu_preempt_pending(cpu);
1920 } 1921 }
1921 1922
1922 /* 1923 /*
1923 * Check to see if any future RCU-related work will need to be done 1924 * Check to see if any future RCU-related work will need to be done
1924 * by the current CPU, even if none need be done immediately, returning 1925 * by the current CPU, even if none need be done immediately, returning
1925 * 1 if so. 1926 * 1 if so.
1926 */ 1927 */
1927 static int rcu_cpu_has_callbacks(int cpu) 1928 static int rcu_cpu_has_callbacks(int cpu)
1928 { 1929 {
1929 /* RCU callbacks either ready or pending? */ 1930 /* RCU callbacks either ready or pending? */
1930 return per_cpu(rcu_sched_data, cpu).nxtlist || 1931 return per_cpu(rcu_sched_data, cpu).nxtlist ||
1931 per_cpu(rcu_bh_data, cpu).nxtlist || 1932 per_cpu(rcu_bh_data, cpu).nxtlist ||
1932 rcu_preempt_needs_cpu(cpu); 1933 rcu_preempt_needs_cpu(cpu);
1933 } 1934 }
1934 1935
1935 static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1936 static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1936 static atomic_t rcu_barrier_cpu_count; 1937 static atomic_t rcu_barrier_cpu_count;
1937 static DEFINE_MUTEX(rcu_barrier_mutex); 1938 static DEFINE_MUTEX(rcu_barrier_mutex);
1938 static struct completion rcu_barrier_completion; 1939 static struct completion rcu_barrier_completion;
1939 1940
1940 static void rcu_barrier_callback(struct rcu_head *notused) 1941 static void rcu_barrier_callback(struct rcu_head *notused)
1941 { 1942 {
1942 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 1943 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1943 complete(&rcu_barrier_completion); 1944 complete(&rcu_barrier_completion);
1944 } 1945 }
1945 1946
1946 /* 1947 /*
1947 * Called with preemption disabled, and from cross-cpu IRQ context. 1948 * Called with preemption disabled, and from cross-cpu IRQ context.
1948 */ 1949 */
1949 static void rcu_barrier_func(void *type) 1950 static void rcu_barrier_func(void *type)
1950 { 1951 {
1951 int cpu = smp_processor_id(); 1952 int cpu = smp_processor_id();
1952 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); 1953 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
1953 void (*call_rcu_func)(struct rcu_head *head, 1954 void (*call_rcu_func)(struct rcu_head *head,
1954 void (*func)(struct rcu_head *head)); 1955 void (*func)(struct rcu_head *head));
1955 1956
1956 atomic_inc(&rcu_barrier_cpu_count); 1957 atomic_inc(&rcu_barrier_cpu_count);
1957 call_rcu_func = type; 1958 call_rcu_func = type;
1958 call_rcu_func(head, rcu_barrier_callback); 1959 call_rcu_func(head, rcu_barrier_callback);
1959 } 1960 }
1960 1961
1961 /* 1962 /*
1962 * Orchestrate the specified type of RCU barrier, waiting for all 1963 * Orchestrate the specified type of RCU barrier, waiting for all
1963 * RCU callbacks of the specified type to complete. 1964 * RCU callbacks of the specified type to complete.
1964 */ 1965 */
1965 static void _rcu_barrier(struct rcu_state *rsp, 1966 static void _rcu_barrier(struct rcu_state *rsp,
1966 void (*call_rcu_func)(struct rcu_head *head, 1967 void (*call_rcu_func)(struct rcu_head *head,
1967 void (*func)(struct rcu_head *head))) 1968 void (*func)(struct rcu_head *head)))
1968 { 1969 {
1969 BUG_ON(in_interrupt()); 1970 BUG_ON(in_interrupt());
1970 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 1971 /* Take mutex to serialize concurrent rcu_barrier() requests. */
1971 mutex_lock(&rcu_barrier_mutex); 1972 mutex_lock(&rcu_barrier_mutex);
1972 init_completion(&rcu_barrier_completion); 1973 init_completion(&rcu_barrier_completion);
1973 /* 1974 /*
1974 * Initialize rcu_barrier_cpu_count to 1, then invoke 1975 * Initialize rcu_barrier_cpu_count to 1, then invoke
1975 * rcu_barrier_func() on each CPU, so that each CPU also has 1976 * rcu_barrier_func() on each CPU, so that each CPU also has
1976 * incremented rcu_barrier_cpu_count. Only then is it safe to 1977 * incremented rcu_barrier_cpu_count. Only then is it safe to
1977 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 1978 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1978 * might complete its grace period before all of the other CPUs 1979 * might complete its grace period before all of the other CPUs
1979 * did their increment, causing this function to return too 1980 * did their increment, causing this function to return too
1980 * early. Note that on_each_cpu() disables irqs, which prevents 1981 * early. Note that on_each_cpu() disables irqs, which prevents
1981 * any CPUs from coming online or going offline until each online 1982 * any CPUs from coming online or going offline until each online
1982 * CPU has queued its RCU-barrier callback. 1983 * CPU has queued its RCU-barrier callback.
1983 */ 1984 */
1984 atomic_set(&rcu_barrier_cpu_count, 1); 1985 atomic_set(&rcu_barrier_cpu_count, 1);
1985 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 1986 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1986 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 1987 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1987 complete(&rcu_barrier_completion); 1988 complete(&rcu_barrier_completion);
1988 wait_for_completion(&rcu_barrier_completion); 1989 wait_for_completion(&rcu_barrier_completion);
1989 mutex_unlock(&rcu_barrier_mutex); 1990 mutex_unlock(&rcu_barrier_mutex);
1990 } 1991 }
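The comment in _rcu_barrier() above describes a reference-count-style handshake: the orchestrating task holds one extra count until every CPU has enqueued its barrier callback, and only then drops it, so the completion cannot fire early. A minimal user-space sketch of that counting pattern, using C11 atomics and pthreads with illustrative names (not the kernel implementation):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define NWORKERS 4

    static atomic_int barrier_count;        /* plays the role of rcu_barrier_cpu_count */
    static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;
    static int done;                        /* plays the role of rcu_barrier_completion */

    /* Drop one reference; the last one to do so signals completion. */
    static void barrier_callback(void)
    {
            if (atomic_fetch_sub(&barrier_count, 1) == 1) {
                    pthread_mutex_lock(&done_lock);
                    done = 1;
                    pthread_cond_signal(&done_cv);
                    pthread_mutex_unlock(&done_lock);
            }
    }

    /* Stands in for the callback that eventually runs on each CPU. */
    static void *worker(void *arg)
    {
            (void)arg;
            barrier_callback();
            return NULL;
    }

    int main(void)
    {
            pthread_t tid[NWORKERS];
            int i;

            /* Hold an initial reference so the count cannot reach zero early. */
            atomic_store(&barrier_count, 1);
            for (i = 0; i < NWORKERS; i++) {
                    /* Synchronous part: take a reference, then queue the work,
                     * mirroring on_each_cpu(rcu_barrier_func, ...). */
                    atomic_fetch_add(&barrier_count, 1);
                    pthread_create(&tid[i], NULL, worker, NULL);
            }
            /* Only now is it safe to drop the initial reference. */
            barrier_callback();
            pthread_mutex_lock(&done_lock);
            while (!done)
                    pthread_cond_wait(&done_cv, &done_lock);
            pthread_mutex_unlock(&done_lock);
            for (i = 0; i < NWORKERS; i++)
                    pthread_join(tid[i], NULL);
            printf("all %d callbacks completed\n", NWORKERS);
            return 0;
    }

The key property, as in the kernel code, is that every increment happens before the initial reference is dropped, so the count can only reach zero after the last callback has run.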
1991 1992
1992 /** 1993 /**
1993 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. 1994 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
1994 */ 1995 */
1995 void rcu_barrier_bh(void) 1996 void rcu_barrier_bh(void)
1996 { 1997 {
1997 _rcu_barrier(&rcu_bh_state, call_rcu_bh); 1998 _rcu_barrier(&rcu_bh_state, call_rcu_bh);
1998 } 1999 }
1999 EXPORT_SYMBOL_GPL(rcu_barrier_bh); 2000 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2000 2001
2001 /** 2002 /**
2002 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. 2003 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
2003 */ 2004 */
2004 void rcu_barrier_sched(void) 2005 void rcu_barrier_sched(void)
2005 { 2006 {
2006 _rcu_barrier(&rcu_sched_state, call_rcu_sched); 2007 _rcu_barrier(&rcu_sched_state, call_rcu_sched);
2007 } 2008 }
2008 EXPORT_SYMBOL_GPL(rcu_barrier_sched); 2009 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
2009 2010
2010 /* 2011 /*
2011 * Do boot-time initialization of a CPU's per-CPU RCU data. 2012 * Do boot-time initialization of a CPU's per-CPU RCU data.
2012 */ 2013 */
2013 static void __init 2014 static void __init
2014 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) 2015 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2015 { 2016 {
2016 unsigned long flags; 2017 unsigned long flags;
2017 int i; 2018 int i;
2018 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2019 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2019 struct rcu_node *rnp = rcu_get_root(rsp); 2020 struct rcu_node *rnp = rcu_get_root(rsp);
2020 2021
2021 /* Set up local state, ensuring consistent view of global state. */ 2022 /* Set up local state, ensuring consistent view of global state. */
2022 raw_spin_lock_irqsave(&rnp->lock, flags); 2023 raw_spin_lock_irqsave(&rnp->lock, flags);
2023 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 2024 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
2024 rdp->nxtlist = NULL; 2025 rdp->nxtlist = NULL;
2025 for (i = 0; i < RCU_NEXT_SIZE; i++) 2026 for (i = 0; i < RCU_NEXT_SIZE; i++)
2026 rdp->nxttail[i] = &rdp->nxtlist; 2027 rdp->nxttail[i] = &rdp->nxtlist;
2027 rdp->qlen = 0; 2028 rdp->qlen = 0;
2028 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2029 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2029 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); 2030 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
2030 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2031 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2031 rdp->cpu = cpu; 2032 rdp->cpu = cpu;
2032 rdp->rsp = rsp; 2033 rdp->rsp = rsp;
2033 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2034 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2034 } 2035 }
2035 2036
2036 /* 2037 /*
2037 * Initialize a CPU's per-CPU RCU data. Note that only one online or 2038 * Initialize a CPU's per-CPU RCU data. Note that only one online or
2038 * offline event can be happening at a given time. Note also that we 2039 * offline event can be happening at a given time. Note also that we
2039 * can accept some slop in the rsp->completed access due to the fact 2040 * can accept some slop in the rsp->completed access due to the fact
2040 * that this CPU cannot possibly have any RCU callbacks in flight yet. 2041 * that this CPU cannot possibly have any RCU callbacks in flight yet.
2041 */ 2042 */
2042 static void __cpuinit 2043 static void __cpuinit
2043 rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) 2044 rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2044 { 2045 {
2045 unsigned long flags; 2046 unsigned long flags;
2046 unsigned long mask; 2047 unsigned long mask;
2047 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2048 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2048 struct rcu_node *rnp = rcu_get_root(rsp); 2049 struct rcu_node *rnp = rcu_get_root(rsp);
2049 2050
2050 /* Set up local state, ensuring consistent view of global state. */ 2051 /* Set up local state, ensuring consistent view of global state. */
2051 raw_spin_lock_irqsave(&rnp->lock, flags); 2052 raw_spin_lock_irqsave(&rnp->lock, flags);
2052 rdp->beenonline = 1; /* We have now been online. */ 2053 rdp->beenonline = 1; /* We have now been online. */
2053 rdp->preemptible = preemptible; 2054 rdp->preemptible = preemptible;
2054 rdp->qlen_last_fqs_check = 0; 2055 rdp->qlen_last_fqs_check = 0;
2055 rdp->n_force_qs_snap = rsp->n_force_qs; 2056 rdp->n_force_qs_snap = rsp->n_force_qs;
2056 rdp->blimit = blimit; 2057 rdp->blimit = blimit;
2057 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; 2058 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING;
2058 atomic_set(&rdp->dynticks->dynticks, 2059 atomic_set(&rdp->dynticks->dynticks,
2059 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 2060 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2061 rcu_prepare_for_idle_init(cpu);
2060 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2062 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2061 2063
2062 /* 2064 /*
2063 * A new grace period might start here. If so, we won't be part 2065 * A new grace period might start here. If so, we won't be part
2064 * of it, but that is OK, as we are currently in a quiescent state. 2066 * of it, but that is OK, as we are currently in a quiescent state.
2065 */ 2067 */
2066 2068
2067 /* Exclude any attempts to start a new GP on large systems. */ 2069 /* Exclude any attempts to start a new GP on large systems. */
2068 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 2070 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
2069 2071
2070 /* Add CPU to rcu_node bitmasks. */ 2072 /* Add CPU to rcu_node bitmasks. */
2071 rnp = rdp->mynode; 2073 rnp = rdp->mynode;
2072 mask = rdp->grpmask; 2074 mask = rdp->grpmask;
2073 do { 2075 do {
2074 /* Exclude any attempts to start a new GP on small systems. */ 2076 /* Exclude any attempts to start a new GP on small systems. */
2075 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2077 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
2076 rnp->qsmaskinit |= mask; 2078 rnp->qsmaskinit |= mask;
2077 mask = rnp->grpmask; 2079 mask = rnp->grpmask;
2078 if (rnp == rdp->mynode) { 2080 if (rnp == rdp->mynode) {
2079 /* 2081 /*
2080 * If there is a grace period in progress, we will 2082 * If there is a grace period in progress, we will
2081 * set up to wait for it next time we run the 2083 * set up to wait for it next time we run the
2082 * RCU core code. 2084 * RCU core code.
2083 */ 2085 */
2084 rdp->gpnum = rnp->completed; 2086 rdp->gpnum = rnp->completed;
2085 rdp->completed = rnp->completed; 2087 rdp->completed = rnp->completed;
2086 rdp->passed_quiesce = 0; 2088 rdp->passed_quiesce = 0;
2087 rdp->qs_pending = 0; 2089 rdp->qs_pending = 0;
2088 rdp->passed_quiesce_gpnum = rnp->gpnum - 1; 2090 rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
2089 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); 2091 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
2090 } 2092 }
2091 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 2093 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
2092 rnp = rnp->parent; 2094 rnp = rnp->parent;
2093 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 2095 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
2094 2096
2095 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2097 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2096 } 2098 }
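The atomic_set() in rcu_init_percpu_data() above forces the incoming CPU's ->dynticks counter to an odd value: an even value marks the CPU as dyntick-idle, and a CPU being brought online is by definition not idle. The expression simply clears the low bit and adds one, landing on the next odd number whatever the starting value. A tiny stand-alone illustration of just that arithmetic (plain C, nothing kernel-specific assumed):

    #include <assert.h>
    #include <stdio.h>

    /* Force a dynticks-style counter to the next odd (non-idle) value,
     * mirroring (atomic_read(&...->dynticks) & ~0x1) + 1 above. */
    static int force_nonidle(int dynticks)
    {
            return (dynticks & ~0x1) + 1;
    }

    int main(void)
    {
            int v;

            for (v = 0; v < 8; v++) {
                    int forced = force_nonidle(v);

                    /* Result is always odd and never moves backwards. */
                    assert(forced & 0x1);
                    assert(forced >= v);
                    printf("%d -> %d\n", v, forced);
            }
            return 0;
    }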
2097 2099
2098 static void __cpuinit rcu_prepare_cpu(int cpu) 2100 static void __cpuinit rcu_prepare_cpu(int cpu)
2099 { 2101 {
2100 rcu_init_percpu_data(cpu, &rcu_sched_state, 0); 2102 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
2101 rcu_init_percpu_data(cpu, &rcu_bh_state, 0); 2103 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
2102 rcu_preempt_init_percpu_data(cpu); 2104 rcu_preempt_init_percpu_data(cpu);
2103 } 2105 }
2104 2106
2105 /* 2107 /*
2106 * Handle CPU online/offline notification events. 2108 * Handle CPU online/offline notification events.
2107 */ 2109 */
2108 static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 2110 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2109 unsigned long action, void *hcpu) 2111 unsigned long action, void *hcpu)
2110 { 2112 {
2111 long cpu = (long)hcpu; 2113 long cpu = (long)hcpu;
2112 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2114 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2113 struct rcu_node *rnp = rdp->mynode; 2115 struct rcu_node *rnp = rdp->mynode;
2114 2116
2115 trace_rcu_utilization("Start CPU hotplug"); 2117 trace_rcu_utilization("Start CPU hotplug");
2116 switch (action) { 2118 switch (action) {
2117 case CPU_UP_PREPARE: 2119 case CPU_UP_PREPARE:
2118 case CPU_UP_PREPARE_FROZEN: 2120 case CPU_UP_PREPARE_FROZEN:
2119 rcu_prepare_cpu(cpu); 2121 rcu_prepare_cpu(cpu);
2120 rcu_prepare_kthreads(cpu); 2122 rcu_prepare_kthreads(cpu);
2121 break; 2123 break;
2122 case CPU_ONLINE: 2124 case CPU_ONLINE:
2123 case CPU_DOWN_FAILED: 2125 case CPU_DOWN_FAILED:
2124 rcu_node_kthread_setaffinity(rnp, -1); 2126 rcu_node_kthread_setaffinity(rnp, -1);
2125 rcu_cpu_kthread_setrt(cpu, 1); 2127 rcu_cpu_kthread_setrt(cpu, 1);
2126 break; 2128 break;
2127 case CPU_DOWN_PREPARE: 2129 case CPU_DOWN_PREPARE:
2128 rcu_node_kthread_setaffinity(rnp, cpu); 2130 rcu_node_kthread_setaffinity(rnp, cpu);
2129 rcu_cpu_kthread_setrt(cpu, 0); 2131 rcu_cpu_kthread_setrt(cpu, 0);
2130 break; 2132 break;
2131 case CPU_DYING: 2133 case CPU_DYING:
2132 case CPU_DYING_FROZEN: 2134 case CPU_DYING_FROZEN:
2133 /* 2135 /*
2134 * The whole machine is "stopped" except this CPU, so we can 2136 * The whole machine is "stopped" except this CPU, so we can
2135 * touch any data without introducing corruption. We send the 2137 * touch any data without introducing corruption. We send the
2136 * dying CPU's callbacks to an arbitrarily chosen online CPU. 2138 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2137 */ 2139 */
2138 rcu_send_cbs_to_online(&rcu_bh_state); 2140 rcu_send_cbs_to_online(&rcu_bh_state);
2139 rcu_send_cbs_to_online(&rcu_sched_state); 2141 rcu_send_cbs_to_online(&rcu_sched_state);
2140 rcu_preempt_send_cbs_to_online(); 2142 rcu_preempt_send_cbs_to_online();
2143 rcu_cleanup_after_idle(cpu);
2141 break; 2144 break;
2142 case CPU_DEAD: 2145 case CPU_DEAD:
2143 case CPU_DEAD_FROZEN: 2146 case CPU_DEAD_FROZEN:
2144 case CPU_UP_CANCELED: 2147 case CPU_UP_CANCELED:
2145 case CPU_UP_CANCELED_FROZEN: 2148 case CPU_UP_CANCELED_FROZEN:
2146 rcu_offline_cpu(cpu); 2149 rcu_offline_cpu(cpu);
2147 break; 2150 break;
2148 default: 2151 default:
2149 break; 2152 break;
2150 } 2153 }
2151 trace_rcu_utilization("End CPU hotplug"); 2154 trace_rcu_utilization("End CPU hotplug");
2152 return NOTIFY_OK; 2155 return NOTIFY_OK;
2153 } 2156 }
2154 2157
2155 /* 2158 /*
2156 * This function is invoked towards the end of the scheduler's initialization 2159 * This function is invoked towards the end of the scheduler's initialization
2157 * process. Before this is called, the idle task might contain 2160 * process. Before this is called, the idle task might contain
2158 * RCU read-side critical sections (during which time, this idle 2161 * RCU read-side critical sections (during which time, this idle
2159 * task is booting the system). After this function is called, the 2162 * task is booting the system). After this function is called, the
2160 * idle tasks are prohibited from containing RCU read-side critical 2163 * idle tasks are prohibited from containing RCU read-side critical
2161 * sections. This function also enables RCU lockdep checking. 2164 * sections. This function also enables RCU lockdep checking.
2162 */ 2165 */
2163 void rcu_scheduler_starting(void) 2166 void rcu_scheduler_starting(void)
2164 { 2167 {
2165 WARN_ON(num_online_cpus() != 1); 2168 WARN_ON(num_online_cpus() != 1);
2166 WARN_ON(nr_context_switches() > 0); 2169 WARN_ON(nr_context_switches() > 0);
2167 rcu_scheduler_active = 1; 2170 rcu_scheduler_active = 1;
2168 } 2171 }
2169 2172
2170 /* 2173 /*
2171 * Compute the per-level fanout, either using the exact fanout specified 2174 * Compute the per-level fanout, either using the exact fanout specified
2172 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 2175 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
2173 */ 2176 */
2174 #ifdef CONFIG_RCU_FANOUT_EXACT 2177 #ifdef CONFIG_RCU_FANOUT_EXACT
2175 static void __init rcu_init_levelspread(struct rcu_state *rsp) 2178 static void __init rcu_init_levelspread(struct rcu_state *rsp)
2176 { 2179 {
2177 int i; 2180 int i;
2178 2181
2179 for (i = NUM_RCU_LVLS - 1; i > 0; i--) 2182 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
2180 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 2183 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
2181 rsp->levelspread[0] = RCU_FANOUT_LEAF; 2184 rsp->levelspread[0] = RCU_FANOUT_LEAF;
2182 } 2185 }
2183 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 2186 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
2184 static void __init rcu_init_levelspread(struct rcu_state *rsp) 2187 static void __init rcu_init_levelspread(struct rcu_state *rsp)
2185 { 2188 {
2186 int ccur; 2189 int ccur;
2187 int cprv; 2190 int cprv;
2188 int i; 2191 int i;
2189 2192
2190 cprv = NR_CPUS; 2193 cprv = NR_CPUS;
2191 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { 2194 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
2192 ccur = rsp->levelcnt[i]; 2195 ccur = rsp->levelcnt[i];
2193 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 2196 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
2194 cprv = ccur; 2197 cprv = ccur;
2195 } 2198 }
2196 } 2199 }
2197 #endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ 2200 #endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
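The non-CONFIG_RCU_FANOUT_EXACT branch above balances the tree by repeated ceiling division: each level's spread is the number of "children" at the level below divided by the number of nodes at this level, rounded up. A stand-alone sketch of the same loop for an assumed two-level, 64-CPU tree (the levelcnt[] values are illustrative, not taken from any particular configuration):

    #include <stdio.h>

    #define NUM_RCU_LVLS 2
    #define NR_CPUS 64

    int main(void)
    {
            /* levelcnt[] for this assumed geometry: one root, four leaves. */
            int levelcnt[NUM_RCU_LVLS] = { 1, 4 };
            int levelspread[NUM_RCU_LVLS];
            int cprv = NR_CPUS;
            int i;

            for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
                    int ccur = levelcnt[i];

                    levelspread[i] = (cprv + ccur - 1) / ccur;  /* ceiling division */
                    cprv = ccur;
            }
            /* Prints levelspread[1] = 16 (CPUs per leaf) and
             * levelspread[0] = 4 (leaves under the root). */
            for (i = 0; i < NUM_RCU_LVLS; i++)
                    printf("levelspread[%d] = %d\n", i, levelspread[i]);
            return 0;
    }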
2198 2201
2199 /* 2202 /*
2200 * Helper function for rcu_init() that initializes one rcu_state structure. 2203 * Helper function for rcu_init() that initializes one rcu_state structure.
2201 */ 2204 */
2202 static void __init rcu_init_one(struct rcu_state *rsp, 2205 static void __init rcu_init_one(struct rcu_state *rsp,
2203 struct rcu_data __percpu *rda) 2206 struct rcu_data __percpu *rda)
2204 { 2207 {
2205 static char *buf[] = { "rcu_node_level_0", 2208 static char *buf[] = { "rcu_node_level_0",
2206 "rcu_node_level_1", 2209 "rcu_node_level_1",
2207 "rcu_node_level_2", 2210 "rcu_node_level_2",
2208 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ 2211 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
2209 int cpustride = 1; 2212 int cpustride = 1;
2210 int i; 2213 int i;
2211 int j; 2214 int j;
2212 struct rcu_node *rnp; 2215 struct rcu_node *rnp;
2213 2216
2214 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ 2217 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
2215 2218
2216 /* Initialize the level-tracking arrays. */ 2219 /* Initialize the level-tracking arrays. */
2217 2220
2218 for (i = 1; i < NUM_RCU_LVLS; i++) 2221 for (i = 1; i < NUM_RCU_LVLS; i++)
2219 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 2222 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
2220 rcu_init_levelspread(rsp); 2223 rcu_init_levelspread(rsp);
2221 2224
2222 /* Initialize the elements themselves, starting from the leaves. */ 2225 /* Initialize the elements themselves, starting from the leaves. */
2223 2226
2224 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { 2227 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
2225 cpustride *= rsp->levelspread[i]; 2228 cpustride *= rsp->levelspread[i];
2226 rnp = rsp->level[i]; 2229 rnp = rsp->level[i];
2227 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 2230 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
2228 raw_spin_lock_init(&rnp->lock); 2231 raw_spin_lock_init(&rnp->lock);
2229 lockdep_set_class_and_name(&rnp->lock, 2232 lockdep_set_class_and_name(&rnp->lock,
2230 &rcu_node_class[i], buf[i]); 2233 &rcu_node_class[i], buf[i]);
2231 rnp->gpnum = 0; 2234 rnp->gpnum = 0;
2232 rnp->qsmask = 0; 2235 rnp->qsmask = 0;
2233 rnp->qsmaskinit = 0; 2236 rnp->qsmaskinit = 0;
2234 rnp->grplo = j * cpustride; 2237 rnp->grplo = j * cpustride;
2235 rnp->grphi = (j + 1) * cpustride - 1; 2238 rnp->grphi = (j + 1) * cpustride - 1;
2236 if (rnp->grphi >= NR_CPUS) 2239 if (rnp->grphi >= NR_CPUS)
2237 rnp->grphi = NR_CPUS - 1; 2240 rnp->grphi = NR_CPUS - 1;
2238 if (i == 0) { 2241 if (i == 0) {
2239 rnp->grpnum = 0; 2242 rnp->grpnum = 0;
2240 rnp->grpmask = 0; 2243 rnp->grpmask = 0;
2241 rnp->parent = NULL; 2244 rnp->parent = NULL;
2242 } else { 2245 } else {
2243 rnp->grpnum = j % rsp->levelspread[i - 1]; 2246 rnp->grpnum = j % rsp->levelspread[i - 1];
2244 rnp->grpmask = 1UL << rnp->grpnum; 2247 rnp->grpmask = 1UL << rnp->grpnum;
2245 rnp->parent = rsp->level[i - 1] + 2248 rnp->parent = rsp->level[i - 1] +
2246 j / rsp->levelspread[i - 1]; 2249 j / rsp->levelspread[i - 1];
2247 } 2250 }
2248 rnp->level = i; 2251 rnp->level = i;
2249 INIT_LIST_HEAD(&rnp->blkd_tasks); 2252 INIT_LIST_HEAD(&rnp->blkd_tasks);
2250 } 2253 }
2251 } 2254 }
2252 2255
2253 rsp->rda = rda; 2256 rsp->rda = rda;
2254 rnp = rsp->level[NUM_RCU_LVLS - 1]; 2257 rnp = rsp->level[NUM_RCU_LVLS - 1];
2255 for_each_possible_cpu(i) { 2258 for_each_possible_cpu(i) {
2256 while (i > rnp->grphi) 2259 while (i > rnp->grphi)
2257 rnp++; 2260 rnp++;
2258 per_cpu_ptr(rsp->rda, i)->mynode = rnp; 2261 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
2259 rcu_boot_init_percpu_data(i, rsp); 2262 rcu_boot_init_percpu_data(i, rsp);
2260 } 2263 }
2261 } 2264 }
2262 2265
2263 void __init rcu_init(void) 2266 void __init rcu_init(void)
2264 { 2267 {
2265 int cpu; 2268 int cpu;
2266 2269
2267 rcu_bootup_announce(); 2270 rcu_bootup_announce();
2268 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2271 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
2269 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2272 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
2270 __rcu_init_preempt(); 2273 __rcu_init_preempt();
2271 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 2274 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
2272 2275
2273 /* 2276 /*
2274 * We don't need protection against CPU-hotplug here because 2277 * We don't need protection against CPU-hotplug here because
2275 * this is called early in boot, before either interrupts 2278 * this is called early in boot, before either interrupts
2276 * or the scheduler are operational. 2279 * or the scheduler are operational.
2277 */ 2280 */
2278 cpu_notifier(rcu_cpu_notify, 0); 2281 cpu_notifier(rcu_cpu_notify, 0);
2279 for_each_online_cpu(cpu) 2282 for_each_online_cpu(cpu)
2280 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 2283 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
2281 check_cpu_stall_init(); 2284 check_cpu_stall_init();
2282 } 2285 }
2283 2286
2284 #include "rcutree_plugin.h" 2287 #include "rcutree_plugin.h"
2285 2288
1 /* 1 /*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions. 3 * Internal non-public definitions.
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or 7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version. 8 * (at your option) any later version.
9 * 9 *
10 * This program is distributed in the hope that it will be useful, 10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details. 13 * GNU General Public License for more details.
14 * 14 *
15 * You should have received a copy of the GNU General Public License 15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software 16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 * 18 *
19 * Copyright IBM Corporation, 2008 19 * Copyright IBM Corporation, 2008
20 * 20 *
21 * Author: Ingo Molnar <mingo@elte.hu> 21 * Author: Ingo Molnar <mingo@elte.hu>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25 #include <linux/cache.h> 25 #include <linux/cache.h>
26 #include <linux/spinlock.h> 26 #include <linux/spinlock.h>
27 #include <linux/threads.h> 27 #include <linux/threads.h>
28 #include <linux/cpumask.h> 28 #include <linux/cpumask.h>
29 #include <linux/seqlock.h> 29 #include <linux/seqlock.h>
30 30
31 /* 31 /*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly. 33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this did work well going from three levels to four. 34 * In practice, this did work well going from three levels to four.
35 * Of course, your mileage may vary. 35 * Of course, your mileage may vary.
36 */ 36 */
37 #define MAX_RCU_LVLS 4 37 #define MAX_RCU_LVLS 4
38 #if CONFIG_RCU_FANOUT > 16 38 #if CONFIG_RCU_FANOUT > 16
39 #define RCU_FANOUT_LEAF 16 39 #define RCU_FANOUT_LEAF 16
40 #else /* #if CONFIG_RCU_FANOUT > 16 */ 40 #else /* #if CONFIG_RCU_FANOUT > 16 */
41 #define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) 41 #define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42 #endif /* #else #if CONFIG_RCU_FANOUT > 16 */ 42 #endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43 #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) 43 #define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44 #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) 44 #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45 #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) 45 #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46 #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) 46 #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
47 47
48 #if NR_CPUS <= RCU_FANOUT_1 48 #if NR_CPUS <= RCU_FANOUT_1
49 # define NUM_RCU_LVLS 1 49 # define NUM_RCU_LVLS 1
50 # define NUM_RCU_LVL_0 1 50 # define NUM_RCU_LVL_0 1
51 # define NUM_RCU_LVL_1 (NR_CPUS) 51 # define NUM_RCU_LVL_1 (NR_CPUS)
52 # define NUM_RCU_LVL_2 0 52 # define NUM_RCU_LVL_2 0
53 # define NUM_RCU_LVL_3 0 53 # define NUM_RCU_LVL_3 0
54 # define NUM_RCU_LVL_4 0 54 # define NUM_RCU_LVL_4 0
55 #elif NR_CPUS <= RCU_FANOUT_2 55 #elif NR_CPUS <= RCU_FANOUT_2
56 # define NUM_RCU_LVLS 2 56 # define NUM_RCU_LVLS 2
57 # define NUM_RCU_LVL_0 1 57 # define NUM_RCU_LVL_0 1
58 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 58 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
59 # define NUM_RCU_LVL_2 (NR_CPUS) 59 # define NUM_RCU_LVL_2 (NR_CPUS)
60 # define NUM_RCU_LVL_3 0 60 # define NUM_RCU_LVL_3 0
61 # define NUM_RCU_LVL_4 0 61 # define NUM_RCU_LVL_4 0
62 #elif NR_CPUS <= RCU_FANOUT_3 62 #elif NR_CPUS <= RCU_FANOUT_3
63 # define NUM_RCU_LVLS 3 63 # define NUM_RCU_LVLS 3
64 # define NUM_RCU_LVL_0 1 64 # define NUM_RCU_LVL_0 1
65 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 65 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
66 # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 66 # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
67 # define NUM_RCU_LVL_3 (NR_CPUS) 67 # define NUM_RCU_LVL_3 (NR_CPUS)
68 # define NUM_RCU_LVL_4 0 68 # define NUM_RCU_LVL_4 0
69 #elif NR_CPUS <= RCU_FANOUT_4 69 #elif NR_CPUS <= RCU_FANOUT_4
70 # define NUM_RCU_LVLS 4 70 # define NUM_RCU_LVLS 4
71 # define NUM_RCU_LVL_0 1 71 # define NUM_RCU_LVL_0 1
72 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) 72 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
73 # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 73 # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
74 # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 74 # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
75 # define NUM_RCU_LVL_4 (NR_CPUS) 75 # define NUM_RCU_LVL_4 (NR_CPUS)
76 #else 76 #else
77 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 77 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
78 #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ 78 #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
79 79
80 #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 80 #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
81 #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 81 #define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
82 82
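As a concrete illustration of the geometry macros above (values are illustrative, not from any particular .config): with NR_CPUS = 64 and CONFIG_RCU_FANOUT = 16, RCU_FANOUT_LEAF = 16, RCU_FANOUT_1 = 16 and RCU_FANOUT_2 = 256, so the NR_CPUS <= RCU_FANOUT_2 branch is selected: NUM_RCU_LVLS = 2, NUM_RCU_LVL_0 = 1, NUM_RCU_LVL_1 = DIV_ROUND_UP(64, 16) = 4, and NUM_RCU_LVL_2 = 64. RCU_SUM is then 1 + 4 + 64 = 69, and NUM_RCU_NODES = 69 - 64 = 5, that is, one root rcu_node with four leaf rcu_node structures beneath it.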
83 /* 83 /*
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86 struct rcu_dynticks { 86 struct rcu_dynticks {
87 long long dynticks_nesting; /* Track irq/process nesting level. */ 87 long long dynticks_nesting; /* Track irq/process nesting level. */
88 /* Process level is worth LLONG_MAX/2. */ 88 /* Process level is worth LLONG_MAX/2. */
89 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */ 90 atomic_t dynticks; /* Even value for idle, else odd. */
91 }; 91 };
92 92
93 /* RCU's kthread states for tracing. */ 93 /* RCU's kthread states for tracing. */
94 #define RCU_KTHREAD_STOPPED 0 94 #define RCU_KTHREAD_STOPPED 0
95 #define RCU_KTHREAD_RUNNING 1 95 #define RCU_KTHREAD_RUNNING 1
96 #define RCU_KTHREAD_WAITING 2 96 #define RCU_KTHREAD_WAITING 2
97 #define RCU_KTHREAD_OFFCPU 3 97 #define RCU_KTHREAD_OFFCPU 3
98 #define RCU_KTHREAD_YIELDING 4 98 #define RCU_KTHREAD_YIELDING 4
99 #define RCU_KTHREAD_MAX 4 99 #define RCU_KTHREAD_MAX 4
100 100
101 /* 101 /*
102 * Definition for node within the RCU grace-period-detection hierarchy. 102 * Definition for node within the RCU grace-period-detection hierarchy.
103 */ 103 */
104 struct rcu_node { 104 struct rcu_node {
105 raw_spinlock_t lock; /* Root rcu_node's lock protects some */ 105 raw_spinlock_t lock; /* Root rcu_node's lock protects some */
106 /* rcu_state fields as well as following. */ 106 /* rcu_state fields as well as following. */
107 unsigned long gpnum; /* Current grace period for this node. */ 107 unsigned long gpnum; /* Current grace period for this node. */
108 /* This will either be equal to or one */ 108 /* This will either be equal to or one */
109 /* behind the root rcu_node's gpnum. */ 109 /* behind the root rcu_node's gpnum. */
110 unsigned long completed; /* Last GP completed for this node. */ 110 unsigned long completed; /* Last GP completed for this node. */
111 /* This will either be equal to or one */ 111 /* This will either be equal to or one */
112 /* behind the root rcu_node's gpnum. */ 112 /* behind the root rcu_node's gpnum. */
113 unsigned long qsmask; /* CPUs or groups that need to switch in */ 113 unsigned long qsmask; /* CPUs or groups that need to switch in */
114 /* order for current grace period to proceed.*/ 114 /* order for current grace period to proceed.*/
115 /* In leaf rcu_node, each bit corresponds to */ 115 /* In leaf rcu_node, each bit corresponds to */
116 /* an rcu_data structure, otherwise, each */ 116 /* an rcu_data structure, otherwise, each */
117 /* bit corresponds to a child rcu_node */ 117 /* bit corresponds to a child rcu_node */
118 /* structure. */ 118 /* structure. */
119 unsigned long expmask; /* Groups that have ->blkd_tasks */ 119 unsigned long expmask; /* Groups that have ->blkd_tasks */
120 /* elements that need to drain to allow the */ 120 /* elements that need to drain to allow the */
121 /* current expedited grace period to */ 121 /* current expedited grace period to */
122 /* complete (only for TREE_PREEMPT_RCU). */ 122 /* complete (only for TREE_PREEMPT_RCU). */
123 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */ 123 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
124 /* Since this has meaning only for leaf */ 124 /* Since this has meaning only for leaf */
125 /* rcu_node structures, 32 bits suffices. */ 125 /* rcu_node structures, 32 bits suffices. */
126 unsigned long qsmaskinit; 126 unsigned long qsmaskinit;
127 /* Per-GP initial value for qsmask & expmask. */ 127 /* Per-GP initial value for qsmask & expmask. */
128 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 128 unsigned long grpmask; /* Mask to apply to parent qsmask. */
129 /* Only one bit will be set in this mask. */ 129 /* Only one bit will be set in this mask. */
130 int grplo; /* lowest-numbered CPU or group here. */ 130 int grplo; /* lowest-numbered CPU or group here. */
131 int grphi; /* highest-numbered CPU or group here. */ 131 int grphi; /* highest-numbered CPU or group here. */
132 u8 grpnum; /* CPU/group number for next level up. */ 132 u8 grpnum; /* CPU/group number for next level up. */
133 u8 level; /* root is at level 0. */ 133 u8 level; /* root is at level 0. */
134 struct rcu_node *parent; 134 struct rcu_node *parent;
135 struct list_head blkd_tasks; 135 struct list_head blkd_tasks;
136 /* Tasks blocked in RCU read-side critical */ 136 /* Tasks blocked in RCU read-side critical */
137 /* section. Tasks are placed at the head */ 137 /* section. Tasks are placed at the head */
138 /* of this list and age towards the tail. */ 138 /* of this list and age towards the tail. */
139 struct list_head *gp_tasks; 139 struct list_head *gp_tasks;
140 /* Pointer to the first task blocking the */ 140 /* Pointer to the first task blocking the */
141 /* current grace period, or NULL if there */ 141 /* current grace period, or NULL if there */
142 /* is no such task. */ 142 /* is no such task. */
143 struct list_head *exp_tasks; 143 struct list_head *exp_tasks;
144 /* Pointer to the first task blocking the */ 144 /* Pointer to the first task blocking the */
145 /* current expedited grace period, or NULL */ 145 /* current expedited grace period, or NULL */
146 /* if there is no such task. If there */ 146 /* if there is no such task. If there */
147 /* is no current expedited grace period, */ 147 /* is no current expedited grace period, */
148 /* then there cannot be any such task. */ 148 /* then there cannot be any such task. */
149 #ifdef CONFIG_RCU_BOOST 149 #ifdef CONFIG_RCU_BOOST
150 struct list_head *boost_tasks; 150 struct list_head *boost_tasks;
151 /* Pointer to first task that needs to be */ 151 /* Pointer to first task that needs to be */
152 /* priority boosted, or NULL if no priority */ 152 /* priority boosted, or NULL if no priority */
153 /* boosting is needed for this rcu_node */ 153 /* boosting is needed for this rcu_node */
154 /* structure. If there are no tasks */ 154 /* structure. If there are no tasks */
155 /* queued on this rcu_node structure that */ 155 /* queued on this rcu_node structure that */
156 /* are blocking the current grace period, */ 156 /* are blocking the current grace period, */
157 /* there can be no such task. */ 157 /* there can be no such task. */
158 unsigned long boost_time; 158 unsigned long boost_time;
159 /* When to start boosting (jiffies). */ 159 /* When to start boosting (jiffies). */
160 struct task_struct *boost_kthread_task; 160 struct task_struct *boost_kthread_task;
161 /* kthread that takes care of priority */ 161 /* kthread that takes care of priority */
162 /* boosting for this rcu_node structure. */ 162 /* boosting for this rcu_node structure. */
163 unsigned int boost_kthread_status; 163 unsigned int boost_kthread_status;
164 /* State of boost_kthread_task for tracing. */ 164 /* State of boost_kthread_task for tracing. */
165 unsigned long n_tasks_boosted; 165 unsigned long n_tasks_boosted;
166 /* Total number of tasks boosted. */ 166 /* Total number of tasks boosted. */
167 unsigned long n_exp_boosts; 167 unsigned long n_exp_boosts;
168 /* Number of tasks boosted for expedited GP. */ 168 /* Number of tasks boosted for expedited GP. */
169 unsigned long n_normal_boosts; 169 unsigned long n_normal_boosts;
170 /* Number of tasks boosted for normal GP. */ 170 /* Number of tasks boosted for normal GP. */
171 unsigned long n_balk_blkd_tasks; 171 unsigned long n_balk_blkd_tasks;
172 /* Refused to boost: no blocked tasks. */ 172 /* Refused to boost: no blocked tasks. */
173 unsigned long n_balk_exp_gp_tasks; 173 unsigned long n_balk_exp_gp_tasks;
174 /* Refused to boost: nothing blocking GP. */ 174 /* Refused to boost: nothing blocking GP. */
175 unsigned long n_balk_boost_tasks; 175 unsigned long n_balk_boost_tasks;
176 /* Refused to boost: already boosting. */ 176 /* Refused to boost: already boosting. */
177 unsigned long n_balk_notblocked; 177 unsigned long n_balk_notblocked;
178 /* Refused to boost: RCU RS CS still running. */ 178 /* Refused to boost: RCU RS CS still running. */
179 unsigned long n_balk_notyet; 179 unsigned long n_balk_notyet;
180 /* Refused to boost: not yet time. */ 180 /* Refused to boost: not yet time. */
181 unsigned long n_balk_nos; 181 unsigned long n_balk_nos;
182 /* Refused to boost: not sure why, though. */ 182 /* Refused to boost: not sure why, though. */
183 /* This can happen due to race conditions. */ 183 /* This can happen due to race conditions. */
184 #endif /* #ifdef CONFIG_RCU_BOOST */ 184 #endif /* #ifdef CONFIG_RCU_BOOST */
185 struct task_struct *node_kthread_task; 185 struct task_struct *node_kthread_task;
186 /* kthread that takes care of this rcu_node */ 186 /* kthread that takes care of this rcu_node */
187 /* structure, for example, awakening the */ 187 /* structure, for example, awakening the */
188 /* per-CPU kthreads as needed. */ 188 /* per-CPU kthreads as needed. */
189 unsigned int node_kthread_status; 189 unsigned int node_kthread_status;
190 /* State of node_kthread_task for tracing. */ 190 /* State of node_kthread_task for tracing. */
191 } ____cacheline_internodealigned_in_smp; 191 } ____cacheline_internodealigned_in_smp;
192 192
193 /* 193 /*
194 * Do a full breadth-first scan of the rcu_node structures for the 194 * Do a full breadth-first scan of the rcu_node structures for the
195 * specified rcu_state structure. 195 * specified rcu_state structure.
196 */ 196 */
197 #define rcu_for_each_node_breadth_first(rsp, rnp) \ 197 #define rcu_for_each_node_breadth_first(rsp, rnp) \
198 for ((rnp) = &(rsp)->node[0]; \ 198 for ((rnp) = &(rsp)->node[0]; \
199 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 199 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
200 200
201 /* 201 /*
202 * Do a breadth-first scan of the non-leaf rcu_node structures for the 202 * Do a breadth-first scan of the non-leaf rcu_node structures for the
203 * specified rcu_state structure. Note that if there is a singleton 203 * specified rcu_state structure. Note that if there is a singleton
204 * rcu_node tree with but one rcu_node structure, this loop is a no-op. 204 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
205 */ 205 */
206 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ 206 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
207 for ((rnp) = &(rsp)->node[0]; \ 207 for ((rnp) = &(rsp)->node[0]; \
208 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) 208 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
209 209
210 /* 210 /*
211 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state 211 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
212 * structure. Note that if there is a singleton rcu_node tree with but 212 * structure. Note that if there is a singleton rcu_node tree with but
213 * one rcu_node structure, this loop -will- visit the rcu_node structure. 213 * one rcu_node structure, this loop -will- visit the rcu_node structure.
214 * It is still a leaf node, even if it is also the root node. 214 * It is still a leaf node, even if it is also the root node.
215 */ 215 */
216 #define rcu_for_each_leaf_node(rsp, rnp) \ 216 #define rcu_for_each_leaf_node(rsp, rnp) \
217 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ 217 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
218 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 218 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
219 219
220 /* Index values for nxttail array in struct rcu_data. */ 220 /* Index values for nxttail array in struct rcu_data. */
221 #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 221 #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
222 #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ 222 #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
223 #define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ 223 #define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
224 #define RCU_NEXT_TAIL 3 224 #define RCU_NEXT_TAIL 3
225 #define RCU_NEXT_SIZE 4 225 #define RCU_NEXT_SIZE 4
226 226
227 /* Per-CPU data for read-copy update. */ 227 /* Per-CPU data for read-copy update. */
228 struct rcu_data { 228 struct rcu_data {
229 /* 1) quiescent-state and grace-period handling : */ 229 /* 1) quiescent-state and grace-period handling : */
230 unsigned long completed; /* Track rsp->completed gp number */ 230 unsigned long completed; /* Track rsp->completed gp number */
231 /* in order to detect GP end. */ 231 /* in order to detect GP end. */
232 unsigned long gpnum; /* Highest gp number that this CPU */ 232 unsigned long gpnum; /* Highest gp number that this CPU */
233 /* is aware of having started. */ 233 /* is aware of having started. */
234 unsigned long passed_quiesce_gpnum; 234 unsigned long passed_quiesce_gpnum;
235 /* gpnum at time of quiescent state. */ 235 /* gpnum at time of quiescent state. */
236 bool passed_quiesce; /* User-mode/idle loop etc. */ 236 bool passed_quiesce; /* User-mode/idle loop etc. */
237 bool qs_pending; /* Core waits for quiesc state. */ 237 bool qs_pending; /* Core waits for quiesc state. */
238 bool beenonline; /* CPU online at least once. */ 238 bool beenonline; /* CPU online at least once. */
239 bool preemptible; /* Preemptible RCU? */ 239 bool preemptible; /* Preemptible RCU? */
240 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 240 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
241 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 241 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
242 242
243 /* 2) batch handling */ 243 /* 2) batch handling */
244 /* 244 /*
245 * If nxtlist is not NULL, it is partitioned as follows. 245 * If nxtlist is not NULL, it is partitioned as follows.
246 * Any of the partitions might be empty, in which case the 246 * Any of the partitions might be empty, in which case the
247 * pointer to that partition will be equal to the pointer for 247 * pointer to that partition will be equal to the pointer for
248 * the following partition. When the list is empty, all of 248 * the following partition. When the list is empty, all of
249 * the nxttail elements point to the ->nxtlist pointer itself, 249 * the nxttail elements point to the ->nxtlist pointer itself,
250 * which in that case is NULL. 250 * which in that case is NULL.
251 * 251 *
252 * [nxtlist, *nxttail[RCU_DONE_TAIL]): 252 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
253 * Entries that batch # <= ->completed 253 * Entries that batch # <= ->completed
254 * The grace period for these entries has completed, and 254 * The grace period for these entries has completed, and
255 * the other grace-period-completed entries may be moved 255 * the other grace-period-completed entries may be moved
256 * here temporarily in rcu_process_callbacks(). 256 * here temporarily in rcu_process_callbacks().
257 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): 257 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
258 * Entries that batch # <= ->completed - 1: waiting for current GP 258 * Entries that batch # <= ->completed - 1: waiting for current GP
259 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): 259 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
260 * Entries known to have arrived before current GP ended 260 * Entries known to have arrived before current GP ended
261 * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): 261 * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
262 * Entries that might have arrived after current GP ended 262 * Entries that might have arrived after current GP ended
263 * Note that the value of *nxttail[RCU_NEXT_TAIL] will 263 * Note that the value of *nxttail[RCU_NEXT_TAIL] will
264 * always be NULL, as this is the end of the list. 264 * always be NULL, as this is the end of the list.
265 */ 265 */
266 struct rcu_head *nxtlist; 266 struct rcu_head *nxtlist;
267 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 267 struct rcu_head **nxttail[RCU_NEXT_SIZE];
268 long qlen; /* # of queued callbacks */ 268 long qlen; /* # of queued callbacks */
269 long qlen_last_fqs_check; 269 long qlen_last_fqs_check;
270 /* qlen at last check for QS forcing */ 270 /* qlen at last check for QS forcing */
271 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 271 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
272 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ 272 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
273 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ 273 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
274 unsigned long n_force_qs_snap; 274 unsigned long n_force_qs_snap;
275 /* did other CPU force QS recently? */ 275 /* did other CPU force QS recently? */
276 long blimit; /* Upper limit on a processed batch */ 276 long blimit; /* Upper limit on a processed batch */
277 277
278 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
280 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
281 281
282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
284 unsigned long offline_fqs; /* Kicked due to being offline. */ 284 unsigned long offline_fqs; /* Kicked due to being offline. */
285 unsigned long resched_ipi; /* Sent a resched IPI. */ 285 unsigned long resched_ipi; /* Sent a resched IPI. */
286 286
287 /* 5) __rcu_pending() statistics. */ 287 /* 5) __rcu_pending() statistics. */
288 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 288 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
289 unsigned long n_rp_qs_pending; 289 unsigned long n_rp_qs_pending;
290 unsigned long n_rp_report_qs; 290 unsigned long n_rp_report_qs;
291 unsigned long n_rp_cb_ready; 291 unsigned long n_rp_cb_ready;
292 unsigned long n_rp_cpu_needs_gp; 292 unsigned long n_rp_cpu_needs_gp;
293 unsigned long n_rp_gp_completed; 293 unsigned long n_rp_gp_completed;
294 unsigned long n_rp_gp_started; 294 unsigned long n_rp_gp_started;
295 unsigned long n_rp_need_fqs; 295 unsigned long n_rp_need_fqs;
296 unsigned long n_rp_need_nothing; 296 unsigned long n_rp_need_nothing;
297 297
298 int cpu; 298 int cpu;
299 struct rcu_state *rsp; 299 struct rcu_state *rsp;
300 }; 300 };
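The ->nxtlist/->nxttail partitioning documented inside this structure is a single linked list carved into four segments by tail pointers, each of which points at the ->next field (or at ->nxtlist itself) that ends its segment. A minimal user-space sketch of that invariant, with illustrative names (not the kernel's callback-handling code):

    #include <stddef.h>
    #include <stdio.h>

    #define RCU_DONE_TAIL           0
    #define RCU_WAIT_TAIL           1
    #define RCU_NEXT_READY_TAIL     2
    #define RCU_NEXT_TAIL           3
    #define RCU_NEXT_SIZE           4

    struct cb {
            struct cb *next;
            int id;
    };

    struct cblist {
            struct cb *head;                        /* analogous to ->nxtlist */
            struct cb **tail[RCU_NEXT_SIZE];        /* analogous to ->nxttail[] */
    };

    /* Empty list: every tail points at the head pointer itself, which is NULL. */
    static void cblist_init(struct cblist *l)
    {
            int i;

            l->head = NULL;
            for (i = 0; i < RCU_NEXT_SIZE; i++)
                    l->tail[i] = &l->head;
    }

    /* New callbacks always go in at the RCU_NEXT end of the list. */
    static void cblist_enqueue(struct cblist *l, struct cb *c)
    {
            c->next = NULL;
            *l->tail[RCU_NEXT_TAIL] = c;
            l->tail[RCU_NEXT_TAIL] = &c->next;
    }

    int main(void)
    {
            struct cblist l;
            struct cb a = { .id = 1 }, b = { .id = 2 };

            cblist_init(&l);
            cblist_enqueue(&l, &a);
            cblist_enqueue(&l, &b);
            /* Both callbacks now sit in the RCU_NEXT segment; grace-period
             * progress would advance them toward RCU_DONE by sliding the
             * tail pointers, never by copying the callbacks themselves. */
            printf("head id %d, RCU_NEXT segment is %s\n", l.head->id,
                   l.tail[RCU_NEXT_READY_TAIL] == l.tail[RCU_NEXT_TAIL] ?
                   "empty" : "non-empty");
            return 0;
    }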
301 301
302 /* Values for fqs_state field in struct rcu_state. */ 302 /* Values for fqs_state field in struct rcu_state. */
303 #define RCU_GP_IDLE 0 /* No grace period in progress. */ 303 #define RCU_GP_IDLE 0 /* No grace period in progress. */
304 #define RCU_GP_INIT 1 /* Grace period being initialized. */ 304 #define RCU_GP_INIT 1 /* Grace period being initialized. */
305 #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 305 #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
306 #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 306 #define RCU_FORCE_QS 3 /* Need to force quiescent state. */
307 #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 307 #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
308 308
309 #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 309 #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
310 310
311 #ifdef CONFIG_PROVE_RCU 311 #ifdef CONFIG_PROVE_RCU
312 #define RCU_STALL_DELAY_DELTA (5 * HZ) 312 #define RCU_STALL_DELAY_DELTA (5 * HZ)
313 #else 313 #else
314 #define RCU_STALL_DELAY_DELTA 0 314 #define RCU_STALL_DELAY_DELTA 0
315 #endif 315 #endif
316 316
317 #define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ 317 #define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
318 RCU_STALL_DELAY_DELTA) 318 RCU_STALL_DELAY_DELTA)
319 /* for rsp->jiffies_stall */ 319 /* for rsp->jiffies_stall */
320 #define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) 320 #define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
321 /* for rsp->jiffies_stall */ 321 /* for rsp->jiffies_stall */
322 #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 322 #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
323 /* to take at least one */ 323 /* to take at least one */
324 /* scheduling clock irq */ 324 /* scheduling clock irq */
325 /* before ratting on them. */ 325 /* before ratting on them. */
326 326
327 #define rcu_wait(cond) \ 327 #define rcu_wait(cond) \
328 do { \ 328 do { \
329 for (;;) { \ 329 for (;;) { \
330 set_current_state(TASK_INTERRUPTIBLE); \ 330 set_current_state(TASK_INTERRUPTIBLE); \
331 if (cond) \ 331 if (cond) \
332 break; \ 332 break; \
333 schedule(); \ 333 schedule(); \
334 } \ 334 } \
335 __set_current_state(TASK_RUNNING); \ 335 __set_current_state(TASK_RUNNING); \
336 } while (0) 336 } while (0)
337 337
338 /* 338 /*
339 * RCU global state, including node hierarchy. This hierarchy is 339 * RCU global state, including node hierarchy. This hierarchy is
340 * represented in "heap" form in a dense array. The root (first level) 340 * represented in "heap" form in a dense array. The root (first level)
341 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second 341 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
342 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]), 342 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
343 * and the third level in ->node[m+1] and following (->node[m+1] referenced 343 * and the third level in ->node[m+1] and following (->node[m+1] referenced
344 * by ->level[2]). The number of levels is determined by the number of 344 * by ->level[2]). The number of levels is determined by the number of
345 * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy" 345 * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
346 * consisting of a single rcu_node. 346 * consisting of a single rcu_node.
347 */ 347 */
348 struct rcu_state { 348 struct rcu_state {
349 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ 349 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
350 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ 350 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
351 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 351 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
352 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ 352 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
353 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ 353 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
354 354
355 /* The following fields are guarded by the root rcu_node's lock. */ 355 /* The following fields are guarded by the root rcu_node's lock. */
356 356
357 u8 fqs_state ____cacheline_internodealigned_in_smp; 357 u8 fqs_state ____cacheline_internodealigned_in_smp;
358 /* Force QS state. */ 358 /* Force QS state. */
359 u8 fqs_active; /* force_quiescent_state() */ 359 u8 fqs_active; /* force_quiescent_state() */
360 /* is running. */ 360 /* is running. */
361 u8 fqs_need_gp; /* A CPU was prevented from */ 361 u8 fqs_need_gp; /* A CPU was prevented from */
362 /* starting a new grace */ 362 /* starting a new grace */
363 /* period because */ 363 /* period because */
364 /* force_quiescent_state() */ 364 /* force_quiescent_state() */
365 /* was running. */ 365 /* was running. */
366 u8 boost; /* Subject to priority boost. */ 366 u8 boost; /* Subject to priority boost. */
367 unsigned long gpnum; /* Current gp number. */ 367 unsigned long gpnum; /* Current gp number. */
368 unsigned long completed; /* # of last completed gp. */ 368 unsigned long completed; /* # of last completed gp. */
369 369
370 /* End of fields guarded by root rcu_node's lock. */ 370 /* End of fields guarded by root rcu_node's lock. */
371 371
372 raw_spinlock_t onofflock; /* exclude on/offline and */ 372 raw_spinlock_t onofflock; /* exclude on/offline and */
373 /* starting new GP. */ 373 /* starting new GP. */
374 raw_spinlock_t fqslock; /* Only one task forcing */ 374 raw_spinlock_t fqslock; /* Only one task forcing */
375 /* quiescent states. */ 375 /* quiescent states. */
376 unsigned long jiffies_force_qs; /* Time at which to invoke */ 376 unsigned long jiffies_force_qs; /* Time at which to invoke */
377 /* force_quiescent_state(). */ 377 /* force_quiescent_state(). */
378 unsigned long n_force_qs; /* Number of calls to */ 378 unsigned long n_force_qs; /* Number of calls to */
379 /* force_quiescent_state(). */ 379 /* force_quiescent_state(). */
380 unsigned long n_force_qs_lh; /* ~Number of calls leaving */ 380 unsigned long n_force_qs_lh; /* ~Number of calls leaving */
381 /* due to lock unavailable. */ 381 /* due to lock unavailable. */
382 unsigned long n_force_qs_ngp; /* Number of calls leaving */ 382 unsigned long n_force_qs_ngp; /* Number of calls leaving */
383 /* due to no GP active. */ 383 /* due to no GP active. */
384 unsigned long gp_start; /* Time at which GP started, */ 384 unsigned long gp_start; /* Time at which GP started, */
385 /* but in jiffies. */ 385 /* but in jiffies. */
386 unsigned long jiffies_stall; /* Time at which to check */ 386 unsigned long jiffies_stall; /* Time at which to check */
387 /* for CPU stalls. */ 387 /* for CPU stalls. */
388 unsigned long gp_max; /* Maximum GP duration in */ 388 unsigned long gp_max; /* Maximum GP duration in */
389 /* jiffies. */ 389 /* jiffies. */
390 char *name; /* Name of structure. */ 390 char *name; /* Name of structure. */
391 }; 391 };
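Continuing the illustrative 64-CPU, fanout-16 geometry from earlier: the dense ->node[] array described in the comment above then holds five elements, with ->node[0] as the root (referenced by ->level[0]) and ->node[1] through ->node[4] as the four leaves (referenced by ->level[1]). rcu_for_each_node_breadth_first() walks all five, while rcu_for_each_leaf_node() starts at ->level[NUM_RCU_LVLS - 1], i.e. ->node[1], and visits only the leaves.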
392 392
393 /* Return values for rcu_preempt_offline_tasks(). */ 393 /* Return values for rcu_preempt_offline_tasks(). */
394 394
395 #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ 395 #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
396 /* GP were moved to root. */ 396 /* GP were moved to root. */
397 #define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ 397 #define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
398 /* GP were moved to root. */ 398 /* GP were moved to root. */
399 399
400 /* 400 /*
401 * RCU implementation internal declarations: 401 * RCU implementation internal declarations:
402 */ 402 */
403 extern struct rcu_state rcu_sched_state; 403 extern struct rcu_state rcu_sched_state;
404 DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); 404 DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
405 405
406 extern struct rcu_state rcu_bh_state; 406 extern struct rcu_state rcu_bh_state;
407 DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); 407 DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
408 408
409 #ifdef CONFIG_TREE_PREEMPT_RCU 409 #ifdef CONFIG_TREE_PREEMPT_RCU
410 extern struct rcu_state rcu_preempt_state; 410 extern struct rcu_state rcu_preempt_state;
411 DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 411 DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
412 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 412 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
413 413
414 #ifdef CONFIG_RCU_BOOST 414 #ifdef CONFIG_RCU_BOOST
415 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 415 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
416 DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); 416 DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
417 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); 417 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
418 DECLARE_PER_CPU(char, rcu_cpu_has_work); 418 DECLARE_PER_CPU(char, rcu_cpu_has_work);
419 #endif /* #ifdef CONFIG_RCU_BOOST */ 419 #endif /* #ifdef CONFIG_RCU_BOOST */
420 420
421 #ifndef RCU_TREE_NONCORE 421 #ifndef RCU_TREE_NONCORE
422 422
423 /* Forward declarations for rcutree_plugin.h */ 423 /* Forward declarations for rcutree_plugin.h */
424 static void rcu_bootup_announce(void); 424 static void rcu_bootup_announce(void);
425 long rcu_batches_completed(void); 425 long rcu_batches_completed(void);
426 static void rcu_preempt_note_context_switch(int cpu); 426 static void rcu_preempt_note_context_switch(int cpu);
427 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 427 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
428 #ifdef CONFIG_HOTPLUG_CPU 428 #ifdef CONFIG_HOTPLUG_CPU
429 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 429 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
430 unsigned long flags); 430 unsigned long flags);
431 static void rcu_stop_cpu_kthread(int cpu); 431 static void rcu_stop_cpu_kthread(int cpu);
432 #endif /* #ifdef CONFIG_HOTPLUG_CPU */ 432 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
433 static void rcu_print_detail_task_stall(struct rcu_state *rsp); 433 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
434 static int rcu_print_task_stall(struct rcu_node *rnp); 434 static int rcu_print_task_stall(struct rcu_node *rnp);
435 static void rcu_preempt_stall_reset(void); 435 static void rcu_preempt_stall_reset(void);
436 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 436 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
437 #ifdef CONFIG_HOTPLUG_CPU 437 #ifdef CONFIG_HOTPLUG_CPU
438 static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 438 static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
439 struct rcu_node *rnp, 439 struct rcu_node *rnp,
440 struct rcu_data *rdp); 440 struct rcu_data *rdp);
441 static void rcu_preempt_offline_cpu(int cpu); 441 static void rcu_preempt_offline_cpu(int cpu);
442 #endif /* #ifdef CONFIG_HOTPLUG_CPU */ 442 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
443 static void rcu_preempt_check_callbacks(int cpu); 443 static void rcu_preempt_check_callbacks(int cpu);
444 static void rcu_preempt_process_callbacks(void); 444 static void rcu_preempt_process_callbacks(void);
445 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 445 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
446 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 446 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
447 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 447 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
448 bool wake); 448 bool wake);
449 #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 449 #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
450 static int rcu_preempt_pending(int cpu); 450 static int rcu_preempt_pending(int cpu);
451 static int rcu_preempt_needs_cpu(int cpu); 451 static int rcu_preempt_needs_cpu(int cpu);
452 static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 452 static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
453 static void rcu_preempt_send_cbs_to_online(void); 453 static void rcu_preempt_send_cbs_to_online(void);
454 static void __init __rcu_init_preempt(void); 454 static void __init __rcu_init_preempt(void);
455 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 455 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
456 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 456 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
457 static void invoke_rcu_callbacks_kthread(void); 457 static void invoke_rcu_callbacks_kthread(void);
458 #ifdef CONFIG_RCU_BOOST 458 #ifdef CONFIG_RCU_BOOST
459 static void rcu_preempt_do_callbacks(void); 459 static void rcu_preempt_do_callbacks(void);
460 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, 460 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
461 cpumask_var_t cm); 461 cpumask_var_t cm);
462 static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 462 static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
463 struct rcu_node *rnp, 463 struct rcu_node *rnp,
464 int rnp_index); 464 int rnp_index);
465 static void invoke_rcu_node_kthread(struct rcu_node *rnp); 465 static void invoke_rcu_node_kthread(struct rcu_node *rnp);
466 static void rcu_yield(void (*f)(unsigned long), unsigned long arg); 466 static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
467 #endif /* #ifdef CONFIG_RCU_BOOST */ 467 #endif /* #ifdef CONFIG_RCU_BOOST */
468 static void rcu_cpu_kthread_setrt(int cpu, int to_rt); 468 static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
469 static void __cpuinit rcu_prepare_kthreads(int cpu); 469 static void __cpuinit rcu_prepare_kthreads(int cpu);
470 static void rcu_prepare_for_idle_init(int cpu);
471 static void rcu_cleanup_after_idle(int cpu);
470 static void rcu_prepare_for_idle(int cpu); 472 static void rcu_prepare_for_idle(int cpu);
471 473
472 #endif /* #ifndef RCU_TREE_NONCORE */ 474 #endif /* #ifndef RCU_TREE_NONCORE */
473 475
kernel/rcutree_plugin.h
1 /* 1 /*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptible semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or 8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version. 9 * (at your option) any later version.
10 * 10 *
11 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 * 19 *
20 * Copyright Red Hat, 2009 20 * Copyright Red Hat, 2009
21 * Copyright IBM Corporation, 2009 21 * Copyright IBM Corporation, 2009
22 * 22 *
23 * Author: Ingo Molnar <mingo@elte.hu> 23 * Author: Ingo Molnar <mingo@elte.hu>
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> 24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */ 25 */
26 26
27 #include <linux/delay.h> 27 #include <linux/delay.h>
28 #include <linux/stop_machine.h> 28 #include <linux/stop_machine.h>
29 29
30 #define RCU_KTHREAD_PRIO 1 30 #define RCU_KTHREAD_PRIO 1
31 31
32 #ifdef CONFIG_RCU_BOOST 32 #ifdef CONFIG_RCU_BOOST
33 #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO 33 #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
34 #else 34 #else
35 #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO 35 #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
36 #endif 36 #endif
37 37
38 /* 38 /*
39 * Check the RCU kernel configuration parameters and print informative 39 * Check the RCU kernel configuration parameters and print informative
40 * messages about anything out of the ordinary. If you like #ifdef, you 40 * messages about anything out of the ordinary. If you like #ifdef, you
41 * will love this function. 41 * will love this function.
42 */ 42 */
43 static void __init rcu_bootup_announce_oddness(void) 43 static void __init rcu_bootup_announce_oddness(void)
44 { 44 {
45 #ifdef CONFIG_RCU_TRACE 45 #ifdef CONFIG_RCU_TRACE
46 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); 46 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
47 #endif 47 #endif
48 #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) 48 #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
49 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", 49 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
50 CONFIG_RCU_FANOUT); 50 CONFIG_RCU_FANOUT);
51 #endif 51 #endif
52 #ifdef CONFIG_RCU_FANOUT_EXACT 52 #ifdef CONFIG_RCU_FANOUT_EXACT
53 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); 53 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
54 #endif 54 #endif
55 #ifdef CONFIG_RCU_FAST_NO_HZ 55 #ifdef CONFIG_RCU_FAST_NO_HZ
56 printk(KERN_INFO 56 printk(KERN_INFO
57 "\tRCU dyntick-idle grace-period acceleration is enabled.\n"); 57 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
58 #endif 58 #endif
59 #ifdef CONFIG_PROVE_RCU 59 #ifdef CONFIG_PROVE_RCU
60 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); 60 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
61 #endif 61 #endif
62 #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 62 #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
63 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 63 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
64 #endif 64 #endif
65 #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 65 #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
66 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 66 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
67 #endif 67 #endif
68 #if NUM_RCU_LVL_4 != 0 68 #if NUM_RCU_LVL_4 != 0
69 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); 69 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
70 #endif 70 #endif
71 } 71 }
72 72
73 #ifdef CONFIG_TREE_PREEMPT_RCU 73 #ifdef CONFIG_TREE_PREEMPT_RCU
74 74
75 struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); 75 struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt);
76 DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 76 DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
77 static struct rcu_state *rcu_state = &rcu_preempt_state; 77 static struct rcu_state *rcu_state = &rcu_preempt_state;
78 78
79 static void rcu_read_unlock_special(struct task_struct *t); 79 static void rcu_read_unlock_special(struct task_struct *t);
80 static int rcu_preempted_readers_exp(struct rcu_node *rnp); 80 static int rcu_preempted_readers_exp(struct rcu_node *rnp);
81 81
82 /* 82 /*
83 * Tell them what RCU they are running. 83 * Tell them what RCU they are running.
84 */ 84 */
85 static void __init rcu_bootup_announce(void) 85 static void __init rcu_bootup_announce(void)
86 { 86 {
87 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); 87 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
88 rcu_bootup_announce_oddness(); 88 rcu_bootup_announce_oddness();
89 } 89 }
90 90
91 /* 91 /*
92 * Return the number of RCU-preempt batches processed thus far 92 * Return the number of RCU-preempt batches processed thus far
93 * for debug and statistics. 93 * for debug and statistics.
94 */ 94 */
95 long rcu_batches_completed_preempt(void) 95 long rcu_batches_completed_preempt(void)
96 { 96 {
97 return rcu_preempt_state.completed; 97 return rcu_preempt_state.completed;
98 } 98 }
99 EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); 99 EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
100 100
101 /* 101 /*
102 * Return the number of RCU batches processed thus far for debug & stats. 102 * Return the number of RCU batches processed thus far for debug & stats.
103 */ 103 */
104 long rcu_batches_completed(void) 104 long rcu_batches_completed(void)
105 { 105 {
106 return rcu_batches_completed_preempt(); 106 return rcu_batches_completed_preempt();
107 } 107 }
108 EXPORT_SYMBOL_GPL(rcu_batches_completed); 108 EXPORT_SYMBOL_GPL(rcu_batches_completed);
109 109
110 /* 110 /*
111 * Force a quiescent state for preemptible RCU. 111 * Force a quiescent state for preemptible RCU.
112 */ 112 */
113 void rcu_force_quiescent_state(void) 113 void rcu_force_quiescent_state(void)
114 { 114 {
115 force_quiescent_state(&rcu_preempt_state, 0); 115 force_quiescent_state(&rcu_preempt_state, 0);
116 } 116 }
117 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 117 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
118 118
119 /* 119 /*
120 * Record a preemptible-RCU quiescent state for the specified CPU. Note 120 * Record a preemptible-RCU quiescent state for the specified CPU. Note
121 * that this just means that the task currently running on the CPU is 121 * that this just means that the task currently running on the CPU is
122 * not in an RCU read-side critical section. There might be any number 122 * not in an RCU read-side critical section. There might be any number
123 * of tasks blocked while in an RCU read-side critical section. 123 * of tasks blocked while in an RCU read-side critical section.
124 * 124 *
125 * Unlike the other rcu_*_qs() functions, callers to this function 125 * Unlike the other rcu_*_qs() functions, callers to this function
126 * must disable irqs in order to protect the assignment to 126 * must disable irqs in order to protect the assignment to
127 * ->rcu_read_unlock_special. 127 * ->rcu_read_unlock_special.
128 */ 128 */
129 static void rcu_preempt_qs(int cpu) 129 static void rcu_preempt_qs(int cpu)
130 { 130 {
131 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 131 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
132 132
133 rdp->passed_quiesce_gpnum = rdp->gpnum; 133 rdp->passed_quiesce_gpnum = rdp->gpnum;
134 barrier(); 134 barrier();
135 if (rdp->passed_quiesce == 0) 135 if (rdp->passed_quiesce == 0)
136 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); 136 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
137 rdp->passed_quiesce = 1; 137 rdp->passed_quiesce = 1;
138 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 138 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
139 } 139 }
140 140
141 /* 141 /*
142 * We have entered the scheduler, and the current task might soon be 142 * We have entered the scheduler, and the current task might soon be
143 * context-switched away from. If this task is in an RCU read-side 143 * context-switched away from. If this task is in an RCU read-side
144 * critical section, we will no longer be able to rely on the CPU to 144 * critical section, we will no longer be able to rely on the CPU to
145 * record that fact, so we enqueue the task on the blkd_tasks list. 145 * record that fact, so we enqueue the task on the blkd_tasks list.
146 * The task will dequeue itself when it exits the outermost enclosing 146 * The task will dequeue itself when it exits the outermost enclosing
147 * RCU read-side critical section. Therefore, the current grace period 147 * RCU read-side critical section. Therefore, the current grace period
148 * cannot be permitted to complete until the blkd_tasks list entries 148 * cannot be permitted to complete until the blkd_tasks list entries
149 * predating the current grace period drain, in other words, until 149 * predating the current grace period drain, in other words, until
150 * rnp->gp_tasks becomes NULL. 150 * rnp->gp_tasks becomes NULL.
151 * 151 *
152 * Caller must disable preemption. 152 * Caller must disable preemption.
153 */ 153 */
154 static void rcu_preempt_note_context_switch(int cpu) 154 static void rcu_preempt_note_context_switch(int cpu)
155 { 155 {
156 struct task_struct *t = current; 156 struct task_struct *t = current;
157 unsigned long flags; 157 unsigned long flags;
158 struct rcu_data *rdp; 158 struct rcu_data *rdp;
159 struct rcu_node *rnp; 159 struct rcu_node *rnp;
160 160
161 if (t->rcu_read_lock_nesting > 0 && 161 if (t->rcu_read_lock_nesting > 0 &&
162 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 162 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
163 163
164 /* Possibly blocking in an RCU read-side critical section. */ 164 /* Possibly blocking in an RCU read-side critical section. */
165 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 165 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
166 rnp = rdp->mynode; 166 rnp = rdp->mynode;
167 raw_spin_lock_irqsave(&rnp->lock, flags); 167 raw_spin_lock_irqsave(&rnp->lock, flags);
168 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 168 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
169 t->rcu_blocked_node = rnp; 169 t->rcu_blocked_node = rnp;
170 170
171 /* 171 /*
172 * If this CPU has already checked in, then this task 172 * If this CPU has already checked in, then this task
173 * will hold up the next grace period rather than the 173 * will hold up the next grace period rather than the
174 * current grace period. Queue the task accordingly. 174 * current grace period. Queue the task accordingly.
175 * If the task is queued for the current grace period 175 * If the task is queued for the current grace period
176 * (i.e., this CPU has not yet passed through a quiescent 176 * (i.e., this CPU has not yet passed through a quiescent
177 * state for the current grace period), then as long 177 * state for the current grace period), then as long
178 * as that task remains queued, the current grace period 178 * as that task remains queued, the current grace period
179 * cannot end. Note that there is some uncertainty as 179 * cannot end. Note that there is some uncertainty as
180 * to exactly when the current grace period started. 180 * to exactly when the current grace period started.
181 * We take a conservative approach, which can result 181 * We take a conservative approach, which can result
182 * in unnecessarily waiting on tasks that started very 182 * in unnecessarily waiting on tasks that started very
183 * slightly after the current grace period began. C'est 183 * slightly after the current grace period began. C'est
184 * la vie!!! 184 * la vie!!!
185 * 185 *
186 * But first, note that the current CPU must still be 186 * But first, note that the current CPU must still be
187 * on line! 187 * on line!
188 */ 188 */
189 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 189 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
190 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 190 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
191 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { 191 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
192 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); 192 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
193 rnp->gp_tasks = &t->rcu_node_entry; 193 rnp->gp_tasks = &t->rcu_node_entry;
194 #ifdef CONFIG_RCU_BOOST 194 #ifdef CONFIG_RCU_BOOST
195 if (rnp->boost_tasks != NULL) 195 if (rnp->boost_tasks != NULL)
196 rnp->boost_tasks = rnp->gp_tasks; 196 rnp->boost_tasks = rnp->gp_tasks;
197 #endif /* #ifdef CONFIG_RCU_BOOST */ 197 #endif /* #ifdef CONFIG_RCU_BOOST */
198 } else { 198 } else {
199 list_add(&t->rcu_node_entry, &rnp->blkd_tasks); 199 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
200 if (rnp->qsmask & rdp->grpmask) 200 if (rnp->qsmask & rdp->grpmask)
201 rnp->gp_tasks = &t->rcu_node_entry; 201 rnp->gp_tasks = &t->rcu_node_entry;
202 } 202 }
203 trace_rcu_preempt_task(rdp->rsp->name, 203 trace_rcu_preempt_task(rdp->rsp->name,
204 t->pid, 204 t->pid,
205 (rnp->qsmask & rdp->grpmask) 205 (rnp->qsmask & rdp->grpmask)
206 ? rnp->gpnum 206 ? rnp->gpnum
207 : rnp->gpnum + 1); 207 : rnp->gpnum + 1);
208 raw_spin_unlock_irqrestore(&rnp->lock, flags); 208 raw_spin_unlock_irqrestore(&rnp->lock, flags);
209 } else if (t->rcu_read_lock_nesting < 0 && 209 } else if (t->rcu_read_lock_nesting < 0 &&
210 t->rcu_read_unlock_special) { 210 t->rcu_read_unlock_special) {
211 211
212 /* 212 /*
213 * Complete exit from RCU read-side critical section on 213 * Complete exit from RCU read-side critical section on
214 * behalf of preempted instance of __rcu_read_unlock(). 214 * behalf of preempted instance of __rcu_read_unlock().
215 */ 215 */
216 rcu_read_unlock_special(t); 216 rcu_read_unlock_special(t);
217 } 217 }
218 218
219 /* 219 /*
220 * Either we were not in an RCU read-side critical section to 220 * Either we were not in an RCU read-side critical section to
221 * begin with, or we have now recorded that critical section 221 * begin with, or we have now recorded that critical section
222 * globally. Either way, we can now note a quiescent state 222 * globally. Either way, we can now note a quiescent state
223 * for this CPU. Again, if we were in an RCU read-side critical 223 * for this CPU. Again, if we were in an RCU read-side critical
224 * section, and if that critical section was blocking the current 224 * section, and if that critical section was blocking the current
225 * grace period, then the fact that the task has been enqueued 225 * grace period, then the fact that the task has been enqueued
226 * means that we continue to block the current grace period. 226 * means that we continue to block the current grace period.
227 */ 227 */
228 local_irq_save(flags); 228 local_irq_save(flags);
229 rcu_preempt_qs(cpu); 229 rcu_preempt_qs(cpu);
230 local_irq_restore(flags); 230 local_irq_restore(flags);
231 } 231 }
232 232
233 /* 233 /*
234 * Tree-preemptible RCU implementation for rcu_read_lock(). 234 * Tree-preemptible RCU implementation for rcu_read_lock().
235 * Just increment ->rcu_read_lock_nesting; shared state will be updated 235 * Just increment ->rcu_read_lock_nesting; shared state will be updated
236 * if we block. 236 * if we block.
237 */ 237 */
238 void __rcu_read_lock(void) 238 void __rcu_read_lock(void)
239 { 239 {
240 current->rcu_read_lock_nesting++; 240 current->rcu_read_lock_nesting++;
241 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ 241 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
242 } 242 }
243 EXPORT_SYMBOL_GPL(__rcu_read_lock); 243 EXPORT_SYMBOL_GPL(__rcu_read_lock);
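For illustration (not part of this patch), a minimal sketch of the nesting behavior that __rcu_read_lock() and the matching __rcu_read_unlock() below implement; only the outermost rcu_read_unlock() triggers the special-case handling:

	rcu_read_lock();	/* __rcu_read_lock(): ->rcu_read_lock_nesting 0 -> 1 */
	rcu_read_lock();	/* nested read-side critical section: 1 -> 2 */
	/* ... accesses via rcu_dereference() remain safe here ... */
	rcu_read_unlock();	/* 2 -> 1, no special processing */
	rcu_read_unlock();	/* outermost: ->rcu_read_unlock_special is checked */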
244 244
245 /* 245 /*
246 * Check for preempted RCU readers blocking the current grace period 246 * Check for preempted RCU readers blocking the current grace period
247 * for the specified rcu_node structure. If the caller needs a reliable 247 * for the specified rcu_node structure. If the caller needs a reliable
248 * answer, it must hold the rcu_node's ->lock. 248 * answer, it must hold the rcu_node's ->lock.
249 */ 249 */
250 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) 250 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
251 { 251 {
252 return rnp->gp_tasks != NULL; 252 return rnp->gp_tasks != NULL;
253 } 253 }
254 254
255 /* 255 /*
256 * Record a quiescent state for all tasks that were previously queued 256 * Record a quiescent state for all tasks that were previously queued
257 * on the specified rcu_node structure and that were blocking the current 257 * on the specified rcu_node structure and that were blocking the current
258 * RCU grace period. The caller must hold the specified rnp->lock with 258 * RCU grace period. The caller must hold the specified rnp->lock with
259 * irqs disabled, and this lock is released upon return, but irqs remain 259 * irqs disabled, and this lock is released upon return, but irqs remain
260 * disabled. 260 * disabled.
261 */ 261 */
262 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 262 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
263 __releases(rnp->lock) 263 __releases(rnp->lock)
264 { 264 {
265 unsigned long mask; 265 unsigned long mask;
266 struct rcu_node *rnp_p; 266 struct rcu_node *rnp_p;
267 267
268 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 268 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
269 raw_spin_unlock_irqrestore(&rnp->lock, flags); 269 raw_spin_unlock_irqrestore(&rnp->lock, flags);
270 return; /* Still need more quiescent states! */ 270 return; /* Still need more quiescent states! */
271 } 271 }
272 272
273 rnp_p = rnp->parent; 273 rnp_p = rnp->parent;
274 if (rnp_p == NULL) { 274 if (rnp_p == NULL) {
275 /* 275 /*
276 * Either there is only one rcu_node in the tree, 276 * Either there is only one rcu_node in the tree,
277 * or tasks were kicked up to root rcu_node due to 277 * or tasks were kicked up to root rcu_node due to
278 * CPUs going offline. 278 * CPUs going offline.
279 */ 279 */
280 rcu_report_qs_rsp(&rcu_preempt_state, flags); 280 rcu_report_qs_rsp(&rcu_preempt_state, flags);
281 return; 281 return;
282 } 282 }
283 283
284 /* Report up the rest of the hierarchy. */ 284 /* Report up the rest of the hierarchy. */
285 mask = rnp->grpmask; 285 mask = rnp->grpmask;
286 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 286 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
287 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ 287 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
288 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 288 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
289 } 289 }
290 290
291 /* 291 /*
292 * Advance a ->blkd_tasks-list pointer to the next entry, returning 292 * Advance a ->blkd_tasks-list pointer to the next entry, returning
293 * NULL instead if at the end of the list. 293 * NULL instead if at the end of the list.
294 */ 294 */
295 static struct list_head *rcu_next_node_entry(struct task_struct *t, 295 static struct list_head *rcu_next_node_entry(struct task_struct *t,
296 struct rcu_node *rnp) 296 struct rcu_node *rnp)
297 { 297 {
298 struct list_head *np; 298 struct list_head *np;
299 299
300 np = t->rcu_node_entry.next; 300 np = t->rcu_node_entry.next;
301 if (np == &rnp->blkd_tasks) 301 if (np == &rnp->blkd_tasks)
302 np = NULL; 302 np = NULL;
303 return np; 303 return np;
304 } 304 }
305 305
306 /* 306 /*
307 * Handle special cases during rcu_read_unlock(), such as needing to 307 * Handle special cases during rcu_read_unlock(), such as needing to
308 * notify RCU core processing or to clean up after a task that blocked 308 * notify RCU core processing or to clean up after a task that blocked
309 * during the RCU read-side critical section. 309 * during the RCU read-side critical section.
310 */ 310 */
311 static noinline void rcu_read_unlock_special(struct task_struct *t) 311 static noinline void rcu_read_unlock_special(struct task_struct *t)
312 { 312 {
313 int empty; 313 int empty;
314 int empty_exp; 314 int empty_exp;
315 int empty_exp_now; 315 int empty_exp_now;
316 unsigned long flags; 316 unsigned long flags;
317 struct list_head *np; 317 struct list_head *np;
318 #ifdef CONFIG_RCU_BOOST 318 #ifdef CONFIG_RCU_BOOST
319 struct rt_mutex *rbmp = NULL; 319 struct rt_mutex *rbmp = NULL;
320 #endif /* #ifdef CONFIG_RCU_BOOST */ 320 #endif /* #ifdef CONFIG_RCU_BOOST */
321 struct rcu_node *rnp; 321 struct rcu_node *rnp;
322 int special; 322 int special;
323 323
324 /* NMI handlers cannot block and cannot safely manipulate state. */ 324 /* NMI handlers cannot block and cannot safely manipulate state. */
325 if (in_nmi()) 325 if (in_nmi())
326 return; 326 return;
327 327
328 local_irq_save(flags); 328 local_irq_save(flags);
329 329
330 /* 330 /*
331 * If RCU core is waiting for this CPU to exit critical section, 331 * If RCU core is waiting for this CPU to exit critical section,
332 * let it know that we have done so. 332 * let it know that we have done so.
333 */ 333 */
334 special = t->rcu_read_unlock_special; 334 special = t->rcu_read_unlock_special;
335 if (special & RCU_READ_UNLOCK_NEED_QS) { 335 if (special & RCU_READ_UNLOCK_NEED_QS) {
336 rcu_preempt_qs(smp_processor_id()); 336 rcu_preempt_qs(smp_processor_id());
337 } 337 }
338 338
339 /* Hardware IRQ handlers cannot block. */ 339 /* Hardware IRQ handlers cannot block. */
340 if (in_irq() || in_serving_softirq()) { 340 if (in_irq() || in_serving_softirq()) {
341 local_irq_restore(flags); 341 local_irq_restore(flags);
342 return; 342 return;
343 } 343 }
344 344
345 /* Clean up if blocked during RCU read-side critical section. */ 345 /* Clean up if blocked during RCU read-side critical section. */
346 if (special & RCU_READ_UNLOCK_BLOCKED) { 346 if (special & RCU_READ_UNLOCK_BLOCKED) {
347 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; 347 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
348 348
349 /* 349 /*
350 * Remove this task from the list it blocked on. The 350 * Remove this task from the list it blocked on. The
351 * task can migrate while we acquire the lock, but at 351 * task can migrate while we acquire the lock, but at
352 * most one time. So at most two passes through loop. 352 * most one time. So at most two passes through loop.
353 */ 353 */
354 for (;;) { 354 for (;;) {
355 rnp = t->rcu_blocked_node; 355 rnp = t->rcu_blocked_node;
356 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 356 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
357 if (rnp == t->rcu_blocked_node) 357 if (rnp == t->rcu_blocked_node)
358 break; 358 break;
359 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 359 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
360 } 360 }
361 empty = !rcu_preempt_blocked_readers_cgp(rnp); 361 empty = !rcu_preempt_blocked_readers_cgp(rnp);
362 empty_exp = !rcu_preempted_readers_exp(rnp); 362 empty_exp = !rcu_preempted_readers_exp(rnp);
363 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 363 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
364 np = rcu_next_node_entry(t, rnp); 364 np = rcu_next_node_entry(t, rnp);
365 list_del_init(&t->rcu_node_entry); 365 list_del_init(&t->rcu_node_entry);
366 t->rcu_blocked_node = NULL; 366 t->rcu_blocked_node = NULL;
367 trace_rcu_unlock_preempted_task("rcu_preempt", 367 trace_rcu_unlock_preempted_task("rcu_preempt",
368 rnp->gpnum, t->pid); 368 rnp->gpnum, t->pid);
369 if (&t->rcu_node_entry == rnp->gp_tasks) 369 if (&t->rcu_node_entry == rnp->gp_tasks)
370 rnp->gp_tasks = np; 370 rnp->gp_tasks = np;
371 if (&t->rcu_node_entry == rnp->exp_tasks) 371 if (&t->rcu_node_entry == rnp->exp_tasks)
372 rnp->exp_tasks = np; 372 rnp->exp_tasks = np;
373 #ifdef CONFIG_RCU_BOOST 373 #ifdef CONFIG_RCU_BOOST
374 if (&t->rcu_node_entry == rnp->boost_tasks) 374 if (&t->rcu_node_entry == rnp->boost_tasks)
375 rnp->boost_tasks = np; 375 rnp->boost_tasks = np;
376 /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ 376 /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
377 if (t->rcu_boost_mutex) { 377 if (t->rcu_boost_mutex) {
378 rbmp = t->rcu_boost_mutex; 378 rbmp = t->rcu_boost_mutex;
379 t->rcu_boost_mutex = NULL; 379 t->rcu_boost_mutex = NULL;
380 } 380 }
381 #endif /* #ifdef CONFIG_RCU_BOOST */ 381 #endif /* #ifdef CONFIG_RCU_BOOST */
382 382
383 /* 383 /*
384 * If this was the last task on the current list, and if 384 * If this was the last task on the current list, and if
385 * we aren't waiting on any CPUs, report the quiescent state. 385 * we aren't waiting on any CPUs, report the quiescent state.
386 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 386 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
387 * so we must take a snapshot of the expedited state. 387 * so we must take a snapshot of the expedited state.
388 */ 388 */
389 empty_exp_now = !rcu_preempted_readers_exp(rnp); 389 empty_exp_now = !rcu_preempted_readers_exp(rnp);
390 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 390 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
391 trace_rcu_quiescent_state_report("preempt_rcu", 391 trace_rcu_quiescent_state_report("preempt_rcu",
392 rnp->gpnum, 392 rnp->gpnum,
393 0, rnp->qsmask, 393 0, rnp->qsmask,
394 rnp->level, 394 rnp->level,
395 rnp->grplo, 395 rnp->grplo,
396 rnp->grphi, 396 rnp->grphi,
397 !!rnp->gp_tasks); 397 !!rnp->gp_tasks);
398 rcu_report_unblock_qs_rnp(rnp, flags); 398 rcu_report_unblock_qs_rnp(rnp, flags);
399 } else 399 } else
400 raw_spin_unlock_irqrestore(&rnp->lock, flags); 400 raw_spin_unlock_irqrestore(&rnp->lock, flags);
401 401
402 #ifdef CONFIG_RCU_BOOST 402 #ifdef CONFIG_RCU_BOOST
403 /* Unboost if we were boosted. */ 403 /* Unboost if we were boosted. */
404 if (rbmp) 404 if (rbmp)
405 rt_mutex_unlock(rbmp); 405 rt_mutex_unlock(rbmp);
406 #endif /* #ifdef CONFIG_RCU_BOOST */ 406 #endif /* #ifdef CONFIG_RCU_BOOST */
407 407
408 /* 408 /*
409 * If this was the last task on the expedited lists, 409 * If this was the last task on the expedited lists,
410 * then we need to report up the rcu_node hierarchy. 410 * then we need to report up the rcu_node hierarchy.
411 */ 411 */
412 if (!empty_exp && empty_exp_now) 412 if (!empty_exp && empty_exp_now)
413 rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); 413 rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
414 } else { 414 } else {
415 local_irq_restore(flags); 415 local_irq_restore(flags);
416 } 416 }
417 } 417 }
418 418
419 /* 419 /*
420 * Tree-preemptible RCU implementation for rcu_read_unlock(). 420 * Tree-preemptible RCU implementation for rcu_read_unlock().
421 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost 421 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
422 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then 422 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
423 * invoke rcu_read_unlock_special() to clean up after a context switch 423 * invoke rcu_read_unlock_special() to clean up after a context switch
424 * in an RCU read-side critical section and other special cases. 424 * in an RCU read-side critical section and other special cases.
425 */ 425 */
426 void __rcu_read_unlock(void) 426 void __rcu_read_unlock(void)
427 { 427 {
428 struct task_struct *t = current; 428 struct task_struct *t = current;
429 429
430 if (t->rcu_read_lock_nesting != 1) 430 if (t->rcu_read_lock_nesting != 1)
431 --t->rcu_read_lock_nesting; 431 --t->rcu_read_lock_nesting;
432 else { 432 else {
433 barrier(); /* critical section before exit code. */ 433 barrier(); /* critical section before exit code. */
434 t->rcu_read_lock_nesting = INT_MIN; 434 t->rcu_read_lock_nesting = INT_MIN;
435 barrier(); /* assign before ->rcu_read_unlock_special load */ 435 barrier(); /* assign before ->rcu_read_unlock_special load */
436 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 436 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
437 rcu_read_unlock_special(t); 437 rcu_read_unlock_special(t);
438 barrier(); /* ->rcu_read_unlock_special load before assign */ 438 barrier(); /* ->rcu_read_unlock_special load before assign */
439 t->rcu_read_lock_nesting = 0; 439 t->rcu_read_lock_nesting = 0;
440 } 440 }
441 #ifdef CONFIG_PROVE_LOCKING 441 #ifdef CONFIG_PROVE_LOCKING
442 { 442 {
443 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); 443 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
444 444
445 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); 445 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
446 } 446 }
447 #endif /* #ifdef CONFIG_PROVE_LOCKING */ 447 #endif /* #ifdef CONFIG_PROVE_LOCKING */
448 } 448 }
449 EXPORT_SYMBOL_GPL(__rcu_read_unlock); 449 EXPORT_SYMBOL_GPL(__rcu_read_unlock);
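For illustration (not part of this patch), a typical read-side usage sketch built on the two primitives above; struct foo, gp, and foo_read_a() are hypothetical names used only for this example:

#include <linux/rcupdate.h>

struct foo {
	int a;
};
static struct foo __rcu *gp;		/* hypothetical RCU-protected pointer */

static int foo_read_a(void)
{
	struct foo *p;
	int a = -1;

	rcu_read_lock();		/* ends up in __rcu_read_lock() */
	p = rcu_dereference(gp);	/* fetch the protected pointer */
	if (p)
		a = p->a;		/* p cannot be freed until we unlock */
	rcu_read_unlock();		/* outermost unlock: __rcu_read_unlock()
					 * consults ->rcu_read_unlock_special */
	return a;
}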
450 450
451 #ifdef CONFIG_RCU_CPU_STALL_VERBOSE 451 #ifdef CONFIG_RCU_CPU_STALL_VERBOSE
452 452
453 /* 453 /*
454 * Dump detailed information for all tasks blocking the current RCU 454 * Dump detailed information for all tasks blocking the current RCU
455 * grace period on the specified rcu_node structure. 455 * grace period on the specified rcu_node structure.
456 */ 456 */
457 static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) 457 static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
458 { 458 {
459 unsigned long flags; 459 unsigned long flags;
460 struct task_struct *t; 460 struct task_struct *t;
461 461
462 if (!rcu_preempt_blocked_readers_cgp(rnp)) 462 if (!rcu_preempt_blocked_readers_cgp(rnp))
463 return; 463 return;
464 raw_spin_lock_irqsave(&rnp->lock, flags); 464 raw_spin_lock_irqsave(&rnp->lock, flags);
465 t = list_entry(rnp->gp_tasks, 465 t = list_entry(rnp->gp_tasks,
466 struct task_struct, rcu_node_entry); 466 struct task_struct, rcu_node_entry);
467 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 467 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
468 sched_show_task(t); 468 sched_show_task(t);
469 raw_spin_unlock_irqrestore(&rnp->lock, flags); 469 raw_spin_unlock_irqrestore(&rnp->lock, flags);
470 } 470 }
471 471
472 /* 472 /*
473 * Dump detailed information for all tasks blocking the current RCU 473 * Dump detailed information for all tasks blocking the current RCU
474 * grace period. 474 * grace period.
475 */ 475 */
476 static void rcu_print_detail_task_stall(struct rcu_state *rsp) 476 static void rcu_print_detail_task_stall(struct rcu_state *rsp)
477 { 477 {
478 struct rcu_node *rnp = rcu_get_root(rsp); 478 struct rcu_node *rnp = rcu_get_root(rsp);
479 479
480 rcu_print_detail_task_stall_rnp(rnp); 480 rcu_print_detail_task_stall_rnp(rnp);
481 rcu_for_each_leaf_node(rsp, rnp) 481 rcu_for_each_leaf_node(rsp, rnp)
482 rcu_print_detail_task_stall_rnp(rnp); 482 rcu_print_detail_task_stall_rnp(rnp);
483 } 483 }
484 484
485 #else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ 485 #else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
486 486
487 static void rcu_print_detail_task_stall(struct rcu_state *rsp) 487 static void rcu_print_detail_task_stall(struct rcu_state *rsp)
488 { 488 {
489 } 489 }
490 490
491 #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ 491 #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
492 492
493 /* 493 /*
494 * Scan the current list of tasks blocked within RCU read-side critical 494 * Scan the current list of tasks blocked within RCU read-side critical
495 * sections, printing out the tid of each. 495 * sections, printing out the tid of each.
496 */ 496 */
497 static int rcu_print_task_stall(struct rcu_node *rnp) 497 static int rcu_print_task_stall(struct rcu_node *rnp)
498 { 498 {
499 struct task_struct *t; 499 struct task_struct *t;
500 int ndetected = 0; 500 int ndetected = 0;
501 501
502 if (!rcu_preempt_blocked_readers_cgp(rnp)) 502 if (!rcu_preempt_blocked_readers_cgp(rnp))
503 return 0; 503 return 0;
504 t = list_entry(rnp->gp_tasks, 504 t = list_entry(rnp->gp_tasks,
505 struct task_struct, rcu_node_entry); 505 struct task_struct, rcu_node_entry);
506 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 506 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
507 printk(" P%d", t->pid); 507 printk(" P%d", t->pid);
508 ndetected++; 508 ndetected++;
509 } 509 }
510 return ndetected; 510 return ndetected;
511 } 511 }
512 512
513 /* 513 /*
514 * Suppress preemptible RCU's CPU stall warnings by pushing the 514 * Suppress preemptible RCU's CPU stall warnings by pushing the
515 * time of the next stall-warning message comfortably far into the 515 * time of the next stall-warning message comfortably far into the
516 * future. 516 * future.
517 */ 517 */
518 static void rcu_preempt_stall_reset(void) 518 static void rcu_preempt_stall_reset(void)
519 { 519 {
520 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; 520 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
521 } 521 }
522 522
523 /* 523 /*
524 * Check that the list of blocked tasks for the newly completed grace 524 * Check that the list of blocked tasks for the newly completed grace
525 * period is in fact empty. It is a serious bug to complete a grace 525 * period is in fact empty. It is a serious bug to complete a grace
526 * period that still has RCU readers blocked! This function must be 526 * period that still has RCU readers blocked! This function must be
527 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock 527 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
528 * must be held by the caller. 528 * must be held by the caller.
529 * 529 *
530 * Also, if there are blocked tasks on the list, they automatically 530 * Also, if there are blocked tasks on the list, they automatically
531 * block the newly created grace period, so set up ->gp_tasks accordingly. 531 * block the newly created grace period, so set up ->gp_tasks accordingly.
532 */ 532 */
533 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 533 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
534 { 534 {
535 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); 535 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
536 if (!list_empty(&rnp->blkd_tasks)) 536 if (!list_empty(&rnp->blkd_tasks))
537 rnp->gp_tasks = rnp->blkd_tasks.next; 537 rnp->gp_tasks = rnp->blkd_tasks.next;
538 WARN_ON_ONCE(rnp->qsmask); 538 WARN_ON_ONCE(rnp->qsmask);
539 } 539 }
540 540
541 #ifdef CONFIG_HOTPLUG_CPU 541 #ifdef CONFIG_HOTPLUG_CPU
542 542
543 /* 543 /*
544 * Handle tasklist migration for case in which all CPUs covered by the 544 * Handle tasklist migration for case in which all CPUs covered by the
545 * specified rcu_node have gone offline. Move them up to the root 545 * specified rcu_node have gone offline. Move them up to the root
546 * rcu_node. The reason for not just moving them to the immediate 546 * rcu_node. The reason for not just moving them to the immediate
547 * parent is to remove the need for rcu_read_unlock_special() to 547 * parent is to remove the need for rcu_read_unlock_special() to
548 * make more than two attempts to acquire the target rcu_node's lock. 548 * make more than two attempts to acquire the target rcu_node's lock.
549 * Returns non-zero if there were tasks blocking the current normal 549 * Returns non-zero if there were tasks blocking the current normal
550 * and/or expedited RCU grace period on the specified rcu_node 550 * and/or expedited RCU grace period on the specified rcu_node
551 * structure: the return value has RCU_OFL_TASKS_NORM_GP and/or 551 * structure: the return value has RCU_OFL_TASKS_NORM_GP and/or
552 * RCU_OFL_TASKS_EXP_GP set accordingly, and is zero if no tasks 552 * RCU_OFL_TASKS_EXP_GP set accordingly, and is zero if no tasks
553 * were blocking either grace period. 553 * were blocking either grace period.
554 * 554 *
555 * The caller must hold rnp->lock with irqs disabled. 555 * The caller must hold rnp->lock with irqs disabled.
556 */ 556 */
557 static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 557 static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
558 struct rcu_node *rnp, 558 struct rcu_node *rnp,
559 struct rcu_data *rdp) 559 struct rcu_data *rdp)
560 { 560 {
561 struct list_head *lp; 561 struct list_head *lp;
562 struct list_head *lp_root; 562 struct list_head *lp_root;
563 int retval = 0; 563 int retval = 0;
564 struct rcu_node *rnp_root = rcu_get_root(rsp); 564 struct rcu_node *rnp_root = rcu_get_root(rsp);
565 struct task_struct *t; 565 struct task_struct *t;
566 566
567 if (rnp == rnp_root) { 567 if (rnp == rnp_root) {
568 WARN_ONCE(1, "Last CPU thought to be offlined?"); 568 WARN_ONCE(1, "Last CPU thought to be offlined?");
569 return 0; /* Shouldn't happen: at least one CPU online. */ 569 return 0; /* Shouldn't happen: at least one CPU online. */
570 } 570 }
571 571
572 /* If we are on an internal node, complain bitterly. */ 572 /* If we are on an internal node, complain bitterly. */
573 WARN_ON_ONCE(rnp != rdp->mynode); 573 WARN_ON_ONCE(rnp != rdp->mynode);
574 574
575 /* 575 /*
576 * Move tasks up to root rcu_node. Don't try to get fancy for 576 * Move tasks up to root rcu_node. Don't try to get fancy for
577 * this corner-case operation -- just put this node's tasks 577 * this corner-case operation -- just put this node's tasks
578 * at the head of the root node's list, and update the root node's 578 * at the head of the root node's list, and update the root node's
579 * ->gp_tasks and ->exp_tasks pointers to those of this node's, 579 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
580 * if non-NULL. This might result in waiting for more tasks than 580 * if non-NULL. This might result in waiting for more tasks than
581 * absolutely necessary, but this is a good performance/complexity 581 * absolutely necessary, but this is a good performance/complexity
582 * tradeoff. 582 * tradeoff.
583 */ 583 */
584 if (rcu_preempt_blocked_readers_cgp(rnp)) 584 if (rcu_preempt_blocked_readers_cgp(rnp))
585 retval |= RCU_OFL_TASKS_NORM_GP; 585 retval |= RCU_OFL_TASKS_NORM_GP;
586 if (rcu_preempted_readers_exp(rnp)) 586 if (rcu_preempted_readers_exp(rnp))
587 retval |= RCU_OFL_TASKS_EXP_GP; 587 retval |= RCU_OFL_TASKS_EXP_GP;
588 lp = &rnp->blkd_tasks; 588 lp = &rnp->blkd_tasks;
589 lp_root = &rnp_root->blkd_tasks; 589 lp_root = &rnp_root->blkd_tasks;
590 while (!list_empty(lp)) { 590 while (!list_empty(lp)) {
591 t = list_entry(lp->next, typeof(*t), rcu_node_entry); 591 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
592 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 592 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
593 list_del(&t->rcu_node_entry); 593 list_del(&t->rcu_node_entry);
594 t->rcu_blocked_node = rnp_root; 594 t->rcu_blocked_node = rnp_root;
595 list_add(&t->rcu_node_entry, lp_root); 595 list_add(&t->rcu_node_entry, lp_root);
596 if (&t->rcu_node_entry == rnp->gp_tasks) 596 if (&t->rcu_node_entry == rnp->gp_tasks)
597 rnp_root->gp_tasks = rnp->gp_tasks; 597 rnp_root->gp_tasks = rnp->gp_tasks;
598 if (&t->rcu_node_entry == rnp->exp_tasks) 598 if (&t->rcu_node_entry == rnp->exp_tasks)
599 rnp_root->exp_tasks = rnp->exp_tasks; 599 rnp_root->exp_tasks = rnp->exp_tasks;
600 #ifdef CONFIG_RCU_BOOST 600 #ifdef CONFIG_RCU_BOOST
601 if (&t->rcu_node_entry == rnp->boost_tasks) 601 if (&t->rcu_node_entry == rnp->boost_tasks)
602 rnp_root->boost_tasks = rnp->boost_tasks; 602 rnp_root->boost_tasks = rnp->boost_tasks;
603 #endif /* #ifdef CONFIG_RCU_BOOST */ 603 #endif /* #ifdef CONFIG_RCU_BOOST */
604 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 604 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
605 } 605 }
606 606
607 #ifdef CONFIG_RCU_BOOST 607 #ifdef CONFIG_RCU_BOOST
608 /* In case root is being boosted and leaf is not. */ 608 /* In case root is being boosted and leaf is not. */
609 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 609 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
610 if (rnp_root->boost_tasks != NULL && 610 if (rnp_root->boost_tasks != NULL &&
611 rnp_root->boost_tasks != rnp_root->gp_tasks) 611 rnp_root->boost_tasks != rnp_root->gp_tasks)
612 rnp_root->boost_tasks = rnp_root->gp_tasks; 612 rnp_root->boost_tasks = rnp_root->gp_tasks;
613 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 613 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
614 #endif /* #ifdef CONFIG_RCU_BOOST */ 614 #endif /* #ifdef CONFIG_RCU_BOOST */
615 615
616 rnp->gp_tasks = NULL; 616 rnp->gp_tasks = NULL;
617 rnp->exp_tasks = NULL; 617 rnp->exp_tasks = NULL;
618 return retval; 618 return retval;
619 } 619 }
620 620
621 /* 621 /*
622 * Do CPU-offline processing for preemptible RCU. 622 * Do CPU-offline processing for preemptible RCU.
623 */ 623 */
624 static void rcu_preempt_offline_cpu(int cpu) 624 static void rcu_preempt_offline_cpu(int cpu)
625 { 625 {
626 __rcu_offline_cpu(cpu, &rcu_preempt_state); 626 __rcu_offline_cpu(cpu, &rcu_preempt_state);
627 } 627 }
628 628
629 #endif /* #ifdef CONFIG_HOTPLUG_CPU */ 629 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
630 630
631 /* 631 /*
632 * Check for a quiescent state from the current CPU. When a task blocks, 632 * Check for a quiescent state from the current CPU. When a task blocks,
633 * the task is recorded in the corresponding CPU's rcu_node structure, 633 * the task is recorded in the corresponding CPU's rcu_node structure,
634 * which is checked elsewhere. 634 * which is checked elsewhere.
635 * 635 *
636 * Caller must disable hard irqs. 636 * Caller must disable hard irqs.
637 */ 637 */
638 static void rcu_preempt_check_callbacks(int cpu) 638 static void rcu_preempt_check_callbacks(int cpu)
639 { 639 {
640 struct task_struct *t = current; 640 struct task_struct *t = current;
641 641
642 if (t->rcu_read_lock_nesting == 0) { 642 if (t->rcu_read_lock_nesting == 0) {
643 rcu_preempt_qs(cpu); 643 rcu_preempt_qs(cpu);
644 return; 644 return;
645 } 645 }
646 if (t->rcu_read_lock_nesting > 0 && 646 if (t->rcu_read_lock_nesting > 0 &&
647 per_cpu(rcu_preempt_data, cpu).qs_pending) 647 per_cpu(rcu_preempt_data, cpu).qs_pending)
648 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 648 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
649 } 649 }
650 650
651 /* 651 /*
652 * Process callbacks for preemptible RCU. 652 * Process callbacks for preemptible RCU.
653 */ 653 */
654 static void rcu_preempt_process_callbacks(void) 654 static void rcu_preempt_process_callbacks(void)
655 { 655 {
656 __rcu_process_callbacks(&rcu_preempt_state, 656 __rcu_process_callbacks(&rcu_preempt_state,
657 &__get_cpu_var(rcu_preempt_data)); 657 &__get_cpu_var(rcu_preempt_data));
658 } 658 }
659 659
660 #ifdef CONFIG_RCU_BOOST 660 #ifdef CONFIG_RCU_BOOST
661 661
662 static void rcu_preempt_do_callbacks(void) 662 static void rcu_preempt_do_callbacks(void)
663 { 663 {
664 rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); 664 rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
665 } 665 }
666 666
667 #endif /* #ifdef CONFIG_RCU_BOOST */ 667 #endif /* #ifdef CONFIG_RCU_BOOST */
668 668
669 /* 669 /*
670 * Queue a preemptible-RCU callback for invocation after a grace period. 670 * Queue a preemptible-RCU callback for invocation after a grace period.
671 */ 671 */
672 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 672 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
673 { 673 {
674 __call_rcu(head, func, &rcu_preempt_state); 674 __call_rcu(head, func, &rcu_preempt_state);
675 } 675 }
676 EXPORT_SYMBOL_GPL(call_rcu); 676 EXPORT_SYMBOL_GPL(call_rcu);
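For illustration (not part of this patch), a common asynchronous-reclaim sketch using call_rcu(); struct foo and foo_reclaim() are hypothetical, and old_fp is assumed to have already been unpublished (for example via rcu_assign_pointer()):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	struct rcu_head rh;
	int a;
};

static void foo_reclaim(struct rcu_head *rcu)
{
	struct foo *fp = container_of(rcu, struct foo, rh);

	kfree(fp);			/* runs only after a grace period */
}

static void foo_retire(struct foo *old_fp)
{
	/* old_fp is already unreachable to new readers at this point. */
	call_rcu(&old_fp->rh, foo_reclaim);
}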
677 677
678 /** 678 /**
679 * synchronize_rcu - wait until a grace period has elapsed. 679 * synchronize_rcu - wait until a grace period has elapsed.
680 * 680 *
681 * Control will return to the caller some time after a full grace 681 * Control will return to the caller some time after a full grace
682 * period has elapsed, in other words after all currently executing RCU 682 * period has elapsed, in other words after all currently executing RCU
683 * read-side critical sections have completed. Note, however, that 683 * read-side critical sections have completed. Note, however, that
684 * upon return from synchronize_rcu(), the caller might well be executing 684 * upon return from synchronize_rcu(), the caller might well be executing
685 * concurrently with new RCU read-side critical sections that began while 685 * concurrently with new RCU read-side critical sections that began while
686 * synchronize_rcu() was waiting. RCU read-side critical sections are 686 * synchronize_rcu() was waiting. RCU read-side critical sections are
687 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. 687 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
688 */ 688 */
689 void synchronize_rcu(void) 689 void synchronize_rcu(void)
690 { 690 {
691 if (!rcu_scheduler_active) 691 if (!rcu_scheduler_active)
692 return; 692 return;
693 wait_rcu_gp(call_rcu); 693 wait_rcu_gp(call_rcu);
694 } 694 }
695 EXPORT_SYMBOL_GPL(synchronize_rcu); 695 EXPORT_SYMBOL_GPL(synchronize_rcu);
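For illustration (not part of this patch), the usual synchronous update pattern built on synchronize_rcu(), reusing the hypothetical struct foo and gp from the reader sketch above; foo_lock and foo_update() are likewise hypothetical, with foo_lock assumed to serialize updaters:

static DEFINE_SPINLOCK(foo_lock);	/* hypothetical update-side lock */

static void foo_update(struct foo *new_fp)
{
	struct foo *old_fp;

	spin_lock(&foo_lock);
	old_fp = rcu_dereference_protected(gp, lockdep_is_held(&foo_lock));
	rcu_assign_pointer(gp, new_fp);	/* publish the new version */
	spin_unlock(&foo_lock);

	synchronize_rcu();		/* wait for pre-existing readers */
	kfree(old_fp);			/* no reader can still hold old_fp */
}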
696 696
697 static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); 697 static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
698 static long sync_rcu_preempt_exp_count; 698 static long sync_rcu_preempt_exp_count;
699 static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); 699 static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
700 700
701 /* 701 /*
702 * Return non-zero if there are any tasks in RCU read-side critical 702 * Return non-zero if there are any tasks in RCU read-side critical
703 * sections blocking the current preemptible-RCU expedited grace period. 703 * sections blocking the current preemptible-RCU expedited grace period.
704 * If there is no preemptible-RCU expedited grace period currently in 704 * If there is no preemptible-RCU expedited grace period currently in
705 * progress, returns zero unconditionally. 705 * progress, returns zero unconditionally.
706 */ 706 */
707 static int rcu_preempted_readers_exp(struct rcu_node *rnp) 707 static int rcu_preempted_readers_exp(struct rcu_node *rnp)
708 { 708 {
709 return rnp->exp_tasks != NULL; 709 return rnp->exp_tasks != NULL;
710 } 710 }
711 711
712 /* 712 /*
713 * Return non-zero if there is no RCU expedited grace period in progress 713 * Return non-zero if there is no RCU expedited grace period in progress
714 * for the specified rcu_node structure, in other words, if all CPUs and 714 * for the specified rcu_node structure, in other words, if all CPUs and
715 * tasks covered by the specified rcu_node structure have done their bit 715 * tasks covered by the specified rcu_node structure have done their bit
716 * for the current expedited grace period. Works only for preemptible 716 * for the current expedited grace period. Works only for preemptible
717 * RCU -- other RCU implementations use other means. 717 * RCU -- other RCU implementations use other means.
718 * 718 *
719 * Caller must hold sync_rcu_preempt_exp_mutex. 719 * Caller must hold sync_rcu_preempt_exp_mutex.
720 */ 720 */
721 static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) 721 static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
722 { 722 {
723 return !rcu_preempted_readers_exp(rnp) && 723 return !rcu_preempted_readers_exp(rnp) &&
724 ACCESS_ONCE(rnp->expmask) == 0; 724 ACCESS_ONCE(rnp->expmask) == 0;
725 } 725 }
726 726
727 /* 727 /*
728 * Report the exit from RCU read-side critical section for the last task 728 * Report the exit from RCU read-side critical section for the last task
729 * that queued itself during or before the current expedited preemptible-RCU 729 * that queued itself during or before the current expedited preemptible-RCU
730 * grace period. This event is reported either to the rcu_node structure on 730 * grace period. This event is reported either to the rcu_node structure on
731 * which the task was queued or to one of that rcu_node structure's ancestors, 731 * which the task was queued or to one of that rcu_node structure's ancestors,
732 * recursively up the tree. (Calm down, calm down, we do the recursion 732 * recursively up the tree. (Calm down, calm down, we do the recursion
733 * iteratively!) 733 * iteratively!)
734 * 734 *
735 * Most callers will set the "wake" flag, but the task initiating the 735 * Most callers will set the "wake" flag, but the task initiating the
736 * expedited grace period need not wake itself. 736 * expedited grace period need not wake itself.
737 * 737 *
738 * Caller must hold sync_rcu_preempt_exp_mutex. 738 * Caller must hold sync_rcu_preempt_exp_mutex.
739 */ 739 */
740 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 740 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
741 bool wake) 741 bool wake)
742 { 742 {
743 unsigned long flags; 743 unsigned long flags;
744 unsigned long mask; 744 unsigned long mask;
745 745
746 raw_spin_lock_irqsave(&rnp->lock, flags); 746 raw_spin_lock_irqsave(&rnp->lock, flags);
747 for (;;) { 747 for (;;) {
748 if (!sync_rcu_preempt_exp_done(rnp)) { 748 if (!sync_rcu_preempt_exp_done(rnp)) {
749 raw_spin_unlock_irqrestore(&rnp->lock, flags); 749 raw_spin_unlock_irqrestore(&rnp->lock, flags);
750 break; 750 break;
751 } 751 }
752 if (rnp->parent == NULL) { 752 if (rnp->parent == NULL) {
753 raw_spin_unlock_irqrestore(&rnp->lock, flags); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
754 if (wake) 754 if (wake)
755 wake_up(&sync_rcu_preempt_exp_wq); 755 wake_up(&sync_rcu_preempt_exp_wq);
756 break; 756 break;
757 } 757 }
758 mask = rnp->grpmask; 758 mask = rnp->grpmask;
759 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 759 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
760 rnp = rnp->parent; 760 rnp = rnp->parent;
761 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 761 raw_spin_lock(&rnp->lock); /* irqs already disabled */
762 rnp->expmask &= ~mask; 762 rnp->expmask &= ~mask;
763 } 763 }
764 } 764 }
765 765
766 /* 766 /*
767 * Snapshot the tasks blocking the newly started preemptible-RCU expedited 767 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
768 * grace period for the specified rcu_node structure. If there are no such 768 * grace period for the specified rcu_node structure. If there are no such
769 * tasks, report it up the rcu_node hierarchy. 769 * tasks, report it up the rcu_node hierarchy.
770 * 770 *
771 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. 771 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
772 */ 772 */
773 static void 773 static void
774 sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 774 sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
775 { 775 {
776 unsigned long flags; 776 unsigned long flags;
777 int must_wait = 0; 777 int must_wait = 0;
778 778
779 raw_spin_lock_irqsave(&rnp->lock, flags); 779 raw_spin_lock_irqsave(&rnp->lock, flags);
780 if (list_empty(&rnp->blkd_tasks)) 780 if (list_empty(&rnp->blkd_tasks))
781 raw_spin_unlock_irqrestore(&rnp->lock, flags); 781 raw_spin_unlock_irqrestore(&rnp->lock, flags);
782 else { 782 else {
783 rnp->exp_tasks = rnp->blkd_tasks.next; 783 rnp->exp_tasks = rnp->blkd_tasks.next;
784 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 784 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
785 must_wait = 1; 785 must_wait = 1;
786 } 786 }
787 if (!must_wait) 787 if (!must_wait)
788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ 788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
789 } 789 }
790 790
791 /* 791 /*
792 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 792 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
793 * is to invoke synchronize_sched_expedited() to push all the tasks to 793 * is to invoke synchronize_sched_expedited() to push all the tasks to
794 * the ->blkd_tasks lists and wait for these lists to drain. 794 * the ->blkd_tasks lists and wait for these lists to drain.
795 */ 795 */
796 void synchronize_rcu_expedited(void) 796 void synchronize_rcu_expedited(void)
797 { 797 {
798 unsigned long flags; 798 unsigned long flags;
799 struct rcu_node *rnp; 799 struct rcu_node *rnp;
800 struct rcu_state *rsp = &rcu_preempt_state; 800 struct rcu_state *rsp = &rcu_preempt_state;
801 long snap; 801 long snap;
802 int trycount = 0; 802 int trycount = 0;
803 803
804 smp_mb(); /* Caller's modifications seen first by other CPUs. */ 804 smp_mb(); /* Caller's modifications seen first by other CPUs. */
805 snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; 805 snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
806 smp_mb(); /* Above access cannot bleed into critical section. */ 806 smp_mb(); /* Above access cannot bleed into critical section. */
807 807
808 /* 808 /*
809 * Acquire lock, falling back to synchronize_rcu() if too many 809 * Acquire lock, falling back to synchronize_rcu() if too many
810 * lock-acquisition failures. Of course, if someone does the 810 * lock-acquisition failures. Of course, if someone does the
811 * expedited grace period for us, just leave. 811 * expedited grace period for us, just leave.
812 */ 812 */
813 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { 813 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
814 if (trycount++ < 10) 814 if (trycount++ < 10)
815 udelay(trycount * num_online_cpus()); 815 udelay(trycount * num_online_cpus());
816 else { 816 else {
817 synchronize_rcu(); 817 synchronize_rcu();
818 return; 818 return;
819 } 819 }
820 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 820 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
821 goto mb_ret; /* Others did our work for us. */ 821 goto mb_ret; /* Others did our work for us. */
822 } 822 }
823 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 823 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
824 goto unlock_mb_ret; /* Others did our work for us. */ 824 goto unlock_mb_ret; /* Others did our work for us. */
825 825
826 /* force all RCU readers onto ->blkd_tasks lists. */ 826 /* force all RCU readers onto ->blkd_tasks lists. */
827 synchronize_sched_expedited(); 827 synchronize_sched_expedited();
828 828
829 raw_spin_lock_irqsave(&rsp->onofflock, flags); 829 raw_spin_lock_irqsave(&rsp->onofflock, flags);
830 830
831 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 831 /* Initialize ->expmask for all non-leaf rcu_node structures. */
832 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 832 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
833 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 833 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
834 rnp->expmask = rnp->qsmaskinit; 834 rnp->expmask = rnp->qsmaskinit;
835 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 835 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
836 } 836 }
837 837
838 /* Snapshot current state of ->blkd_tasks lists. */ 838 /* Snapshot current state of ->blkd_tasks lists. */
839 rcu_for_each_leaf_node(rsp, rnp) 839 rcu_for_each_leaf_node(rsp, rnp)
840 sync_rcu_preempt_exp_init(rsp, rnp); 840 sync_rcu_preempt_exp_init(rsp, rnp);
841 if (NUM_RCU_NODES > 1) 841 if (NUM_RCU_NODES > 1)
842 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 842 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
843 843
844 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 844 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
845 845
846 /* Wait for snapshotted ->blkd_tasks lists to drain. */ 846 /* Wait for snapshotted ->blkd_tasks lists to drain. */
847 rnp = rcu_get_root(rsp); 847 rnp = rcu_get_root(rsp);
848 wait_event(sync_rcu_preempt_exp_wq, 848 wait_event(sync_rcu_preempt_exp_wq,
849 sync_rcu_preempt_exp_done(rnp)); 849 sync_rcu_preempt_exp_done(rnp));
850 850
851 /* Clean up and exit. */ 851 /* Clean up and exit. */
852 smp_mb(); /* ensure expedited GP seen before counter increment. */ 852 smp_mb(); /* ensure expedited GP seen before counter increment. */
853 ACCESS_ONCE(sync_rcu_preempt_exp_count)++; 853 ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
854 unlock_mb_ret: 854 unlock_mb_ret:
855 mutex_unlock(&sync_rcu_preempt_exp_mutex); 855 mutex_unlock(&sync_rcu_preempt_exp_mutex);
856 mb_ret: 856 mb_ret:
857 smp_mb(); /* ensure subsequent action seen after grace period. */ 857 smp_mb(); /* ensure subsequent action seen after grace period. */
858 } 858 }
859 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 859 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
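For reference, a minimal update-side sketch of how a caller might use synchronize_rcu_expedited(); the gadget structure, global_gadget pointer, gadget_lock, and free path are illustrative assumptions for this note, not part of this commit:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct gadget {
		int value;
		/* ... */
	};

	static struct gadget __rcu *global_gadget;	/* hypothetical RCU-protected pointer */
	static DEFINE_SPINLOCK(gadget_lock);		/* hypothetical update-side lock */

	/* Replace the old gadget and wait (expedited) before freeing it. */
	static void gadget_replace(struct gadget *newg)
	{
		struct gadget *oldg;

		spin_lock(&gadget_lock);
		oldg = rcu_dereference_protected(global_gadget,
						 lockdep_is_held(&gadget_lock));
		rcu_assign_pointer(global_gadget, newg);
		spin_unlock(&gadget_lock);

		synchronize_rcu_expedited();	/* all readers of oldg are done after this */
		kfree(oldg);
	}

The expedited primitive trades extra CPU work (synchronize_sched_expedited() plus the ->blkd_tasks drain above) for lower grace-period latency, so a sketch like this is appropriate only on genuinely latency-sensitive update paths.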
860 860
861 /* 861 /*
862 * Check to see if there is any immediate preemptible-RCU-related work 862 * Check to see if there is any immediate preemptible-RCU-related work
863 * to be done. 863 * to be done.
864 */ 864 */
865 static int rcu_preempt_pending(int cpu) 865 static int rcu_preempt_pending(int cpu)
866 { 866 {
867 return __rcu_pending(&rcu_preempt_state, 867 return __rcu_pending(&rcu_preempt_state,
868 &per_cpu(rcu_preempt_data, cpu)); 868 &per_cpu(rcu_preempt_data, cpu));
869 } 869 }
870 870
871 /* 871 /*
872 * Does preemptible RCU need the CPU to stay out of dynticks mode? 872 * Does preemptible RCU need the CPU to stay out of dynticks mode?
873 */ 873 */
874 static int rcu_preempt_needs_cpu(int cpu) 874 static int rcu_preempt_needs_cpu(int cpu)
875 { 875 {
876 return !!per_cpu(rcu_preempt_data, cpu).nxtlist; 876 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
877 } 877 }
878 878
879 /** 879 /**
880 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 880 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
881 */ 881 */
882 void rcu_barrier(void) 882 void rcu_barrier(void)
883 { 883 {
884 _rcu_barrier(&rcu_preempt_state, call_rcu); 884 _rcu_barrier(&rcu_preempt_state, call_rcu);
885 } 885 }
886 EXPORT_SYMBOL_GPL(rcu_barrier); 886 EXPORT_SYMBOL_GPL(rcu_barrier);
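As a reminder of why rcu_barrier() matters, here is a hedged sketch of the usual module-unload pattern it supports; struct foo, foo_free_rcu(), and foo_exit() are illustrative names, not anything defined by this commit:

	#include <linux/kernel.h>
	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct foo {
		struct rcu_head rcu;
		/* ... payload ... */
	};

	static void foo_free_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct foo, rcu));
	}

	static void foo_delete(struct foo *fp)
	{
		/* Unlink fp from its data structure first (not shown). */
		call_rcu(&fp->rcu, foo_free_rcu);
	}

	static void __exit foo_exit(void)
	{
		/*
		 * Wait for every callback queued by foo_delete() to run
		 * before the module text containing foo_free_rcu() goes away.
		 */
		rcu_barrier();
	}
	module_exit(foo_exit);

Without the rcu_barrier() call, an in-flight callback could still be invoked after the module is unloaded, dereferencing freed module text.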
887 887
888 /* 888 /*
889 * Initialize preemptible RCU's per-CPU data. 889 * Initialize preemptible RCU's per-CPU data.
890 */ 890 */
891 static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 891 static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
892 { 892 {
893 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1); 893 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
894 } 894 }
895 895
896 /* 896 /*
 897 * Move preemptible RCU's callbacks from a dying CPU to another online CPU. 897 * Move preemptible RCU's callbacks from a dying CPU to another online CPU.
898 */ 898 */
899 static void rcu_preempt_send_cbs_to_online(void) 899 static void rcu_preempt_send_cbs_to_online(void)
900 { 900 {
901 rcu_send_cbs_to_online(&rcu_preempt_state); 901 rcu_send_cbs_to_online(&rcu_preempt_state);
902 } 902 }
903 903
904 /* 904 /*
905 * Initialize preemptible RCU's state structures. 905 * Initialize preemptible RCU's state structures.
906 */ 906 */
907 static void __init __rcu_init_preempt(void) 907 static void __init __rcu_init_preempt(void)
908 { 908 {
909 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 909 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
910 } 910 }
911 911
912 /* 912 /*
913 * Check for a task exiting while in a preemptible-RCU read-side 913 * Check for a task exiting while in a preemptible-RCU read-side
 914 * critical section; clean up if so. No need to issue warnings, 914 * critical section; clean up if so. No need to issue warnings,
915 * as debug_check_no_locks_held() already does this if lockdep 915 * as debug_check_no_locks_held() already does this if lockdep
916 * is enabled. 916 * is enabled.
917 */ 917 */
918 void exit_rcu(void) 918 void exit_rcu(void)
919 { 919 {
920 struct task_struct *t = current; 920 struct task_struct *t = current;
921 921
922 if (t->rcu_read_lock_nesting == 0) 922 if (t->rcu_read_lock_nesting == 0)
923 return; 923 return;
924 t->rcu_read_lock_nesting = 1; 924 t->rcu_read_lock_nesting = 1;
925 __rcu_read_unlock(); 925 __rcu_read_unlock();
926 } 926 }
927 927
928 #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 928 #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
929 929
930 static struct rcu_state *rcu_state = &rcu_sched_state; 930 static struct rcu_state *rcu_state = &rcu_sched_state;
931 931
932 /* 932 /*
933 * Tell them what RCU they are running. 933 * Tell them what RCU they are running.
934 */ 934 */
935 static void __init rcu_bootup_announce(void) 935 static void __init rcu_bootup_announce(void)
936 { 936 {
937 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 937 printk(KERN_INFO "Hierarchical RCU implementation.\n");
938 rcu_bootup_announce_oddness(); 938 rcu_bootup_announce_oddness();
939 } 939 }
940 940
941 /* 941 /*
942 * Return the number of RCU batches processed thus far for debug & stats. 942 * Return the number of RCU batches processed thus far for debug & stats.
943 */ 943 */
944 long rcu_batches_completed(void) 944 long rcu_batches_completed(void)
945 { 945 {
946 return rcu_batches_completed_sched(); 946 return rcu_batches_completed_sched();
947 } 947 }
948 EXPORT_SYMBOL_GPL(rcu_batches_completed); 948 EXPORT_SYMBOL_GPL(rcu_batches_completed);
949 949
950 /* 950 /*
951 * Force a quiescent state for RCU, which, because there is no preemptible 951 * Force a quiescent state for RCU, which, because there is no preemptible
952 * RCU, becomes the same as rcu-sched. 952 * RCU, becomes the same as rcu-sched.
953 */ 953 */
954 void rcu_force_quiescent_state(void) 954 void rcu_force_quiescent_state(void)
955 { 955 {
956 rcu_sched_force_quiescent_state(); 956 rcu_sched_force_quiescent_state();
957 } 957 }
958 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 958 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
959 959
960 /* 960 /*
961 * Because preemptible RCU does not exist, we never have to check for 961 * Because preemptible RCU does not exist, we never have to check for
962 * CPUs being in quiescent states. 962 * CPUs being in quiescent states.
963 */ 963 */
964 static void rcu_preempt_note_context_switch(int cpu) 964 static void rcu_preempt_note_context_switch(int cpu)
965 { 965 {
966 } 966 }
967 967
968 /* 968 /*
969 * Because preemptible RCU does not exist, there are never any preempted 969 * Because preemptible RCU does not exist, there are never any preempted
970 * RCU readers. 970 * RCU readers.
971 */ 971 */
972 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) 972 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
973 { 973 {
974 return 0; 974 return 0;
975 } 975 }
976 976
977 #ifdef CONFIG_HOTPLUG_CPU 977 #ifdef CONFIG_HOTPLUG_CPU
978 978
979 /* Because preemptible RCU does not exist, no quieting of tasks. */ 979 /* Because preemptible RCU does not exist, no quieting of tasks. */
980 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 980 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
981 { 981 {
982 raw_spin_unlock_irqrestore(&rnp->lock, flags); 982 raw_spin_unlock_irqrestore(&rnp->lock, flags);
983 } 983 }
984 984
985 #endif /* #ifdef CONFIG_HOTPLUG_CPU */ 985 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
986 986
987 /* 987 /*
988 * Because preemptible RCU does not exist, we never have to check for 988 * Because preemptible RCU does not exist, we never have to check for
989 * tasks blocked within RCU read-side critical sections. 989 * tasks blocked within RCU read-side critical sections.
990 */ 990 */
991 static void rcu_print_detail_task_stall(struct rcu_state *rsp) 991 static void rcu_print_detail_task_stall(struct rcu_state *rsp)
992 { 992 {
993 } 993 }
994 994
995 /* 995 /*
996 * Because preemptible RCU does not exist, we never have to check for 996 * Because preemptible RCU does not exist, we never have to check for
997 * tasks blocked within RCU read-side critical sections. 997 * tasks blocked within RCU read-side critical sections.
998 */ 998 */
999 static int rcu_print_task_stall(struct rcu_node *rnp) 999 static int rcu_print_task_stall(struct rcu_node *rnp)
1000 { 1000 {
1001 return 0; 1001 return 0;
1002 } 1002 }
1003 1003
1004 /* 1004 /*
1005 * Because preemptible RCU does not exist, there is no need to suppress 1005 * Because preemptible RCU does not exist, there is no need to suppress
1006 * its CPU stall warnings. 1006 * its CPU stall warnings.
1007 */ 1007 */
1008 static void rcu_preempt_stall_reset(void) 1008 static void rcu_preempt_stall_reset(void)
1009 { 1009 {
1010 } 1010 }
1011 1011
1012 /* 1012 /*
1013 * Because there is no preemptible RCU, there can be no readers blocked, 1013 * Because there is no preemptible RCU, there can be no readers blocked,
1014 * so there is no need to check for blocked tasks. So check only for 1014 * so there is no need to check for blocked tasks. So check only for
1015 * bogus qsmask values. 1015 * bogus qsmask values.
1016 */ 1016 */
1017 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 1017 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
1018 { 1018 {
1019 WARN_ON_ONCE(rnp->qsmask); 1019 WARN_ON_ONCE(rnp->qsmask);
1020 } 1020 }
1021 1021
1022 #ifdef CONFIG_HOTPLUG_CPU 1022 #ifdef CONFIG_HOTPLUG_CPU
1023 1023
1024 /* 1024 /*
1025 * Because preemptible RCU does not exist, it never needs to migrate 1025 * Because preemptible RCU does not exist, it never needs to migrate
1026 * tasks that were blocked within RCU read-side critical sections, and 1026 * tasks that were blocked within RCU read-side critical sections, and
1027 * such non-existent tasks cannot possibly have been blocking the current 1027 * such non-existent tasks cannot possibly have been blocking the current
1028 * grace period. 1028 * grace period.
1029 */ 1029 */
1030 static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 1030 static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1031 struct rcu_node *rnp, 1031 struct rcu_node *rnp,
1032 struct rcu_data *rdp) 1032 struct rcu_data *rdp)
1033 { 1033 {
1034 return 0; 1034 return 0;
1035 } 1035 }
1036 1036
1037 /* 1037 /*
1038 * Because preemptible RCU does not exist, it never needs CPU-offline 1038 * Because preemptible RCU does not exist, it never needs CPU-offline
1039 * processing. 1039 * processing.
1040 */ 1040 */
1041 static void rcu_preempt_offline_cpu(int cpu) 1041 static void rcu_preempt_offline_cpu(int cpu)
1042 { 1042 {
1043 } 1043 }
1044 1044
1045 #endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1045 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
1046 1046
1047 /* 1047 /*
1048 * Because preemptible RCU does not exist, it never has any callbacks 1048 * Because preemptible RCU does not exist, it never has any callbacks
1049 * to check. 1049 * to check.
1050 */ 1050 */
1051 static void rcu_preempt_check_callbacks(int cpu) 1051 static void rcu_preempt_check_callbacks(int cpu)
1052 { 1052 {
1053 } 1053 }
1054 1054
1055 /* 1055 /*
1056 * Because preemptible RCU does not exist, it never has any callbacks 1056 * Because preemptible RCU does not exist, it never has any callbacks
1057 * to process. 1057 * to process.
1058 */ 1058 */
1059 static void rcu_preempt_process_callbacks(void) 1059 static void rcu_preempt_process_callbacks(void)
1060 { 1060 {
1061 } 1061 }
1062 1062
1063 /* 1063 /*
1064 * Wait for an rcu-preempt grace period, but make it happen quickly. 1064 * Wait for an rcu-preempt grace period, but make it happen quickly.
1065 * But because preemptible RCU does not exist, map to rcu-sched. 1065 * But because preemptible RCU does not exist, map to rcu-sched.
1066 */ 1066 */
1067 void synchronize_rcu_expedited(void) 1067 void synchronize_rcu_expedited(void)
1068 { 1068 {
1069 synchronize_sched_expedited(); 1069 synchronize_sched_expedited();
1070 } 1070 }
1071 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 1071 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1072 1072
1073 #ifdef CONFIG_HOTPLUG_CPU 1073 #ifdef CONFIG_HOTPLUG_CPU
1074 1074
1075 /* 1075 /*
1076 * Because preemptible RCU does not exist, there is never any need to 1076 * Because preemptible RCU does not exist, there is never any need to
1077 * report on tasks preempted in RCU read-side critical sections during 1077 * report on tasks preempted in RCU read-side critical sections during
1078 * expedited RCU grace periods. 1078 * expedited RCU grace periods.
1079 */ 1079 */
1080 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 1080 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1081 bool wake) 1081 bool wake)
1082 { 1082 {
1083 } 1083 }
1084 1084
1085 #endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1085 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
1086 1086
1087 /* 1087 /*
1088 * Because preemptible RCU does not exist, it never has any work to do. 1088 * Because preemptible RCU does not exist, it never has any work to do.
1089 */ 1089 */
1090 static int rcu_preempt_pending(int cpu) 1090 static int rcu_preempt_pending(int cpu)
1091 { 1091 {
1092 return 0; 1092 return 0;
1093 } 1093 }
1094 1094
1095 /* 1095 /*
1096 * Because preemptible RCU does not exist, it never needs any CPU. 1096 * Because preemptible RCU does not exist, it never needs any CPU.
1097 */ 1097 */
1098 static int rcu_preempt_needs_cpu(int cpu) 1098 static int rcu_preempt_needs_cpu(int cpu)
1099 { 1099 {
1100 return 0; 1100 return 0;
1101 } 1101 }
1102 1102
1103 /* 1103 /*
1104 * Because preemptible RCU does not exist, rcu_barrier() is just 1104 * Because preemptible RCU does not exist, rcu_barrier() is just
1105 * another name for rcu_barrier_sched(). 1105 * another name for rcu_barrier_sched().
1106 */ 1106 */
1107 void rcu_barrier(void) 1107 void rcu_barrier(void)
1108 { 1108 {
1109 rcu_barrier_sched(); 1109 rcu_barrier_sched();
1110 } 1110 }
1111 EXPORT_SYMBOL_GPL(rcu_barrier); 1111 EXPORT_SYMBOL_GPL(rcu_barrier);
1112 1112
1113 /* 1113 /*
1114 * Because preemptible RCU does not exist, there is no per-CPU 1114 * Because preemptible RCU does not exist, there is no per-CPU
1115 * data to initialize. 1115 * data to initialize.
1116 */ 1116 */
1117 static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 1117 static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1118 { 1118 {
1119 } 1119 }
1120 1120
1121 /* 1121 /*
1122 * Because there is no preemptible RCU, there are no callbacks to move. 1122 * Because there is no preemptible RCU, there are no callbacks to move.
1123 */ 1123 */
1124 static void rcu_preempt_send_cbs_to_online(void) 1124 static void rcu_preempt_send_cbs_to_online(void)
1125 { 1125 {
1126 } 1126 }
1127 1127
1128 /* 1128 /*
1129 * Because preemptible RCU does not exist, it need not be initialized. 1129 * Because preemptible RCU does not exist, it need not be initialized.
1130 */ 1130 */
1131 static void __init __rcu_init_preempt(void) 1131 static void __init __rcu_init_preempt(void)
1132 { 1132 {
1133 } 1133 }
1134 1134
1135 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1135 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1136 1136
1137 #ifdef CONFIG_RCU_BOOST 1137 #ifdef CONFIG_RCU_BOOST
1138 1138
1139 #include "rtmutex_common.h" 1139 #include "rtmutex_common.h"
1140 1140
1141 #ifdef CONFIG_RCU_TRACE 1141 #ifdef CONFIG_RCU_TRACE
1142 1142
1143 static void rcu_initiate_boost_trace(struct rcu_node *rnp) 1143 static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1144 { 1144 {
1145 if (list_empty(&rnp->blkd_tasks)) 1145 if (list_empty(&rnp->blkd_tasks))
1146 rnp->n_balk_blkd_tasks++; 1146 rnp->n_balk_blkd_tasks++;
1147 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) 1147 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1148 rnp->n_balk_exp_gp_tasks++; 1148 rnp->n_balk_exp_gp_tasks++;
1149 else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) 1149 else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1150 rnp->n_balk_boost_tasks++; 1150 rnp->n_balk_boost_tasks++;
1151 else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) 1151 else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1152 rnp->n_balk_notblocked++; 1152 rnp->n_balk_notblocked++;
1153 else if (rnp->gp_tasks != NULL && 1153 else if (rnp->gp_tasks != NULL &&
1154 ULONG_CMP_LT(jiffies, rnp->boost_time)) 1154 ULONG_CMP_LT(jiffies, rnp->boost_time))
1155 rnp->n_balk_notyet++; 1155 rnp->n_balk_notyet++;
1156 else 1156 else
1157 rnp->n_balk_nos++; 1157 rnp->n_balk_nos++;
1158 } 1158 }
1159 1159
1160 #else /* #ifdef CONFIG_RCU_TRACE */ 1160 #else /* #ifdef CONFIG_RCU_TRACE */
1161 1161
1162 static void rcu_initiate_boost_trace(struct rcu_node *rnp) 1162 static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1163 { 1163 {
1164 } 1164 }
1165 1165
1166 #endif /* #else #ifdef CONFIG_RCU_TRACE */ 1166 #endif /* #else #ifdef CONFIG_RCU_TRACE */
1167 1167
1168 static struct lock_class_key rcu_boost_class; 1168 static struct lock_class_key rcu_boost_class;
1169 1169
1170 /* 1170 /*
1171 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1171 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1172 * or ->boost_tasks, advancing the pointer to the next task in the 1172 * or ->boost_tasks, advancing the pointer to the next task in the
1173 * ->blkd_tasks list. 1173 * ->blkd_tasks list.
1174 * 1174 *
1175 * Note that irqs must be enabled: boosting the task can block. 1175 * Note that irqs must be enabled: boosting the task can block.
1176 * Returns 1 if there are more tasks needing to be boosted. 1176 * Returns 1 if there are more tasks needing to be boosted.
1177 */ 1177 */
1178 static int rcu_boost(struct rcu_node *rnp) 1178 static int rcu_boost(struct rcu_node *rnp)
1179 { 1179 {
1180 unsigned long flags; 1180 unsigned long flags;
1181 struct rt_mutex mtx; 1181 struct rt_mutex mtx;
1182 struct task_struct *t; 1182 struct task_struct *t;
1183 struct list_head *tb; 1183 struct list_head *tb;
1184 1184
1185 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) 1185 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1186 return 0; /* Nothing left to boost. */ 1186 return 0; /* Nothing left to boost. */
1187 1187
1188 raw_spin_lock_irqsave(&rnp->lock, flags); 1188 raw_spin_lock_irqsave(&rnp->lock, flags);
1189 1189
1190 /* 1190 /*
1191 * Recheck under the lock: all tasks in need of boosting 1191 * Recheck under the lock: all tasks in need of boosting
1192 * might exit their RCU read-side critical sections on their own. 1192 * might exit their RCU read-side critical sections on their own.
1193 */ 1193 */
1194 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { 1194 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1195 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1195 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1196 return 0; 1196 return 0;
1197 } 1197 }
1198 1198
1199 /* 1199 /*
1200 * Preferentially boost tasks blocking expedited grace periods. 1200 * Preferentially boost tasks blocking expedited grace periods.
1201 * This cannot starve the normal grace periods because a second 1201 * This cannot starve the normal grace periods because a second
1202 * expedited grace period must boost all blocked tasks, including 1202 * expedited grace period must boost all blocked tasks, including
1203 * those blocking the pre-existing normal grace period. 1203 * those blocking the pre-existing normal grace period.
1204 */ 1204 */
1205 if (rnp->exp_tasks != NULL) { 1205 if (rnp->exp_tasks != NULL) {
1206 tb = rnp->exp_tasks; 1206 tb = rnp->exp_tasks;
1207 rnp->n_exp_boosts++; 1207 rnp->n_exp_boosts++;
1208 } else { 1208 } else {
1209 tb = rnp->boost_tasks; 1209 tb = rnp->boost_tasks;
1210 rnp->n_normal_boosts++; 1210 rnp->n_normal_boosts++;
1211 } 1211 }
1212 rnp->n_tasks_boosted++; 1212 rnp->n_tasks_boosted++;
1213 1213
1214 /* 1214 /*
1215 * We boost task t by manufacturing an rt_mutex that appears to 1215 * We boost task t by manufacturing an rt_mutex that appears to
1216 * be held by task t. We leave a pointer to that rt_mutex where 1216 * be held by task t. We leave a pointer to that rt_mutex where
1217 * task t can find it, and task t will release the mutex when it 1217 * task t can find it, and task t will release the mutex when it
1218 * exits its outermost RCU read-side critical section. Then 1218 * exits its outermost RCU read-side critical section. Then
1219 * simply acquiring this artificial rt_mutex will boost task 1219 * simply acquiring this artificial rt_mutex will boost task
1220 * t's priority. (Thanks to tglx for suggesting this approach!) 1220 * t's priority. (Thanks to tglx for suggesting this approach!)
1221 * 1221 *
1222 * Note that task t must acquire rnp->lock to remove itself from 1222 * Note that task t must acquire rnp->lock to remove itself from
1223 * the ->blkd_tasks list, which it will do from exit() if from 1223 * the ->blkd_tasks list, which it will do from exit() if from
1224 * nowhere else. We therefore are guaranteed that task t will 1224 * nowhere else. We therefore are guaranteed that task t will
1225 * stay around at least until we drop rnp->lock. Note that 1225 * stay around at least until we drop rnp->lock. Note that
1226 * rnp->lock also resolves races between our priority boosting 1226 * rnp->lock also resolves races between our priority boosting
1227 * and task t's exiting its outermost RCU read-side critical 1227 * and task t's exiting its outermost RCU read-side critical
1228 * section. 1228 * section.
1229 */ 1229 */
1230 t = container_of(tb, struct task_struct, rcu_node_entry); 1230 t = container_of(tb, struct task_struct, rcu_node_entry);
1231 rt_mutex_init_proxy_locked(&mtx, t); 1231 rt_mutex_init_proxy_locked(&mtx, t);
1232 /* Avoid lockdep false positives. This rt_mutex is its own thing. */ 1232 /* Avoid lockdep false positives. This rt_mutex is its own thing. */
1233 lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class, 1233 lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
1234 "rcu_boost_mutex"); 1234 "rcu_boost_mutex");
1235 t->rcu_boost_mutex = &mtx; 1235 t->rcu_boost_mutex = &mtx;
1236 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1236 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1237 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1237 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1238 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1238 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1239 1239
1240 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; 1240 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1241 } 1241 }
1242 1242
1243 /* 1243 /*
1244 * Timer handler to initiate waking up of boost kthreads that 1244 * Timer handler to initiate waking up of boost kthreads that
1245 * have yielded the CPU due to excessive numbers of tasks to 1245 * have yielded the CPU due to excessive numbers of tasks to
1246 * boost. We wake up the per-rcu_node kthread, which in turn 1246 * boost. We wake up the per-rcu_node kthread, which in turn
1247 * will wake up the booster kthread. 1247 * will wake up the booster kthread.
1248 */ 1248 */
1249 static void rcu_boost_kthread_timer(unsigned long arg) 1249 static void rcu_boost_kthread_timer(unsigned long arg)
1250 { 1250 {
1251 invoke_rcu_node_kthread((struct rcu_node *)arg); 1251 invoke_rcu_node_kthread((struct rcu_node *)arg);
1252 } 1252 }
1253 1253
1254 /* 1254 /*
1255 * Priority-boosting kthread. One per leaf rcu_node and one for the 1255 * Priority-boosting kthread. One per leaf rcu_node and one for the
1256 * root rcu_node. 1256 * root rcu_node.
1257 */ 1257 */
1258 static int rcu_boost_kthread(void *arg) 1258 static int rcu_boost_kthread(void *arg)
1259 { 1259 {
1260 struct rcu_node *rnp = (struct rcu_node *)arg; 1260 struct rcu_node *rnp = (struct rcu_node *)arg;
1261 int spincnt = 0; 1261 int spincnt = 0;
1262 int more2boost; 1262 int more2boost;
1263 1263
1264 trace_rcu_utilization("Start boost kthread@init"); 1264 trace_rcu_utilization("Start boost kthread@init");
1265 for (;;) { 1265 for (;;) {
1266 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1266 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1267 trace_rcu_utilization("End boost kthread@rcu_wait"); 1267 trace_rcu_utilization("End boost kthread@rcu_wait");
1268 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1268 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1269 trace_rcu_utilization("Start boost kthread@rcu_wait"); 1269 trace_rcu_utilization("Start boost kthread@rcu_wait");
1270 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1270 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1271 more2boost = rcu_boost(rnp); 1271 more2boost = rcu_boost(rnp);
1272 if (more2boost) 1272 if (more2boost)
1273 spincnt++; 1273 spincnt++;
1274 else 1274 else
1275 spincnt = 0; 1275 spincnt = 0;
1276 if (spincnt > 10) { 1276 if (spincnt > 10) {
1277 trace_rcu_utilization("End boost kthread@rcu_yield"); 1277 trace_rcu_utilization("End boost kthread@rcu_yield");
1278 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); 1278 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1279 trace_rcu_utilization("Start boost kthread@rcu_yield"); 1279 trace_rcu_utilization("Start boost kthread@rcu_yield");
1280 spincnt = 0; 1280 spincnt = 0;
1281 } 1281 }
1282 } 1282 }
1283 /* NOTREACHED */ 1283 /* NOTREACHED */
1284 trace_rcu_utilization("End boost kthread@notreached"); 1284 trace_rcu_utilization("End boost kthread@notreached");
1285 return 0; 1285 return 0;
1286 } 1286 }
1287 1287
1288 /* 1288 /*
1289 * Check to see if it is time to start boosting RCU readers that are 1289 * Check to see if it is time to start boosting RCU readers that are
1290 * blocking the current grace period, and, if so, tell the per-rcu_node 1290 * blocking the current grace period, and, if so, tell the per-rcu_node
1291 * kthread to start boosting them. If there is an expedited grace 1291 * kthread to start boosting them. If there is an expedited grace
1292 * period in progress, it is always time to boost. 1292 * period in progress, it is always time to boost.
1293 * 1293 *
1294 * The caller must hold rnp->lock, which this function releases, 1294 * The caller must hold rnp->lock, which this function releases,
1295 * but irqs remain disabled. The ->boost_kthread_task is immortal, 1295 * but irqs remain disabled. The ->boost_kthread_task is immortal,
1296 * so we don't need to worry about it going away. 1296 * so we don't need to worry about it going away.
1297 */ 1297 */
1298 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1298 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1299 { 1299 {
1300 struct task_struct *t; 1300 struct task_struct *t;
1301 1301
1302 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { 1302 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1303 rnp->n_balk_exp_gp_tasks++; 1303 rnp->n_balk_exp_gp_tasks++;
1304 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1304 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1305 return; 1305 return;
1306 } 1306 }
1307 if (rnp->exp_tasks != NULL || 1307 if (rnp->exp_tasks != NULL ||
1308 (rnp->gp_tasks != NULL && 1308 (rnp->gp_tasks != NULL &&
1309 rnp->boost_tasks == NULL && 1309 rnp->boost_tasks == NULL &&
1310 rnp->qsmask == 0 && 1310 rnp->qsmask == 0 &&
1311 ULONG_CMP_GE(jiffies, rnp->boost_time))) { 1311 ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1312 if (rnp->exp_tasks == NULL) 1312 if (rnp->exp_tasks == NULL)
1313 rnp->boost_tasks = rnp->gp_tasks; 1313 rnp->boost_tasks = rnp->gp_tasks;
1314 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1314 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1315 t = rnp->boost_kthread_task; 1315 t = rnp->boost_kthread_task;
1316 if (t != NULL) 1316 if (t != NULL)
1317 wake_up_process(t); 1317 wake_up_process(t);
1318 } else { 1318 } else {
1319 rcu_initiate_boost_trace(rnp); 1319 rcu_initiate_boost_trace(rnp);
1320 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1320 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1321 } 1321 }
1322 } 1322 }
1323 1323
1324 /* 1324 /*
1325 * Wake up the per-CPU kthread to invoke RCU callbacks. 1325 * Wake up the per-CPU kthread to invoke RCU callbacks.
1326 */ 1326 */
1327 static void invoke_rcu_callbacks_kthread(void) 1327 static void invoke_rcu_callbacks_kthread(void)
1328 { 1328 {
1329 unsigned long flags; 1329 unsigned long flags;
1330 1330
1331 local_irq_save(flags); 1331 local_irq_save(flags);
1332 __this_cpu_write(rcu_cpu_has_work, 1); 1332 __this_cpu_write(rcu_cpu_has_work, 1);
1333 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && 1333 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
1334 current != __this_cpu_read(rcu_cpu_kthread_task)) 1334 current != __this_cpu_read(rcu_cpu_kthread_task))
1335 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); 1335 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1336 local_irq_restore(flags); 1336 local_irq_restore(flags);
1337 } 1337 }
1338 1338
1339 /* 1339 /*
1340 * Set the affinity of the boost kthread. The CPU-hotplug locks are 1340 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1341 * held, so no one should be messing with the existence of the boost 1341 * held, so no one should be messing with the existence of the boost
1342 * kthread. 1342 * kthread.
1343 */ 1343 */
1344 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, 1344 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1345 cpumask_var_t cm) 1345 cpumask_var_t cm)
1346 { 1346 {
1347 struct task_struct *t; 1347 struct task_struct *t;
1348 1348
1349 t = rnp->boost_kthread_task; 1349 t = rnp->boost_kthread_task;
1350 if (t != NULL) 1350 if (t != NULL)
1351 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); 1351 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1352 } 1352 }
1353 1353
1354 #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1354 #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1355 1355
1356 /* 1356 /*
1357 * Do priority-boost accounting for the start of a new grace period. 1357 * Do priority-boost accounting for the start of a new grace period.
1358 */ 1358 */
1359 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1359 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1360 { 1360 {
1361 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; 1361 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1362 } 1362 }
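A hedged worked example of the RCU_BOOST_DELAY_JIFFIES arithmetic above; the CONFIG_RCU_BOOST_DELAY and HZ values are illustrative, not mandated by this commit:

	/*
	 * Illustrative values only: with CONFIG_RCU_BOOST_DELAY = 500 (ms)
	 * and HZ = 250, RCU_BOOST_DELAY_JIFFIES evaluates to
	 * DIV_ROUND_UP(500 * 250, 1000) = DIV_ROUND_UP(125000, 1000) = 125,
	 * so rcu_preempt_boost_start_gp() arms ->boost_time about 125 jiffies
	 * (roughly 500 ms) after the grace period starts.  With HZ = 1000 the
	 * same config yields 500 jiffies, still about 500 ms of real time.
	 */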
1363 1363
1364 /* 1364 /*
1365 * Create an RCU-boost kthread for the specified node if one does not 1365 * Create an RCU-boost kthread for the specified node if one does not
1366 * already exist. We only create this kthread for preemptible RCU. 1366 * already exist. We only create this kthread for preemptible RCU.
1367 * Returns zero if all is well, a negated errno otherwise. 1367 * Returns zero if all is well, a negated errno otherwise.
1368 */ 1368 */
1369 static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1369 static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1370 struct rcu_node *rnp, 1370 struct rcu_node *rnp,
1371 int rnp_index) 1371 int rnp_index)
1372 { 1372 {
1373 unsigned long flags; 1373 unsigned long flags;
1374 struct sched_param sp; 1374 struct sched_param sp;
1375 struct task_struct *t; 1375 struct task_struct *t;
1376 1376
1377 if (&rcu_preempt_state != rsp) 1377 if (&rcu_preempt_state != rsp)
1378 return 0; 1378 return 0;
1379 rsp->boost = 1; 1379 rsp->boost = 1;
1380 if (rnp->boost_kthread_task != NULL) 1380 if (rnp->boost_kthread_task != NULL)
1381 return 0; 1381 return 0;
1382 t = kthread_create(rcu_boost_kthread, (void *)rnp, 1382 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1383 "rcub/%d", rnp_index); 1383 "rcub/%d", rnp_index);
1384 if (IS_ERR(t)) 1384 if (IS_ERR(t))
1385 return PTR_ERR(t); 1385 return PTR_ERR(t);
1386 raw_spin_lock_irqsave(&rnp->lock, flags); 1386 raw_spin_lock_irqsave(&rnp->lock, flags);
1387 rnp->boost_kthread_task = t; 1387 rnp->boost_kthread_task = t;
1388 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1388 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1389 sp.sched_priority = RCU_BOOST_PRIO; 1389 sp.sched_priority = RCU_BOOST_PRIO;
1390 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1390 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1391 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1391 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1392 return 0; 1392 return 0;
1393 } 1393 }
1394 1394
1395 #ifdef CONFIG_HOTPLUG_CPU 1395 #ifdef CONFIG_HOTPLUG_CPU
1396 1396
1397 /* 1397 /*
1398 * Stop RCU's per-CPU kthread when its CPU goes offline. 1398 * Stop RCU's per-CPU kthread when its CPU goes offline.
1399 */ 1399 */
1400 static void rcu_stop_cpu_kthread(int cpu) 1400 static void rcu_stop_cpu_kthread(int cpu)
1401 { 1401 {
1402 struct task_struct *t; 1402 struct task_struct *t;
1403 1403
1404 /* Stop the CPU's kthread. */ 1404 /* Stop the CPU's kthread. */
1405 t = per_cpu(rcu_cpu_kthread_task, cpu); 1405 t = per_cpu(rcu_cpu_kthread_task, cpu);
1406 if (t != NULL) { 1406 if (t != NULL) {
1407 per_cpu(rcu_cpu_kthread_task, cpu) = NULL; 1407 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1408 kthread_stop(t); 1408 kthread_stop(t);
1409 } 1409 }
1410 } 1410 }
1411 1411
1412 #endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1412 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
1413 1413
1414 static void rcu_kthread_do_work(void) 1414 static void rcu_kthread_do_work(void)
1415 { 1415 {
1416 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); 1416 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
1417 rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1417 rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1418 rcu_preempt_do_callbacks(); 1418 rcu_preempt_do_callbacks();
1419 } 1419 }
1420 1420
1421 /* 1421 /*
1422 * Wake up the specified per-rcu_node-structure kthread. 1422 * Wake up the specified per-rcu_node-structure kthread.
1423 * Because the per-rcu_node kthreads are immortal, we don't need 1423 * Because the per-rcu_node kthreads are immortal, we don't need
1424 * to do anything to keep them alive. 1424 * to do anything to keep them alive.
1425 */ 1425 */
1426 static void invoke_rcu_node_kthread(struct rcu_node *rnp) 1426 static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1427 { 1427 {
1428 struct task_struct *t; 1428 struct task_struct *t;
1429 1429
1430 t = rnp->node_kthread_task; 1430 t = rnp->node_kthread_task;
1431 if (t != NULL) 1431 if (t != NULL)
1432 wake_up_process(t); 1432 wake_up_process(t);
1433 } 1433 }
1434 1434
1435 /* 1435 /*
1436 * Set the specified CPU's kthread to run RT or not, as specified by 1436 * Set the specified CPU's kthread to run RT or not, as specified by
1437 * the to_rt argument. The CPU-hotplug locks are held, so the task 1437 * the to_rt argument. The CPU-hotplug locks are held, so the task
1438 * is not going away. 1438 * is not going away.
1439 */ 1439 */
1440 static void rcu_cpu_kthread_setrt(int cpu, int to_rt) 1440 static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1441 { 1441 {
1442 int policy; 1442 int policy;
1443 struct sched_param sp; 1443 struct sched_param sp;
1444 struct task_struct *t; 1444 struct task_struct *t;
1445 1445
1446 t = per_cpu(rcu_cpu_kthread_task, cpu); 1446 t = per_cpu(rcu_cpu_kthread_task, cpu);
1447 if (t == NULL) 1447 if (t == NULL)
1448 return; 1448 return;
1449 if (to_rt) { 1449 if (to_rt) {
1450 policy = SCHED_FIFO; 1450 policy = SCHED_FIFO;
1451 sp.sched_priority = RCU_KTHREAD_PRIO; 1451 sp.sched_priority = RCU_KTHREAD_PRIO;
1452 } else { 1452 } else {
1453 policy = SCHED_NORMAL; 1453 policy = SCHED_NORMAL;
1454 sp.sched_priority = 0; 1454 sp.sched_priority = 0;
1455 } 1455 }
1456 sched_setscheduler_nocheck(t, policy, &sp); 1456 sched_setscheduler_nocheck(t, policy, &sp);
1457 } 1457 }
1458 1458
1459 /* 1459 /*
1460 * Timer handler to initiate the waking up of per-CPU kthreads that 1460 * Timer handler to initiate the waking up of per-CPU kthreads that
1461 * have yielded the CPU due to excess numbers of RCU callbacks. 1461 * have yielded the CPU due to excess numbers of RCU callbacks.
1462 * We wake up the per-rcu_node kthread, which in turn will wake up 1462 * We wake up the per-rcu_node kthread, which in turn will wake up
1463 * the booster kthread. 1463 * the booster kthread.
1464 */ 1464 */
1465 static void rcu_cpu_kthread_timer(unsigned long arg) 1465 static void rcu_cpu_kthread_timer(unsigned long arg)
1466 { 1466 {
1467 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); 1467 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1468 struct rcu_node *rnp = rdp->mynode; 1468 struct rcu_node *rnp = rdp->mynode;
1469 1469
1470 atomic_or(rdp->grpmask, &rnp->wakemask); 1470 atomic_or(rdp->grpmask, &rnp->wakemask);
1471 invoke_rcu_node_kthread(rnp); 1471 invoke_rcu_node_kthread(rnp);
1472 } 1472 }
1473 1473
1474 /* 1474 /*
1475 * Drop to non-real-time priority and yield, but only after posting a 1475 * Drop to non-real-time priority and yield, but only after posting a
1476 * timer that will cause us to regain our real-time priority if we 1476 * timer that will cause us to regain our real-time priority if we
1477 * remain preempted. Either way, we restore our real-time priority 1477 * remain preempted. Either way, we restore our real-time priority
1478 * before returning. 1478 * before returning.
1479 */ 1479 */
1480 static void rcu_yield(void (*f)(unsigned long), unsigned long arg) 1480 static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1481 { 1481 {
1482 struct sched_param sp; 1482 struct sched_param sp;
1483 struct timer_list yield_timer; 1483 struct timer_list yield_timer;
1484 int prio = current->rt_priority; 1484 int prio = current->rt_priority;
1485 1485
1486 setup_timer_on_stack(&yield_timer, f, arg); 1486 setup_timer_on_stack(&yield_timer, f, arg);
1487 mod_timer(&yield_timer, jiffies + 2); 1487 mod_timer(&yield_timer, jiffies + 2);
1488 sp.sched_priority = 0; 1488 sp.sched_priority = 0;
1489 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); 1489 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1490 set_user_nice(current, 19); 1490 set_user_nice(current, 19);
1491 schedule(); 1491 schedule();
1492 set_user_nice(current, 0); 1492 set_user_nice(current, 0);
1493 sp.sched_priority = prio; 1493 sp.sched_priority = prio;
1494 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); 1494 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1495 del_timer(&yield_timer); 1495 del_timer(&yield_timer);
1496 } 1496 }
1497 1497
1498 /* 1498 /*
1499 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. 1499 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1500 * This can happen while the corresponding CPU is either coming online 1500 * This can happen while the corresponding CPU is either coming online
1501 * or going offline. We cannot wait until the CPU is fully online 1501 * or going offline. We cannot wait until the CPU is fully online
1502 * before starting the kthread, because the various notifier functions 1502 * before starting the kthread, because the various notifier functions
1503 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until 1503 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1504 * the corresponding CPU is online. 1504 * the corresponding CPU is online.
1505 * 1505 *
1506 * Return 1 if the kthread needs to stop, 0 otherwise. 1506 * Return 1 if the kthread needs to stop, 0 otherwise.
1507 * 1507 *
1508 * Caller must disable bh. This function can momentarily enable it. 1508 * Caller must disable bh. This function can momentarily enable it.
1509 */ 1509 */
1510 static int rcu_cpu_kthread_should_stop(int cpu) 1510 static int rcu_cpu_kthread_should_stop(int cpu)
1511 { 1511 {
1512 while (cpu_is_offline(cpu) || 1512 while (cpu_is_offline(cpu) ||
1513 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) || 1513 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1514 smp_processor_id() != cpu) { 1514 smp_processor_id() != cpu) {
1515 if (kthread_should_stop()) 1515 if (kthread_should_stop())
1516 return 1; 1516 return 1;
1517 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; 1517 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1518 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); 1518 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1519 local_bh_enable(); 1519 local_bh_enable();
1520 schedule_timeout_uninterruptible(1); 1520 schedule_timeout_uninterruptible(1);
1521 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu))) 1521 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1522 set_cpus_allowed_ptr(current, cpumask_of(cpu)); 1522 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1523 local_bh_disable(); 1523 local_bh_disable();
1524 } 1524 }
1525 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; 1525 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1526 return 0; 1526 return 0;
1527 } 1527 }
1528 1528
1529 /* 1529 /*
1530 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the 1530 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1531 * RCU softirq used in flavors and configurations of RCU that do not 1531 * RCU softirq used in flavors and configurations of RCU that do not
1532 * support RCU priority boosting. 1532 * support RCU priority boosting.
1533 */ 1533 */
1534 static int rcu_cpu_kthread(void *arg) 1534 static int rcu_cpu_kthread(void *arg)
1535 { 1535 {
1536 int cpu = (int)(long)arg; 1536 int cpu = (int)(long)arg;
1537 unsigned long flags; 1537 unsigned long flags;
1538 int spincnt = 0; 1538 int spincnt = 0;
1539 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); 1539 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1540 char work; 1540 char work;
1541 char *workp = &per_cpu(rcu_cpu_has_work, cpu); 1541 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1542 1542
1543 trace_rcu_utilization("Start CPU kthread@init"); 1543 trace_rcu_utilization("Start CPU kthread@init");
1544 for (;;) { 1544 for (;;) {
1545 *statusp = RCU_KTHREAD_WAITING; 1545 *statusp = RCU_KTHREAD_WAITING;
1546 trace_rcu_utilization("End CPU kthread@rcu_wait"); 1546 trace_rcu_utilization("End CPU kthread@rcu_wait");
1547 rcu_wait(*workp != 0 || kthread_should_stop()); 1547 rcu_wait(*workp != 0 || kthread_should_stop());
1548 trace_rcu_utilization("Start CPU kthread@rcu_wait"); 1548 trace_rcu_utilization("Start CPU kthread@rcu_wait");
1549 local_bh_disable(); 1549 local_bh_disable();
1550 if (rcu_cpu_kthread_should_stop(cpu)) { 1550 if (rcu_cpu_kthread_should_stop(cpu)) {
1551 local_bh_enable(); 1551 local_bh_enable();
1552 break; 1552 break;
1553 } 1553 }
1554 *statusp = RCU_KTHREAD_RUNNING; 1554 *statusp = RCU_KTHREAD_RUNNING;
1555 per_cpu(rcu_cpu_kthread_loops, cpu)++; 1555 per_cpu(rcu_cpu_kthread_loops, cpu)++;
1556 local_irq_save(flags); 1556 local_irq_save(flags);
1557 work = *workp; 1557 work = *workp;
1558 *workp = 0; 1558 *workp = 0;
1559 local_irq_restore(flags); 1559 local_irq_restore(flags);
1560 if (work) 1560 if (work)
1561 rcu_kthread_do_work(); 1561 rcu_kthread_do_work();
1562 local_bh_enable(); 1562 local_bh_enable();
1563 if (*workp != 0) 1563 if (*workp != 0)
1564 spincnt++; 1564 spincnt++;
1565 else 1565 else
1566 spincnt = 0; 1566 spincnt = 0;
1567 if (spincnt > 10) { 1567 if (spincnt > 10) {
1568 *statusp = RCU_KTHREAD_YIELDING; 1568 *statusp = RCU_KTHREAD_YIELDING;
1569 trace_rcu_utilization("End CPU kthread@rcu_yield"); 1569 trace_rcu_utilization("End CPU kthread@rcu_yield");
1570 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); 1570 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1571 trace_rcu_utilization("Start CPU kthread@rcu_yield"); 1571 trace_rcu_utilization("Start CPU kthread@rcu_yield");
1572 spincnt = 0; 1572 spincnt = 0;
1573 } 1573 }
1574 } 1574 }
1575 *statusp = RCU_KTHREAD_STOPPED; 1575 *statusp = RCU_KTHREAD_STOPPED;
1576 trace_rcu_utilization("End CPU kthread@term"); 1576 trace_rcu_utilization("End CPU kthread@term");
1577 return 0; 1577 return 0;
1578 } 1578 }
1579 1579
1580 /* 1580 /*
1581 * Spawn a per-CPU kthread, setting up affinity and priority. 1581 * Spawn a per-CPU kthread, setting up affinity and priority.
1582 * Because the CPU hotplug lock is held, no other CPU will be attempting 1582 * Because the CPU hotplug lock is held, no other CPU will be attempting
1583 * to manipulate rcu_cpu_kthread_task. There might be another CPU 1583 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1584 * attempting to access it during boot, but the locking in kthread_bind() 1584 * attempting to access it during boot, but the locking in kthread_bind()
1585 * will enforce sufficient ordering. 1585 * will enforce sufficient ordering.
1586 * 1586 *
1587 * Please note that we cannot simply refuse to wake up the per-CPU 1587 * Please note that we cannot simply refuse to wake up the per-CPU
1588 * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, 1588 * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
1589 * which can result in softlockup complaints if the task ends up being 1589 * which can result in softlockup complaints if the task ends up being
1590 * idle for more than a couple of minutes. 1590 * idle for more than a couple of minutes.
1591 * 1591 *
1592 * However, please note also that we cannot bind the per-CPU kthread to its 1592 * However, please note also that we cannot bind the per-CPU kthread to its
1593 * CPU until that CPU is fully online. We also cannot wait until the 1593 * CPU until that CPU is fully online. We also cannot wait until the
1594 * CPU is fully online before we create its per-CPU kthread, as this would 1594 * CPU is fully online before we create its per-CPU kthread, as this would
1595 * deadlock the system when CPU notifiers tried waiting for grace 1595 * deadlock the system when CPU notifiers tried waiting for grace
1596 * periods. So we bind the per-CPU kthread to its CPU only if the CPU 1596 * periods. So we bind the per-CPU kthread to its CPU only if the CPU
1597 * is online. If its CPU is not yet fully online, then the code in 1597 * is online. If its CPU is not yet fully online, then the code in
1598 * rcu_cpu_kthread() will wait until it is fully online, and then do 1598 * rcu_cpu_kthread() will wait until it is fully online, and then do
1599 * the binding. 1599 * the binding.
1600 */ 1600 */
1601 static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) 1601 static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1602 { 1602 {
1603 struct sched_param sp; 1603 struct sched_param sp;
1604 struct task_struct *t; 1604 struct task_struct *t;
1605 1605
1606 if (!rcu_scheduler_fully_active || 1606 if (!rcu_scheduler_fully_active ||
1607 per_cpu(rcu_cpu_kthread_task, cpu) != NULL) 1607 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1608 return 0; 1608 return 0;
1609 t = kthread_create_on_node(rcu_cpu_kthread, 1609 t = kthread_create_on_node(rcu_cpu_kthread,
1610 (void *)(long)cpu, 1610 (void *)(long)cpu,
1611 cpu_to_node(cpu), 1611 cpu_to_node(cpu),
1612 "rcuc/%d", cpu); 1612 "rcuc/%d", cpu);
1613 if (IS_ERR(t)) 1613 if (IS_ERR(t))
1614 return PTR_ERR(t); 1614 return PTR_ERR(t);
1615 if (cpu_online(cpu)) 1615 if (cpu_online(cpu))
1616 kthread_bind(t, cpu); 1616 kthread_bind(t, cpu);
1617 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; 1617 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1618 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); 1618 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1619 sp.sched_priority = RCU_KTHREAD_PRIO; 1619 sp.sched_priority = RCU_KTHREAD_PRIO;
1620 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1620 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1621 per_cpu(rcu_cpu_kthread_task, cpu) = t; 1621 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1622 wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ 1622 wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1623 return 0; 1623 return 0;
1624 } 1624 }
1625 1625
1626 /* 1626 /*
1627 * Per-rcu_node kthread, which is in charge of waking up the per-CPU 1627 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1628 * kthreads when needed. We ignore requests to wake up kthreads 1628 * kthreads when needed. We ignore requests to wake up kthreads
1629 * for offline CPUs, which is OK because force_quiescent_state() 1629 * for offline CPUs, which is OK because force_quiescent_state()
1630 * takes care of this case. 1630 * takes care of this case.
1631 */ 1631 */
1632 static int rcu_node_kthread(void *arg) 1632 static int rcu_node_kthread(void *arg)
1633 { 1633 {
1634 int cpu; 1634 int cpu;
1635 unsigned long flags; 1635 unsigned long flags;
1636 unsigned long mask; 1636 unsigned long mask;
1637 struct rcu_node *rnp = (struct rcu_node *)arg; 1637 struct rcu_node *rnp = (struct rcu_node *)arg;
1638 struct sched_param sp; 1638 struct sched_param sp;
1639 struct task_struct *t; 1639 struct task_struct *t;
1640 1640
1641 for (;;) { 1641 for (;;) {
1642 rnp->node_kthread_status = RCU_KTHREAD_WAITING; 1642 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1643 rcu_wait(atomic_read(&rnp->wakemask) != 0); 1643 rcu_wait(atomic_read(&rnp->wakemask) != 0);
1644 rnp->node_kthread_status = RCU_KTHREAD_RUNNING; 1644 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1645 raw_spin_lock_irqsave(&rnp->lock, flags); 1645 raw_spin_lock_irqsave(&rnp->lock, flags);
1646 mask = atomic_xchg(&rnp->wakemask, 0); 1646 mask = atomic_xchg(&rnp->wakemask, 0);
1647 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ 1647 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1648 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { 1648 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1649 if ((mask & 0x1) == 0) 1649 if ((mask & 0x1) == 0)
1650 continue; 1650 continue;
1651 preempt_disable(); 1651 preempt_disable();
1652 t = per_cpu(rcu_cpu_kthread_task, cpu); 1652 t = per_cpu(rcu_cpu_kthread_task, cpu);
1653 if (!cpu_online(cpu) || t == NULL) { 1653 if (!cpu_online(cpu) || t == NULL) {
1654 preempt_enable(); 1654 preempt_enable();
1655 continue; 1655 continue;
1656 } 1656 }
1657 per_cpu(rcu_cpu_has_work, cpu) = 1; 1657 per_cpu(rcu_cpu_has_work, cpu) = 1;
1658 sp.sched_priority = RCU_KTHREAD_PRIO; 1658 sp.sched_priority = RCU_KTHREAD_PRIO;
1659 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1659 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1660 preempt_enable(); 1660 preempt_enable();
1661 } 1661 }
1662 } 1662 }
1663 /* NOTREACHED */ 1663 /* NOTREACHED */
1664 rnp->node_kthread_status = RCU_KTHREAD_STOPPED; 1664 rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1665 return 0; 1665 return 0;
1666 } 1666 }
1667 1667
1668 /* 1668 /*
1669 * Set the per-rcu_node kthread's affinity to cover all CPUs that are 1669 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1670 * served by the rcu_node in question. The CPU hotplug lock is still 1670 * served by the rcu_node in question. The CPU hotplug lock is still
1671 * held, so the value of rnp->qsmaskinit will be stable. 1671 * held, so the value of rnp->qsmaskinit will be stable.
1672 * 1672 *
1673 * We don't include outgoingcpu in the affinity set; use -1 if there is 1673 * We don't include outgoingcpu in the affinity set; use -1 if there is
1674 * no outgoing CPU. If there are no CPUs left in the affinity set, 1674 * no outgoing CPU. If there are no CPUs left in the affinity set,
1675 * this function allows the kthread to execute on any CPU. 1675 * this function allows the kthread to execute on any CPU.
1676 */ 1676 */
1677 static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1677 static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1678 { 1678 {
1679 cpumask_var_t cm; 1679 cpumask_var_t cm;
1680 int cpu; 1680 int cpu;
1681 unsigned long mask = rnp->qsmaskinit; 1681 unsigned long mask = rnp->qsmaskinit;
1682 1682
1683 if (rnp->node_kthread_task == NULL) 1683 if (rnp->node_kthread_task == NULL)
1684 return; 1684 return;
1685 if (!alloc_cpumask_var(&cm, GFP_KERNEL)) 1685 if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1686 return; 1686 return;
1687 cpumask_clear(cm); 1687 cpumask_clear(cm);
1688 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) 1688 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1689 if ((mask & 0x1) && cpu != outgoingcpu) 1689 if ((mask & 0x1) && cpu != outgoingcpu)
1690 cpumask_set_cpu(cpu, cm); 1690 cpumask_set_cpu(cpu, cm);
1691 if (cpumask_weight(cm) == 0) { 1691 if (cpumask_weight(cm) == 0) {
1692 cpumask_setall(cm); 1692 cpumask_setall(cm);
1693 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) 1693 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1694 cpumask_clear_cpu(cpu, cm); 1694 cpumask_clear_cpu(cpu, cm);
1695 WARN_ON_ONCE(cpumask_weight(cm) == 0); 1695 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1696 } 1696 }
1697 set_cpus_allowed_ptr(rnp->node_kthread_task, cm); 1697 set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1698 rcu_boost_kthread_setaffinity(rnp, cm); 1698 rcu_boost_kthread_setaffinity(rnp, cm);
1699 free_cpumask_var(cm); 1699 free_cpumask_var(cm);
1700 } 1700 }
1701 1701
1702 /* 1702 /*
1703 * Spawn a per-rcu_node kthread, setting priority and affinity. 1703 * Spawn a per-rcu_node kthread, setting priority and affinity.
1704 * Called during boot before online/offline can happen, or, if 1704 * Called during boot before online/offline can happen, or, if
1705 * during runtime, with the main CPU-hotplug locks held. So only 1705 * during runtime, with the main CPU-hotplug locks held. So only
1706 * one of these can be executing at a time. 1706 * one of these can be executing at a time.
1707 */ 1707 */
1708 static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, 1708 static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1709 struct rcu_node *rnp) 1709 struct rcu_node *rnp)
1710 { 1710 {
1711 unsigned long flags; 1711 unsigned long flags;
1712 int rnp_index = rnp - &rsp->node[0]; 1712 int rnp_index = rnp - &rsp->node[0];
1713 struct sched_param sp; 1713 struct sched_param sp;
1714 struct task_struct *t; 1714 struct task_struct *t;
1715 1715
1716 if (!rcu_scheduler_fully_active || 1716 if (!rcu_scheduler_fully_active ||
1717 rnp->qsmaskinit == 0) 1717 rnp->qsmaskinit == 0)
1718 return 0; 1718 return 0;
1719 if (rnp->node_kthread_task == NULL) { 1719 if (rnp->node_kthread_task == NULL) {
1720 t = kthread_create(rcu_node_kthread, (void *)rnp, 1720 t = kthread_create(rcu_node_kthread, (void *)rnp,
1721 "rcun/%d", rnp_index); 1721 "rcun/%d", rnp_index);
1722 if (IS_ERR(t)) 1722 if (IS_ERR(t))
1723 return PTR_ERR(t); 1723 return PTR_ERR(t);
1724 raw_spin_lock_irqsave(&rnp->lock, flags); 1724 raw_spin_lock_irqsave(&rnp->lock, flags);
1725 rnp->node_kthread_task = t; 1725 rnp->node_kthread_task = t;
1726 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1726 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1727 sp.sched_priority = 99; 1727 sp.sched_priority = 99;
1728 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1728 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1729 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1729 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1730 } 1730 }
1731 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); 1731 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1732 } 1732 }
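The spawn path above creates the node kthread stopped, publishes it under rnp->lock, raises it to SCHED_FIFO priority 99, and only then wakes it so that it parks at real-time priority. A rough user-space analogue of that create/boost/wake ordering using POSIX threads is sketched below (build with -pthread); the semaphore merely models wake_up_process(), and the priority change needs root or CAP_SYS_NICE to succeed.

/* User-space analogue of the create/boost/wake pattern above; names and
 * the priority value mirror the kernel code but are otherwise arbitrary. */
#include <pthread.h>
#include <sched.h>
#include <semaphore.h>
#include <stdio.h>

static sem_t go;			/* stands in for wake_up_process() */

static void *node_thread(void *arg)
{
	sem_wait(&go);			/* park until "woken", like the kthread */
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct sched_param sp = { .sched_priority = 99 };

	sem_init(&go, 0, 0);
	if (pthread_create(&t, NULL, node_thread, NULL) != 0)
		return 1;
	/* Raise the priority before letting the thread do real work. */
	if (pthread_setschedparam(t, SCHED_FIFO, &sp) != 0)
		fprintf(stderr, "SCHED_FIFO needs privileges; continuing\n");
	sem_post(&go);			/* the "wake_up_process()" step */
	pthread_join(t, NULL);
	return 0;
}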
1733 1733
1734 /* 1734 /*
1735 * Spawn all kthreads -- called as soon as the scheduler is running. 1735 * Spawn all kthreads -- called as soon as the scheduler is running.
1736 */ 1736 */
1737 static int __init rcu_spawn_kthreads(void) 1737 static int __init rcu_spawn_kthreads(void)
1738 { 1738 {
1739 int cpu; 1739 int cpu;
1740 struct rcu_node *rnp; 1740 struct rcu_node *rnp;
1741 1741
1742 rcu_scheduler_fully_active = 1; 1742 rcu_scheduler_fully_active = 1;
1743 for_each_possible_cpu(cpu) { 1743 for_each_possible_cpu(cpu) {
1744 per_cpu(rcu_cpu_has_work, cpu) = 0; 1744 per_cpu(rcu_cpu_has_work, cpu) = 0;
1745 if (cpu_online(cpu)) 1745 if (cpu_online(cpu))
1746 (void)rcu_spawn_one_cpu_kthread(cpu); 1746 (void)rcu_spawn_one_cpu_kthread(cpu);
1747 } 1747 }
1748 rnp = rcu_get_root(rcu_state); 1748 rnp = rcu_get_root(rcu_state);
1749 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1749 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1750 if (NUM_RCU_NODES > 1) { 1750 if (NUM_RCU_NODES > 1) {
1751 rcu_for_each_leaf_node(rcu_state, rnp) 1751 rcu_for_each_leaf_node(rcu_state, rnp)
1752 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1752 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1753 } 1753 }
1754 return 0; 1754 return 0;
1755 } 1755 }
1756 early_initcall(rcu_spawn_kthreads); 1756 early_initcall(rcu_spawn_kthreads);
1757 1757
1758 static void __cpuinit rcu_prepare_kthreads(int cpu) 1758 static void __cpuinit rcu_prepare_kthreads(int cpu)
1759 { 1759 {
1760 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1760 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1761 struct rcu_node *rnp = rdp->mynode; 1761 struct rcu_node *rnp = rdp->mynode;
1762 1762
1763 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ 1763 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1764 if (rcu_scheduler_fully_active) { 1764 if (rcu_scheduler_fully_active) {
1765 (void)rcu_spawn_one_cpu_kthread(cpu); 1765 (void)rcu_spawn_one_cpu_kthread(cpu);
1766 if (rnp->node_kthread_task == NULL) 1766 if (rnp->node_kthread_task == NULL)
1767 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1767 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1768 } 1768 }
1769 } 1769 }
1770 1770
1771 #else /* #ifdef CONFIG_RCU_BOOST */ 1771 #else /* #ifdef CONFIG_RCU_BOOST */
1772 1772
1773 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1773 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1774 { 1774 {
1775 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1775 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1776 } 1776 }
1777 1777
1778 static void invoke_rcu_callbacks_kthread(void) 1778 static void invoke_rcu_callbacks_kthread(void)
1779 { 1779 {
1780 WARN_ON_ONCE(1); 1780 WARN_ON_ONCE(1);
1781 } 1781 }
1782 1782
1783 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1783 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1784 { 1784 {
1785 } 1785 }
1786 1786
1787 #ifdef CONFIG_HOTPLUG_CPU 1787 #ifdef CONFIG_HOTPLUG_CPU
1788 1788
1789 static void rcu_stop_cpu_kthread(int cpu) 1789 static void rcu_stop_cpu_kthread(int cpu)
1790 { 1790 {
1791 } 1791 }
1792 1792
1793 #endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1793 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
1794 1794
1795 static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1795 static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1796 { 1796 {
1797 } 1797 }
1798 1798
1799 static void rcu_cpu_kthread_setrt(int cpu, int to_rt) 1799 static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1800 { 1800 {
1801 } 1801 }
1802 1802
1803 static int __init rcu_scheduler_really_started(void) 1803 static int __init rcu_scheduler_really_started(void)
1804 { 1804 {
1805 rcu_scheduler_fully_active = 1; 1805 rcu_scheduler_fully_active = 1;
1806 return 0; 1806 return 0;
1807 } 1807 }
1808 early_initcall(rcu_scheduler_really_started); 1808 early_initcall(rcu_scheduler_really_started);
1809 1809
1810 static void __cpuinit rcu_prepare_kthreads(int cpu) 1810 static void __cpuinit rcu_prepare_kthreads(int cpu)
1811 { 1811 {
1812 } 1812 }
1813 1813
1814 #endif /* #else #ifdef CONFIG_RCU_BOOST */ 1814 #endif /* #else #ifdef CONFIG_RCU_BOOST */
1815 1815
1816 #ifndef CONFIG_SMP 1816 #ifndef CONFIG_SMP
1817 1817
1818 void synchronize_sched_expedited(void) 1818 void synchronize_sched_expedited(void)
1819 { 1819 {
1820 cond_resched(); 1820 cond_resched();
1821 } 1821 }
1822 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 1822 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1823 1823
1824 #else /* #ifndef CONFIG_SMP */ 1824 #else /* #ifndef CONFIG_SMP */
1825 1825
1826 static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); 1826 static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1827 static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); 1827 static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1828 1828
1829 static int synchronize_sched_expedited_cpu_stop(void *data) 1829 static int synchronize_sched_expedited_cpu_stop(void *data)
1830 { 1830 {
1831 /* 1831 /*
1832 * There must be a full memory barrier on each affected CPU 1832 * There must be a full memory barrier on each affected CPU
1833 * between the time that try_stop_cpus() is called and the 1833 * between the time that try_stop_cpus() is called and the
1834 * time that it returns. 1834 * time that it returns.
1835 * 1835 *
1836 * In the current initial implementation of cpu_stop, the 1836 * In the current initial implementation of cpu_stop, the
1837 * above condition is already met when the control reaches 1837 * above condition is already met when the control reaches
1838 * this point and the following smp_mb() is not strictly 1838 * this point and the following smp_mb() is not strictly
1839 * necessary. Do smp_mb() anyway for documentation and 1839 * necessary. Do smp_mb() anyway for documentation and
1840 * robustness against future implementation changes. 1840 * robustness against future implementation changes.
1841 */ 1841 */
1842 smp_mb(); /* See above comment block. */ 1842 smp_mb(); /* See above comment block. */
1843 return 0; 1843 return 0;
1844 } 1844 }
1845 1845
1846 /* 1846 /*
1847 * Wait for an rcu-sched grace period to elapse, but use a "big hammer" 1847 * Wait for an rcu-sched grace period to elapse, but use a "big hammer"
1848 * approach to force the grace period to end quickly. This consumes 1848 * approach to force the grace period to end quickly. This consumes
1849 * significant time on all CPUs, and is thus not recommended for 1849 * significant time on all CPUs, and is thus not recommended for
1850 * any sort of common-case code. 1850 * any sort of common-case code.
1851 * 1851 *
1852 * Note that it is illegal to call this function while holding any 1852 * Note that it is illegal to call this function while holding any
1853 * lock that is acquired by a CPU-hotplug notifier. Failing to 1853 * lock that is acquired by a CPU-hotplug notifier. Failing to
1854 * observe this restriction will result in deadlock. 1854 * observe this restriction will result in deadlock.
1855 * 1855 *
1856 * This implementation can be thought of as an application of ticket 1856 * This implementation can be thought of as an application of ticket
1857 * locking to RCU, with sync_sched_expedited_started and 1857 * locking to RCU, with sync_sched_expedited_started and
1858 * sync_sched_expedited_done taking on the roles of the halves 1858 * sync_sched_expedited_done taking on the roles of the halves
1859 * of the ticket-lock word. Each task atomically increments 1859 * of the ticket-lock word. Each task atomically increments
1860 * sync_sched_expedited_started upon entry, snapshotting the old value, 1860 * sync_sched_expedited_started upon entry, snapshotting the old value,
1861 * then attempts to stop all the CPUs. If this succeeds, then each 1861 * then attempts to stop all the CPUs. If this succeeds, then each
1862 * CPU will have executed a context switch, resulting in an RCU-sched 1862 * CPU will have executed a context switch, resulting in an RCU-sched
1863 * grace period. We are then done, so we use atomic_cmpxchg() to 1863 * grace period. We are then done, so we use atomic_cmpxchg() to
1864 * update sync_sched_expedited_done to match our snapshot -- but 1864 * update sync_sched_expedited_done to match our snapshot -- but
1865 * only if someone else has not already advanced past our snapshot. 1865 * only if someone else has not already advanced past our snapshot.
1866 * 1866 *
1867 * On the other hand, if try_stop_cpus() fails, we check the value 1867 * On the other hand, if try_stop_cpus() fails, we check the value
1868 * of sync_sched_expedited_done. If it has advanced past our 1868 * of sync_sched_expedited_done. If it has advanced past our
1869 * initial snapshot, then someone else must have forced a grace period 1869 * initial snapshot, then someone else must have forced a grace period
1870 * some time after we took our snapshot. In this case, our work is 1870 * some time after we took our snapshot. In this case, our work is
1871 * done for us, and we can simply return. Otherwise, we try again, 1871 * done for us, and we can simply return. Otherwise, we try again,
1872 * but keep our initial snapshot for purposes of checking for someone 1872 * but keep our initial snapshot for purposes of checking for someone
1873 * doing our work for us. 1873 * doing our work for us.
1874 * 1874 *
1875 * If we fail too many times in a row, we fall back to synchronize_sched(). 1875 * If we fail too many times in a row, we fall back to synchronize_sched().
1876 */ 1876 */
1877 void synchronize_sched_expedited(void) 1877 void synchronize_sched_expedited(void)
1878 { 1878 {
1879 int firstsnap, s, snap, trycount = 0; 1879 int firstsnap, s, snap, trycount = 0;
1880 1880
1881 /* Note that atomic_inc_return() implies a full memory barrier. */ 1881 /* Note that atomic_inc_return() implies a full memory barrier. */
1882 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); 1882 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1883 get_online_cpus(); 1883 get_online_cpus();
1884 1884
1885 /* 1885 /*
1886 * Each pass through the following loop attempts to force a 1886 * Each pass through the following loop attempts to force a
1887 * context switch on each CPU. 1887 * context switch on each CPU.
1888 */ 1888 */
1889 while (try_stop_cpus(cpu_online_mask, 1889 while (try_stop_cpus(cpu_online_mask,
1890 synchronize_sched_expedited_cpu_stop, 1890 synchronize_sched_expedited_cpu_stop,
1891 NULL) == -EAGAIN) { 1891 NULL) == -EAGAIN) {
1892 put_online_cpus(); 1892 put_online_cpus();
1893 1893
1894 /* No joy, try again later. Or just synchronize_sched(). */ 1894 /* No joy, try again later. Or just synchronize_sched(). */
1895 if (trycount++ < 10) 1895 if (trycount++ < 10)
1896 udelay(trycount * num_online_cpus()); 1896 udelay(trycount * num_online_cpus());
1897 else { 1897 else {
1898 synchronize_sched(); 1898 synchronize_sched();
1899 return; 1899 return;
1900 } 1900 }
1901 1901
1902 /* Check to see if someone else did our work for us. */ 1902 /* Check to see if someone else did our work for us. */
1903 s = atomic_read(&sync_sched_expedited_done); 1903 s = atomic_read(&sync_sched_expedited_done);
1904 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { 1904 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1905 smp_mb(); /* ensure test happens before caller kfree */ 1905 smp_mb(); /* ensure test happens before caller kfree */
1906 return; 1906 return;
1907 } 1907 }
1908 1908
1909 /* 1909 /*
1910 * Refetching sync_sched_expedited_started allows later 1910 * Refetching sync_sched_expedited_started allows later
1911 * callers to piggyback on our grace period. We subtract 1911 * callers to piggyback on our grace period. We subtract
1912 * 1 to get the same token that the last incrementer got. 1912 * 1 to get the same token that the last incrementer got.
1913 * We retry after they started, so our grace period works 1913 * We retry after they started, so our grace period works
1914 * for them, and they started after our first try, so their 1914 * for them, and they started after our first try, so their
1915 * grace period works for us. 1915 * grace period works for us.
1916 */ 1916 */
1917 get_online_cpus(); 1917 get_online_cpus();
1918 snap = atomic_read(&sync_sched_expedited_started); 1918 snap = atomic_read(&sync_sched_expedited_started);
1919 smp_mb(); /* ensure read is before try_stop_cpus(). */ 1919 smp_mb(); /* ensure read is before try_stop_cpus(). */
1920 } 1920 }
1921 1921
1922 /* 1922 /*
1923 * Everyone up to our most recent fetch is covered by our grace 1923 * Everyone up to our most recent fetch is covered by our grace
1924 * period. Update the counter, but only if our work is still 1924 * period. Update the counter, but only if our work is still
1925 * relevant -- which it won't be if someone who started later 1925 * relevant -- which it won't be if someone who started later
1926 * than we did beat us to the punch. 1926 * than we did beat us to the punch.
1927 */ 1927 */
1928 do { 1928 do {
1929 s = atomic_read(&sync_sched_expedited_done); 1929 s = atomic_read(&sync_sched_expedited_done);
1930 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { 1930 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1931 smp_mb(); /* ensure test happens before caller kfree */ 1931 smp_mb(); /* ensure test happens before caller kfree */
1932 break; 1932 break;
1933 } 1933 }
1934 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); 1934 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1935 1935
1936 put_online_cpus(); 1936 put_online_cpus();
1937 } 1937 }
1938 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 1938 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
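The comment block before synchronize_sched_expedited() describes the started/done counters as the two halves of a ticket-lock word. The stand-alone sketch below models just that bookkeeping with C11 atomics in user space: force_grace_period() stands in for try_stop_cpus() and always succeeds, and the retry/udelay/synchronize_sched() fallback is omitted, so only the snapshot and piggyback logic remains. All names here are illustrative, not the kernel's.

/* User-space model of the started/done "ticket" logic above. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint started = 0;
static atomic_uint done = 0;

/* Wraparound-safe "a is at or past b", in the spirit of UINT_CMP_GE(). */
static bool at_or_past(unsigned int a, unsigned int b)
{
	return (int)(a - b) >= 0;
}

static bool force_grace_period(void)
{
	return true;			/* pretend try_stop_cpus() succeeded */
}

static void expedited(void)
{
	unsigned int firstsnap, snap, s;

	/* Take a ticket; the increment's return value is our snapshot. */
	firstsnap = snap = atomic_fetch_add(&started, 1) + 1;
	while (!force_grace_period()) {
		s = atomic_load(&done);
		if (at_or_past(s, firstsnap))
			return;		/* someone else did our work for us */
		snap = atomic_load(&started);	/* let later callers piggyback */
	}
	/* Advance done to our snapshot unless a later caller already did. */
	do {
		s = atomic_load(&done);
		if (at_or_past(s, snap))
			break;
	} while (!atomic_compare_exchange_strong(&done, &s, snap));
}

int main(void)
{
	expedited();
	printf("started=%u done=%u\n", atomic_load(&started),
	       atomic_load(&done));	/* expect started=1 done=1 */
	return 0;
}

The wraparound-safe comparison mirrors the intent of UINT_CMP_GE(): a caller whose snapshot has already been passed by sync_sched_expedited_done knows that some other caller's grace period covered it and can return immediately.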
1939 1939
1940 #endif /* #else #ifndef CONFIG_SMP */ 1940 #endif /* #else #ifndef CONFIG_SMP */
1941 1941
1942 #if !defined(CONFIG_RCU_FAST_NO_HZ) 1942 #if !defined(CONFIG_RCU_FAST_NO_HZ)
1943 1943
1944 /* 1944 /*
1945 * Check to see if any future RCU-related work will need to be done 1945 * Check to see if any future RCU-related work will need to be done
1946 * by the current CPU, even if none need be done immediately, returning 1946 * by the current CPU, even if none need be done immediately, returning
1947 * 1 if so. This function is part of the RCU implementation; it is -not- 1947 * 1 if so. This function is part of the RCU implementation; it is -not-
1948 * an exported member of the RCU API. 1948 * an exported member of the RCU API.
1949 * 1949 *
1950 * Because we have preemptible RCU, just check whether this CPU needs 1950 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1951 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption 1951 * any flavor of RCU.
1952 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1953 */ 1952 */
1954 int rcu_needs_cpu(int cpu) 1953 int rcu_needs_cpu(int cpu)
1955 { 1954 {
1956 return rcu_cpu_has_callbacks(cpu); 1955 return rcu_cpu_has_callbacks(cpu);
1957 } 1956 }
1958 1957
1959 /* 1958 /*
1959 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1960 */
1961 static void rcu_prepare_for_idle_init(int cpu)
1962 {
1963 }
1964
1965 /*
1966 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1967 * after it.
1968 */
1969 static void rcu_cleanup_after_idle(int cpu)
1970 {
1971 }
1972
1973 /*
1960 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, 1974 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1961 * is nothing. 1975 * is nothing.
1962 */ 1976 */
1963 static void rcu_prepare_for_idle(int cpu) 1977 static void rcu_prepare_for_idle(int cpu)
1964 { 1978 {
1965 } 1979 }
1966 1980
1967 #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1981 #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1968 1982
1969 #define RCU_NEEDS_CPU_FLUSHES 5 1983 #define RCU_NEEDS_CPU_FLUSHES 5 /* Allow for callback self-repost. */
1984 #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1970 static DEFINE_PER_CPU(int, rcu_dyntick_drain); 1985 static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1971 static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 1986 static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1987 static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
1988 static ktime_t rcu_idle_gp_wait;
1972 1989
1973 /* 1990 /*
1974 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1991 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1975 * callbacks on this CPU, (2) this CPU has not yet attempted to enter 1992 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1976 * dyntick-idle mode, or (3) this CPU is in the process of attempting to 1993 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1977 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed 1994 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
1978 * to enter dyntick-idle mode, we refuse to try to enter it. After all, 1995 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1979 * it is better to incur scheduling-clock interrupts than to spin 1996 * it is better to incur scheduling-clock interrupts than to spin
1980 * continuously for the same time duration! 1997 * continuously for the same time duration!
1981 */ 1998 */
1982 int rcu_needs_cpu(int cpu) 1999 int rcu_needs_cpu(int cpu)
1983 { 2000 {
1984 /* If no callbacks, RCU doesn't need the CPU. */ 2001 /* If no callbacks, RCU doesn't need the CPU. */
1985 if (!rcu_cpu_has_callbacks(cpu)) 2002 if (!rcu_cpu_has_callbacks(cpu))
1986 return 0; 2003 return 0;
1987 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ 2004 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
1988 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; 2005 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
1989 } 2006 }
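In words: with RCU_FAST_NO_HZ, rcu_needs_cpu() keeps the scheduling-clock tick only when callbacks are pending and this CPU has already given up on dyntick-idle during the current jiffy (rcu_dyntick_holdoff == jiffies). A tiny stand-alone model of that decision follows; the names and jiffy values are illustrative only.

/* Model of the rcu_needs_cpu() decision above, with made-up inputs. */
#include <stdbool.h>
#include <stdio.h>

static bool needs_cpu(bool has_callbacks, unsigned long holdoff,
		      unsigned long now_jiffies)
{
	if (!has_callbacks)
		return false;			/* nothing pending at all */
	return holdoff == now_jiffies;		/* recently tried and failed */
}

int main(void)
{
	printf("%d\n", needs_cpu(true, 1000, 1000));	/* 1: keep the tick */
	printf("%d\n", needs_cpu(true,  999, 1000));	/* 0: may go dyntick-idle */
	return 0;
}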
1990 2007
1991 /* 2008 /*
2009 * Timer handler used to force the CPU to start pushing its remaining RCU
2010 * callbacks in the case where it entered dyntick-idle mode with callbacks
2011 * pending. The handler doesn't really need to do anything because the
2012 * real work is done upon re-entry to idle, or by the next scheduling-clock
2013 * interrupt should idle not be re-entered.
2014 */
2015 static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2016 {
2017 trace_rcu_prep_idle("Timer");
2018 return HRTIMER_NORESTART;
2019 }
2020
2021 /*
2022 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
2023 */
2024 static void rcu_prepare_for_idle_init(int cpu)
2025 {
2026 static int firsttime = 1;
2027 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
2028
2029 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2030 hrtp->function = rcu_idle_gp_timer_func;
2031 if (firsttime) {
2032 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2033
2034 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2035 firsttime = 0;
2036 }
2037 }
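rcu_idle_gp_wait is computed once: RCU_IDLE_GP_DELAY jiffies are converted to microseconds and then to a nanosecond-based ktime. A worked version of that arithmetic is sketched below, assuming HZ=250 purely for illustration; the kernel uses its configured HZ via jiffies_to_usecs().

/* Worked version of the one-time rcu_idle_gp_wait computation above,
 * with an assumed HZ value. */
#include <stdio.h>

#define HZ			250
#define RCU_IDLE_GP_DELAY	6	/* jiffies, roughly one grace period */

int main(void)
{
	unsigned int upj = RCU_IDLE_GP_DELAY * (1000000 / HZ);	/* microseconds */
	unsigned long long wait_ns = upj * 1000ULL;		/* ktime is in ns */

	printf("%d jiffies -> %u us -> %llu ns\n",
	       RCU_IDLE_GP_DELAY, upj, wait_ns);
	/* HZ=250: 6 jiffies -> 24000 us -> 24000000 ns */
	return 0;
}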
2038
2039 /*
2040 * Clean up for exit from idle. Because we are exiting from idle, there
2041 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will
2042 * do nothing if this timer is not active, so just cancel it unconditionally.
2043 */
2044 static void rcu_cleanup_after_idle(int cpu)
2045 {
2046 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
2047 }
2048
2049 /*
1992 * Check to see if any RCU-related work can be done by the current CPU, 2050 * Check to see if any RCU-related work can be done by the current CPU,
1993 * and if so, schedule a softirq to get it done. This function is part 2051 * and if so, schedule a softirq to get it done. This function is part
1994 * of the RCU implementation; it is -not- an exported member of the RCU API. 2052 * of the RCU implementation; it is -not- an exported member of the RCU API.
1995 * 2053 *
1996 * The idea is for the current CPU to clear out all work required by the 2054 * The idea is for the current CPU to clear out all work required by the
1997 * RCU core for the current grace period, so that this CPU can be permitted 2055 * RCU core for the current grace period, so that this CPU can be permitted
1998 * to enter dyntick-idle mode. In some cases, it will need to be awakened 2056 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1999 * at the end of the grace period by whatever CPU ends the grace period. 2057 * at the end of the grace period by whatever CPU ends the grace period.
2000 * This allows CPUs to go dyntick-idle more quickly, and to reduce the 2058 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
2001 * number of wakeups by a modest integer factor. 2059 * number of wakeups by a modest integer factor.
2002 * 2060 *
2003 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2061 * Because it is not legal to invoke rcu_process_callbacks() with irqs
2004 * disabled, we do one pass of force_quiescent_state(), then do an 2062 * disabled, we do one pass of force_quiescent_state(), then do an
2005 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2063 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
2006 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2064 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
2007 * 2065 *
2008 * The caller must have disabled interrupts. 2066 * The caller must have disabled interrupts.
2009 */ 2067 */
2010 static void rcu_prepare_for_idle(int cpu) 2068 static void rcu_prepare_for_idle(int cpu)
2011 { 2069 {
2012 unsigned long flags; 2070 unsigned long flags;
2013 2071
2014 local_irq_save(flags); 2072 local_irq_save(flags);
2015 2073
2016 /* 2074 /*
2017 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2075 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2018 * Also reset state to avoid prejudicing later attempts. 2076 * Also reset state to avoid prejudicing later attempts.
2019 */ 2077 */
2020 if (!rcu_cpu_has_callbacks(cpu)) { 2078 if (!rcu_cpu_has_callbacks(cpu)) {
2021 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2079 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2022 per_cpu(rcu_dyntick_drain, cpu) = 0; 2080 per_cpu(rcu_dyntick_drain, cpu) = 0;
2023 local_irq_restore(flags); 2081 local_irq_restore(flags);
2024 trace_rcu_prep_idle("No callbacks"); 2082 trace_rcu_prep_idle("No callbacks");
2025 return; 2083 return;
2026 } 2084 }
2027 2085
2028 /* 2086 /*
2029 * If in holdoff mode, just return. We will presumably have 2087 * If in holdoff mode, just return. We will presumably have
2030 * refrained from disabling the scheduling-clock tick. 2088 * refrained from disabling the scheduling-clock tick.
2031 */ 2089 */
2032 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { 2090 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
2033 local_irq_restore(flags); 2091 local_irq_restore(flags);
2034 trace_rcu_prep_idle("In holdoff"); 2092 trace_rcu_prep_idle("In holdoff");
2035 return; 2093 return;
2036 } 2094 }
2037 2095
2038 /* Check and update the rcu_dyntick_drain sequencing. */ 2096 /* Check and update the rcu_dyntick_drain sequencing. */
2039 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2097 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2040 /* First time through, initialize the counter. */ 2098 /* First time through, initialize the counter. */
2041 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; 2099 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
2042 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2100 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2101 /* Can we go dyntick-idle despite still having callbacks? */
2102 if (!rcu_pending(cpu)) {
2103 trace_rcu_prep_idle("Dyntick with callbacks");
2104 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2105 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2106 rcu_idle_gp_wait, HRTIMER_MODE_REL);
2107 return; /* Nothing more to do immediately. */
2108 }
2109
2043 /* We have hit the limit, so time to give up. */ 2110 /* We have hit the limit, so time to give up. */
2044 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2111 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2045 local_irq_restore(flags); 2112 local_irq_restore(flags);
2046 trace_rcu_prep_idle("Begin holdoff"); 2113 trace_rcu_prep_idle("Begin holdoff");
2047 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ 2114 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2048 return; 2115 return;
2049 } 2116 }
2050 2117
2051 /* 2118 /*
2052 * Do one step of pushing the remaining RCU callbacks through 2119 * Do one step of pushing the remaining RCU callbacks through
2053 * the RCU core state machine. 2120 * the RCU core state machine.
2054 */ 2121 */
2055 #ifdef CONFIG_TREE_PREEMPT_RCU 2122 #ifdef CONFIG_TREE_PREEMPT_RCU
2056 if (per_cpu(rcu_preempt_data, cpu).nxtlist) { 2123 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2057 local_irq_restore(flags); 2124 local_irq_restore(flags);
2058 rcu_preempt_qs(cpu); 2125 rcu_preempt_qs(cpu);
2059 force_quiescent_state(&rcu_preempt_state, 0); 2126 force_quiescent_state(&rcu_preempt_state, 0);
2060 local_irq_save(flags); 2127 local_irq_save(flags);
2061 } 2128 }
2062 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 2129 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2063 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 2130 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2064 local_irq_restore(flags); 2131 local_irq_restore(flags);
2065 rcu_sched_qs(cpu); 2132 rcu_sched_qs(cpu);
2066 force_quiescent_state(&rcu_sched_state, 0); 2133 force_quiescent_state(&rcu_sched_state, 0);
2067 local_irq_save(flags); 2134 local_irq_save(flags);
2068 } 2135 }
2069 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 2136 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2070 local_irq_restore(flags); 2137 local_irq_restore(flags);
2071 rcu_bh_qs(cpu); 2138 rcu_bh_qs(cpu);
2072 force_quiescent_state(&rcu_bh_state, 0); 2139 force_quiescent_state(&rcu_bh_state, 0);
2073 local_irq_save(flags); 2140 local_irq_save(flags);
2074 } 2141 }
2075 2142
2076 /* 2143 /*
2077 * If RCU callbacks are still pending, RCU still needs this CPU. 2144 * If RCU callbacks are still pending, RCU still needs this CPU.
2078 * So try forcing the callbacks through the grace period. 2145 * So try forcing the callbacks through the grace period.
2079 */ 2146 */
2080 if (rcu_cpu_has_callbacks(cpu)) { 2147 if (rcu_cpu_has_callbacks(cpu)) {
2081 local_irq_restore(flags); 2148 local_irq_restore(flags);
2082 trace_rcu_prep_idle("More callbacks"); 2149 trace_rcu_prep_idle("More callbacks");
2083 invoke_rcu_core(); 2150 invoke_rcu_core();
2084 } else { 2151 } else {
2085 local_irq_restore(flags); 2152 local_irq_restore(flags);
2086 trace_rcu_prep_idle("Callbacks drained"); 2153 trace_rcu_prep_idle("Callbacks drained");
2087 } 2154 }
2088 } 2155 }
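Putting the pieces together, each idle-entry pass through rcu_prepare_for_idle() either resets state (no callbacks), bails out (holdoff), initializes or decrements rcu_dyntick_drain and pushes callbacks one more step, or, once the drain counter expires, either arms the idle timer and permits dyntick-idle with callbacks still queued (rcu_pending() == 0) or begins holdoff. The stand-alone model below reproduces only that sequencing; the quiescent-state pushing, hrtimer, and tracing are reduced to returned strings, and all names are illustrative.

/* Simplified user-space model of the decision sequence in
 * rcu_prepare_for_idle() above. */
#include <stdbool.h>
#include <stdio.h>

#define FLUSHES		5	/* plays the role of RCU_NEEDS_CPU_FLUSHES */

struct cpu_model {
	int drain;		/* rcu_dyntick_drain */
	unsigned long holdoff;	/* rcu_dyntick_holdoff */
	bool has_callbacks;	/* rcu_cpu_has_callbacks() */
	bool gp_needs_us;	/* rcu_pending() */
};

static const char *prepare_for_idle(struct cpu_model *c, unsigned long now)
{
	if (!c->has_callbacks) {
		c->holdoff = now - 1;		/* reset for later attempts */
		c->drain = 0;
		return "No callbacks";
	}
	if (c->holdoff == now)
		return "In holdoff";
	if (c->drain <= 0) {
		c->drain = FLUSHES;		/* first attempt this jiffy */
	} else if (--c->drain <= 0) {
		if (!c->gp_needs_us) {
			c->holdoff = now - 1;
			return "Dyntick with callbacks (timer armed)";
		}
		c->holdoff = now;		/* give up until the next jiffy */
		return "Begin holdoff";
	}
	return "Push callbacks through one more step";
}

int main(void)
{
	struct cpu_model c = { .has_callbacks = true, .gp_needs_us = false };
	int i;

	for (i = 0; i < FLUSHES + 1; i++)
		printf("pass %d: %s\n", i, prepare_for_idle(&c, 1000));
	return 0;
}

In this model, with gp_needs_us false the sixth pass (one initialization plus FLUSHES decrements) takes the "Dyntick with callbacks" path, which is the new behavior this commit introduces; with gp_needs_us true it would instead print "Begin holdoff", matching the pre-existing fallback.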
2089 2156
2090 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2157 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */