Commit 6d87669357936bffa1e8fea7a4e7743e76905736

Authored by Paul E. McKenney

Merge branches 'doc.2013.03.12a', 'fixes.2013.03.13a' and 'idlenocb.2013.03.26b' into HEAD

doc.2013.03.12a: Documentation changes.

fixes.2013.03.13a: Miscellaneous fixes.

idlenocb.2013.03.26b: Remove restrictions on no-CBs CPUs, make
	RCU_FAST_NO_HZ take advantage of numbered callbacks, add
	callback acceleration based on numbered callbacks.

Showing 11 changed files

Documentation/RCU/stallwarn.txt
... ... @@ -92,14 +92,14 @@
92 92 more information is printed with the stall-warning message, for example:
93 93  
94 94 INFO: rcu_preempt detected stall on CPU
95   - 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0
  95 + 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 softirq=82/543
96 96 (t=65000 jiffies)
97 97  
98 98 In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is
99 99 printed:
100 100  
101 101 INFO: rcu_preempt detected stall on CPU
102   - 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 drain=0 . timer not pending
  102 + 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D
103 103 (t=65000 jiffies)
104 104  
105 105 The "(64628 ticks this GP)" indicates that this CPU has taken more
... ... @@ -116,13 +116,28 @@
116 116 be a small positive number if in the idle loop and a very large positive
117 117 number (as shown above) otherwise.
118 118  
119   -For CONFIG_RCU_FAST_NO_HZ kernels, the "drain=0" indicates that the CPU is
120   -not in the process of trying to force itself into dyntick-idle state, the
121   -"." indicates that the CPU has not given up forcing RCU into dyntick-idle
122   -mode (it would be "H" otherwise), and the "timer not pending" indicates
123   -that the CPU has not recently forced RCU into dyntick-idle mode (it
124   -would otherwise indicate the number of microseconds remaining in this
125   -forced state).
  119 +The "softirq=" portion of the message tracks the number of RCU softirq
  120 +handlers that the stalled CPU has executed. The number before the "/"
  121 +is the number that had executed since boot at the time that this CPU
  122 +last noted the beginning of a grace period, which might be the current
  123 +(stalled) grace period, or it might be some earlier grace period (for
  124 +example, if the CPU has been in dyntick-idle mode for an extended
  125 +time period). The number after the "/" is the number that have executed
  126 +since boot until the current time. If this latter number stays constant
  127 +across repeated stall-warning messages, it is possible that RCU's softirq
  128 +handlers are no longer able to execute on this CPU. This can happen if
  129 +the stalled CPU is spinning with interrupts disabled, or, in -rt
  130 +kernels, if a high-priority process is starving RCU's softirq handler.
  131 +
  132 +For CONFIG_RCU_FAST_NO_HZ kernels, the "last_accelerate:" prints the
  133 +low-order 16 bits (in hex) of the jiffies counter when this CPU last
  134 +invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked
  135 +rcu_accelerate_cbs() from rcu_prepare_for_idle(). The "nonlazy_posted:"
  136 +prints the number of non-lazy callbacks posted since the last call to
  137 +rcu_needs_cpu(). Finally, an "L" indicates that there are currently
  138 +no non-lazy callbacks ("." is printed otherwise, as shown above) and
  139 +"D" indicates that dyntick-idle processing is enabled ("." is printed
  140 +otherwise, for example, if disabled via the "nohz=" kernel boot parameter).
126 141  
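A worked reading of the CONFIG_RCU_FAST_NO_HZ example above, using only the
sample values shown (the arithmetic is illustrative): "softirq=82/543" says
that 82 RCU softirq handlers had executed on CPU 0 when it last noticed a
grace period starting and that 543 have executed since boot, so 543 - 82 = 461
handlers ran after that point; "last_accelerate: a345/d342" gives the low-order
16 bits of the jiffies counter at the last callback-acceleration attempt
(0xa345) and at the time the message was printed (0xd342), roughly
0xd342 - 0xa345 = 12285 jiffies apart; and "nonlazy_posted: 25" says that 25
non-lazy callbacks arrived since the last call to rcu_needs_cpu().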
127 142  
128 143 Multiple Warnings From One Stall
Documentation/kernel-parameters.txt
... ... @@ -2461,9 +2461,12 @@
2461 2461 In kernels built with CONFIG_RCU_NOCB_CPU=y, set
2462 2462 the specified list of CPUs to be no-callback CPUs.
2463 2463 Invocation of these CPUs' RCU callbacks will
2464   - be offloaded to "rcuoN" kthreads created for
2465   - that purpose. This reduces OS jitter on the
  2464 + be offloaded to "rcuox/N" kthreads created for
  2465 + that purpose, where "x" is "b" for RCU-bh, "p"
  2466 + for RCU-preempt, and "s" for RCU-sched, and "N"
  2467 + is the CPU number. This reduces OS jitter on the
2466 2468 offloaded CPUs, which can be useful for HPC and
  2469 +
2467 2470 real-time workloads. It can also improve energy
2468 2471 efficiency for asymmetric multiprocessors.
2469 2472  
... ... @@ -2487,6 +2490,17 @@
2487 2490 leaf rcu_node structure. Useful for very large
2488 2491 systems.
2489 2492  
  2493 + rcutree.jiffies_till_first_fqs= [KNL,BOOT]
  2494 + Set delay from grace-period initialization to
  2495 + first attempt to force quiescent states.
  2496 + Units are jiffies, minimum value is zero,
  2497 + and maximum value is HZ.
  2498 +
  2499 + rcutree.jiffies_till_next_fqs= [KNL,BOOT]
  2500 + Set delay between subsequent attempts to force
  2501 + quiescent states. Units are jiffies, minimum
  2502 + value is one, and maximum value is HZ.
  2503 +
2490 2504 rcutree.qhimark= [KNL,BOOT]
2491 2505 Set threshold of queued
2492 2506 RCU callbacks over which batch limiting is disabled.
2493 2507  
... ... @@ -2501,16 +2515,15 @@
2501 2515 rcutree.rcu_cpu_stall_timeout= [KNL,BOOT]
2502 2516 Set timeout for RCU CPU stall warning messages.
2503 2517  
2504   - rcutree.jiffies_till_first_fqs= [KNL,BOOT]
2505   - Set delay from grace-period initialization to
2506   - first attempt to force quiescent states.
2507   - Units are jiffies, minimum value is zero,
2508   - and maximum value is HZ.
  2518 + rcutree.rcu_idle_gp_delay= [KNL,BOOT]
  2519 + Set wakeup interval for idle CPUs that have
  2520 + RCU callbacks (RCU_FAST_NO_HZ=y).
2509 2521  
2510   - rcutree.jiffies_till_next_fqs= [KNL,BOOT]
2511   - Set delay between subsequent attempts to force
2512   - quiescent states. Units are jiffies, minimum
2513   - value is one, and maximum value is HZ.
  2522 + rcutree.rcu_idle_lazy_gp_delay= [KNL,BOOT]
  2523 + Set wakeup interval for idle CPUs that have
  2524 + only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y).
  2525 + Lazy RCU callbacks are those which RCU can
  2526 + prove do nothing more than free memory.
2514 2527  
2515 2528 rcutorture.fqs_duration= [KNL,BOOT]
2516 2529 Set duration of force_quiescent_state bursts.
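For illustration only (the CPU list and delay values are arbitrary, and the
six-second figure assumes HZ=1000), a boot command line combining the
parameters above might look like:

    rcu_nocbs=2-7 rcutree.rcu_idle_gp_delay=4 rcutree.rcu_idle_lazy_gp_delay=6000

This offloads callback invocation for CPUs 2-7 to "rcuox/N" kthreads and keeps
the default four-jiffy and roughly six-second wakeup intervals for idle CPUs
with non-lazy and only-lazy callbacks, respectively.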
include/linux/list_bl.h
... ... @@ -125,6 +125,11 @@
125 125 __bit_spin_unlock(0, (unsigned long *)b);
126 126 }
127 127  
  128 +static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
  129 +{
  130 + return bit_spin_is_locked(0, (unsigned long *)b);
  131 +}
  132 +
128 133 /**
129 134 * hlist_bl_for_each_entry - iterate over list of given type
130 135 * @tpos: the type * to use as a loop cursor.
include/linux/rculist_bl.h
... ... @@ -20,7 +20,7 @@
20 20 static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
21 21 {
22 22 return (struct hlist_bl_node *)
23   - ((unsigned long)rcu_dereference(h->first) & ~LIST_BL_LOCKMASK);
  23 + ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK);
24 24 }
25 25  
26 26 /**
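To see the new lockdep expression in context, here is a minimal sketch of a
bit-locked, RCU-traversed hash bucket; the my_entry type and both functions
are hypothetical and not part of this commit. Readers traverse under
rcu_read_lock(), while writers hold the bucket's bit lock, which is exactly
the condition that hlist_bl_is_locked() now feeds to rcu_dereference_check():

    #include <linux/rculist_bl.h>
    #include <linux/rcupdate.h>

    struct my_entry {                       /* hypothetical element type */
            struct hlist_bl_node node;
            int key;
    };

    /* Reader: may run concurrently with writers holding the bucket bit lock. */
    static bool my_contains(struct hlist_bl_head *head, int key)
    {
            struct my_entry *e;
            struct hlist_bl_node *pos;
            bool found = false;

            rcu_read_lock();
            hlist_bl_for_each_entry_rcu(e, pos, head, node)
                    if (e->key == key) {
                            found = true;
                            break;
                    }
            rcu_read_unlock();
            return found;
    }

    /* Writer: bit 0 of the head pointer serializes updates, and holding it
     * also satisfies the hlist_bl_is_locked() check added above. */
    static void my_insert(struct hlist_bl_head *head, struct my_entry *e)
    {
            hlist_bl_lock(head);
            hlist_bl_add_head_rcu(&e->node, head);
            hlist_bl_unlock(head);
    }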
include/linux/rcupdate.h
... ... @@ -80,6 +80,7 @@
80 80 #define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
81 81 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
82 82 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
  83 +#define ulong2long(a) (*(long *)(&(a)))
83 84  
84 85 /* Exported common interfaces */
85 86  
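A minimal sketch of how ulong2long() is typically used (the
gp_snapshot_delta() helper is hypothetical): grace-period counters are
unsigned and may wrap, so the unsigned difference is reinterpreted as a
signed long, giving a small negative value when the snapshot is ahead of the
current counter:

    #include <linux/rcupdate.h>

    /* Hypothetical helper: signed distance from a snapshot to the current
     * counter value, meaningful even after the unsigned counter wraps. */
    static inline long gp_snapshot_delta(unsigned long cur, unsigned long snap)
    {
            unsigned long delta = cur - snap;

            return ulong2long(delta);       /* negative if snap is ahead of cur */
    }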
include/trace/events/rcu.h
... ... @@ -72,6 +72,58 @@
72 72 );
73 73  
74 74 /*
  75 + * Tracepoint for future grace-period events, including those for no-callbacks
  76 + * CPUs. The caller should pull the data from the rcu_node structure,
  77 + * other than rcuname, which comes from the rcu_state structure, and event,
  78 + * which is one of the following:
  79 + *
  80 + * "Startleaf": Request a nocb grace period based on leaf-node data.
  81 + * "Startedleaf": Leaf-node start proved sufficient.
  82 + * "Startedleafroot": Leaf-node start proved sufficient after checking root.
  83 + * "Startedroot": Requested a nocb grace period based on root-node data.
  84 + * "StartWait": Start waiting for the requested grace period.
  85 + * "ResumeWait": Resume waiting after signal.
  86 + * "EndWait": Complete wait.
  87 + * "Cleanup": Clean up rcu_node structure after previous GP.
  88 + * "CleanupMore": Clean up, and another no-CB GP is needed.
  89 + */
  90 +TRACE_EVENT(rcu_future_grace_period,
  91 +
  92 + TP_PROTO(char *rcuname, unsigned long gpnum, unsigned long completed,
  93 + unsigned long c, u8 level, int grplo, int grphi,
  94 + char *gpevent),
  95 +
  96 + TP_ARGS(rcuname, gpnum, completed, c, level, grplo, grphi, gpevent),
  97 +
  98 + TP_STRUCT__entry(
  99 + __field(char *, rcuname)
  100 + __field(unsigned long, gpnum)
  101 + __field(unsigned long, completed)
  102 + __field(unsigned long, c)
  103 + __field(u8, level)
  104 + __field(int, grplo)
  105 + __field(int, grphi)
  106 + __field(char *, gpevent)
  107 + ),
  108 +
  109 + TP_fast_assign(
  110 + __entry->rcuname = rcuname;
  111 + __entry->gpnum = gpnum;
  112 + __entry->completed = completed;
  113 + __entry->c = c;
  114 + __entry->level = level;
  115 + __entry->grplo = grplo;
  116 + __entry->grphi = grphi;
  117 + __entry->gpevent = gpevent;
  118 + ),
  119 +
  120 + TP_printk("%s %lu %lu %lu %u %d %d %s",
  121 + __entry->rcuname, __entry->gpnum, __entry->completed,
  122 + __entry->c, __entry->level, __entry->grplo, __entry->grphi,
  123 + __entry->gpevent)
  124 +);
  125 +
  126 +/*
75 127 * Tracepoint for grace-period-initialization events. These are
76 128 * distinguished by the type of RCU, the new grace-period number, the
77 129 * rcu_node structure level, the starting and ending CPU covered by the
... ... @@ -601,6 +653,9 @@
601 653 #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0)
602 654 #define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \
603 655 qsmask) do { } while (0)
  656 +#define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \
  657 + level, grplo, grphi, event) \
  658 + do { } while (0)
604 659 #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0)
605 660 #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0)
606 661 #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \
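Based on the TP_printk() format string above ("%s %lu %lu %lu %u %d %d %s"),
a rendered rcu_future_grace_period event shows the flavor name, the rcu_node
structure's gpnum and completed values, the requested grace-period number c,
the node's level, the CPU range it covers, and the event string. With invented
values, one line of trace output might look like:

    rcu_preempt 21 20 21 0 0 15 Startleaf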
init/Kconfig
... ... @@ -582,13 +582,16 @@
582 582 depends on NO_HZ && SMP
583 583 default n
584 584 help
585   - This option causes RCU to attempt to accelerate grace periods in
586   - order to allow CPUs to enter dynticks-idle state more quickly.
587   - On the other hand, this option increases the overhead of the
588   - dynticks-idle checking, thus degrading scheduling latency.
  585 + This option permits CPUs to enter dynticks-idle state even if
  586 + they have RCU callbacks queued, and prevents RCU from waking
  587 + these CPUs up more than roughly once every four jiffies (by
  588 + default, you can adjust this using the rcutree.rcu_idle_gp_delay
  589 + parameter), thus improving energy efficiency. On the other
  590 + hand, this option increases the duration of RCU grace periods,
  591 + for example, slowing down synchronize_rcu().
589 592  
590   - Say Y if energy efficiency is critically important, and you don't
591   - care about real-time response.
  593 + Say Y if energy efficiency is critically important, and you
  594 + don't care about increased grace-period durations.
592 595  
593 596 Say N if you are unsure.
594 597  
... ... @@ -655,7 +658,7 @@
655 658 Accept the default if unsure.
656 659  
657 660 config RCU_NOCB_CPU
658   - bool "Offload RCU callback processing from boot-selected CPUs"
  661 + bool "Offload RCU callback processing from boot-selected CPUs (EXPERIMENTAL)"
659 662 depends on TREE_RCU || TREE_PREEMPT_RCU
660 663 default n
661 664 help
662 665  
663 666  
... ... @@ -666,15 +669,55 @@
666 669  
667 670 This option offloads callback invocation from the set of
668 671 CPUs specified at boot time by the rcu_nocbs parameter.
669   - For each such CPU, a kthread ("rcuoN") will be created to
670   - invoke callbacks, where the "N" is the CPU being offloaded.
671   - Nothing prevents this kthread from running on the specified
672   - CPUs, but (1) the kthreads may be preempted between each
673   - callback, and (2) affinity or cgroups can be used to force
674   - the kthreads to run on whatever set of CPUs is desired.
  672 + For each such CPU, a kthread ("rcuox/N") will be created to
  673 + invoke callbacks, where the "N" is the CPU being offloaded,
  674 + and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
  675 + "s" for RCU-sched. Nothing prevents this kthread from running
  676 + on the specified CPUs, but (1) the kthreads may be preempted
  677 + between each callback, and (2) affinity or cgroups can be used
  678 + to force the kthreads to run on whatever set of CPUs is desired.
675 679  
676   - Say Y here if you want reduced OS jitter on selected CPUs.
  680 + Say Y here if you want to help debug reduced OS jitter.
677 681 Say N here if you are unsure.
  682 +
  683 +choice
  684 + prompt "Build-forced no-CBs CPUs"
  685 + default RCU_NOCB_CPU_NONE
  686 + help
  687 + This option allows no-CBs CPUs to be specified at build time.
  688 + Additional no-CBs CPUs may be specified by the rcu_nocbs=
  689 + boot parameter.
  690 +
  691 +config RCU_NOCB_CPU_NONE
  692 + bool "No build-forced no-CBs CPUs"
  693 + depends on RCU_NOCB_CPU
  694 + help
  695 + This option does not force any of the CPUs to be no-CBs CPUs.
  696 + Only CPUs designated by the rcu_nocbs= boot parameter will be
  697 + no-CBs CPUs.
  698 +
  699 +config RCU_NOCB_CPU_ZERO
  700 + bool "CPU 0 is a build-forced no-CBs CPU"
  701 + depends on RCU_NOCB_CPU
  702 + help
  703 + This option forces CPU 0 to be a no-CBs CPU. Additional CPUs
  704 + may be designated as no-CBs CPUs using the rcu_nocbs= boot
  705 + parameter.
  706 +
  707 + Select this if CPU 0 needs to be a no-CBs CPU for real-time
  708 + or energy-efficiency reasons.
  709 +
  710 +config RCU_NOCB_CPU_ALL
  711 + bool "All CPUs are build-forced no-CBs CPUs"
  712 + depends on RCU_NOCB_CPU
  713 + help
  714 + This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs=
  715 + boot parameter will be ignored.
  716 +
  717 + Select this if all CPUs need to be no-CBs CPUs for real-time
  718 + or energy-efficiency reasons.
  719 +
  720 +endchoice
678 721  
679 722 endmenu # "RCU Subsystem"
680 723  
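As a usage sketch (not part of this commit), a .config fragment that forces
every CPU to be a no-CBs CPU at build time would select the new choice as
follows, after which the rcu_nocbs= boot parameter is ignored:

    CONFIG_RCU_NOCB_CPU=y
    CONFIG_RCU_NOCB_CPU_ALL=y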
kernel/rcutree.c
... ... @@ -64,7 +64,7 @@
64 64 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65 65 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
66 66  
67   -#define RCU_STATE_INITIALIZER(sname, cr) { \
  67 +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
68 68 .level = { &sname##_state.node[0] }, \
69 69 .call = cr, \
70 70 .fqs_state = RCU_GP_IDLE, \
71 71  
72 72  
... ... @@ -76,13 +76,14 @@
76 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
78 78 .name = #sname, \
  79 + .abbr = sabbr, \
79 80 }
80 81  
81 82 struct rcu_state rcu_sched_state =
82   - RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
  83 + RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
83 84 DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
84 85  
85   -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
  86 +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
86 87 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
87 88  
88 89 static struct rcu_state *rcu_state;
... ... @@ -223,6 +224,8 @@
223 224 module_param(jiffies_till_first_fqs, ulong, 0644);
224 225 module_param(jiffies_till_next_fqs, ulong, 0644);
225 226  
  227 +static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
  228 + struct rcu_data *rdp);
226 229 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
227 230 static void force_quiescent_state(struct rcu_state *rsp);
228 231 static int rcu_pending(int cpu);
... ... @@ -310,6 +313,8 @@
310 313  
311 314 if (rcu_gp_in_progress(rsp))
312 315 return 0; /* No, a grace period is already in progress. */
  316 + if (rcu_nocb_needs_gp(rsp))
  317 + return 1; /* Yes, a no-CBs CPU needs one. */
313 318 if (!rdp->nxttail[RCU_NEXT_TAIL])
314 319 return 0; /* No, this is a no-CBs (or offline) CPU. */
315 320 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
316 321  
... ... @@ -1035,10 +1040,11 @@
1035 1040 {
1036 1041 int i;
1037 1042  
  1043 + if (init_nocb_callback_list(rdp))
  1044 + return;
1038 1045 rdp->nxtlist = NULL;
1039 1046 for (i = 0; i < RCU_NEXT_SIZE; i++)
1040 1047 rdp->nxttail[i] = &rdp->nxtlist;
1041   - init_nocb_callback_list(rdp);
1042 1048 }
1043 1049  
1044 1050 /*
... ... @@ -1071,6 +1077,120 @@
1071 1077 }
1072 1078  
1073 1079 /*
  1080 + * Trace-event helper function for rcu_start_future_gp() and
  1081 + * rcu_nocb_wait_gp().
  1082 + */
  1083 +static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
  1084 + unsigned long c, char *s)
  1085 +{
  1086 + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
  1087 + rnp->completed, c, rnp->level,
  1088 + rnp->grplo, rnp->grphi, s);
  1089 +}
  1090 +
  1091 +/*
  1092 + * Start some future grace period, as needed to handle newly arrived
  1093 + * callbacks. The required future grace periods are recorded in each
  1094 + * rcu_node structure's ->need_future_gp field.
  1095 + *
  1096 + * The caller must hold the specified rcu_node structure's ->lock.
  1097 + */
  1098 +static unsigned long __maybe_unused
  1099 +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
  1100 +{
  1101 + unsigned long c;
  1102 + int i;
  1103 + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
  1104 +
  1105 + /*
  1106 + * Pick up grace-period number for new callbacks. If this
  1107 + * grace period is already marked as needed, return to the caller.
  1108 + */
  1109 + c = rcu_cbs_completed(rdp->rsp, rnp);
  1110 + trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
  1111 + if (rnp->need_future_gp[c & 0x1]) {
  1112 + trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
  1113 + return c;
  1114 + }
  1115 +
  1116 + /*
  1117 + * If either this rcu_node structure or the root rcu_node structure
  1118 + * believe that a grace period is in progress, then we must wait
  1119 + * for the one following, which is in "c". Because our request
  1120 + * will be noticed at the end of the current grace period, we don't
  1121 + * need to explicitly start one.
  1122 + */
  1123 + if (rnp->gpnum != rnp->completed ||
  1124 + ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
  1125 + rnp->need_future_gp[c & 0x1]++;
  1126 + trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
  1127 + return c;
  1128 + }
  1129 +
  1130 + /*
  1131 + * There might be no grace period in progress. If we don't already
  1132 + * hold it, acquire the root rcu_node structure's lock in order to
  1133 + * start one (if needed).
  1134 + */
  1135 + if (rnp != rnp_root)
  1136 + raw_spin_lock(&rnp_root->lock);
  1137 +
  1138 + /*
  1139 + * Get a new grace-period number. If there really is no grace
  1140 + * period in progress, it will be smaller than the one we obtained
  1141 + * earlier. Adjust callbacks as needed. Note that even no-CBs
  1142 + * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
  1143 + */
  1144 + c = rcu_cbs_completed(rdp->rsp, rnp_root);
  1145 + for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
  1146 + if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
  1147 + rdp->nxtcompleted[i] = c;
  1148 +
  1149 + /*
  1150 + * If the need for the required grace period is already
  1151 + * recorded, trace and leave.
  1152 + */
  1153 + if (rnp_root->need_future_gp[c & 0x1]) {
  1154 + trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
  1155 + goto unlock_out;
  1156 + }
  1157 +
  1158 + /* Record the need for the future grace period. */
  1159 + rnp_root->need_future_gp[c & 0x1]++;
  1160 +
  1161 + /* If a grace period is not already in progress, start one. */
  1162 + if (rnp_root->gpnum != rnp_root->completed) {
  1163 + trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
  1164 + } else {
  1165 + trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
  1166 + rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
  1167 + }
  1168 +unlock_out:
  1169 + if (rnp != rnp_root)
  1170 + raw_spin_unlock(&rnp_root->lock);
  1171 + return c;
  1172 +}
  1173 +
  1174 +/*
  1175 + * Clean up any old requests for the just-ended grace period. Also return
  1176 + * whether any additional grace periods have been requested. Also invoke
  1177 + * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
  1178 + * waiting for this grace period to complete.
  1179 + */
  1180 +static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
  1181 +{
  1182 + int c = rnp->completed;
  1183 + int needmore;
  1184 + struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
  1185 +
  1186 + rcu_nocb_gp_cleanup(rsp, rnp);
  1187 + rnp->need_future_gp[c & 0x1] = 0;
  1188 + needmore = rnp->need_future_gp[(c + 1) & 0x1];
  1189 + trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
  1190 + return needmore;
  1191 +}
  1192 +
  1193 +/*
1074 1194 * If there is room, assign a ->completed number to any callbacks on
1075 1195 * this CPU that have not already been assigned. Also accelerate any
1076 1196 * callbacks that were previously assigned a ->completed number that has
... ... @@ -1129,6 +1249,8 @@
1129 1249 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
1130 1250 rdp->nxtcompleted[i] = c;
1131 1251 }
  1252 + /* Record any needed additional grace periods. */
  1253 + rcu_start_future_gp(rnp, rdp);
1132 1254  
1133 1255 /* Trace depending on how much we were able to accelerate. */
1134 1256 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1135 1257  
... ... @@ -1308,9 +1430,9 @@
1308 1430 rdp = this_cpu_ptr(rsp->rda);
1309 1431 rcu_preempt_check_blocked_tasks(rnp);
1310 1432 rnp->qsmask = rnp->qsmaskinit;
1311   - rnp->gpnum = rsp->gpnum;
  1433 + ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
1312 1434 WARN_ON_ONCE(rnp->completed != rsp->completed);
1313   - rnp->completed = rsp->completed;
  1435 + ACCESS_ONCE(rnp->completed) = rsp->completed;
1314 1436 if (rnp == rdp->mynode)
1315 1437 rcu_start_gp_per_cpu(rsp, rnp, rdp);
1316 1438 rcu_preempt_boost_start_gp(rnp);
... ... @@ -1319,7 +1441,8 @@
1319 1441 rnp->grphi, rnp->qsmask);
1320 1442 raw_spin_unlock_irq(&rnp->lock);
1321 1443 #ifdef CONFIG_PROVE_RCU_DELAY
1322   - if ((random32() % (rcu_num_nodes * 8)) == 0)
  1444 + if ((random32() % (rcu_num_nodes * 8)) == 0 &&
  1445 + system_state == SYSTEM_RUNNING)
1323 1446 schedule_timeout_uninterruptible(2);
1324 1447 #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1325 1448 cond_resched();
... ... @@ -1361,6 +1484,7 @@
1361 1484 static void rcu_gp_cleanup(struct rcu_state *rsp)
1362 1485 {
1363 1486 unsigned long gp_duration;
  1487 + int nocb = 0;
1364 1488 struct rcu_data *rdp;
1365 1489 struct rcu_node *rnp = rcu_get_root(rsp);
1366 1490  
1367 1491  
1368 1492  
... ... @@ -1390,17 +1514,23 @@
1390 1514 */
1391 1515 rcu_for_each_node_breadth_first(rsp, rnp) {
1392 1516 raw_spin_lock_irq(&rnp->lock);
1393   - rnp->completed = rsp->gpnum;
  1517 + ACCESS_ONCE(rnp->completed) = rsp->gpnum;
  1518 + rdp = this_cpu_ptr(rsp->rda);
  1519 + if (rnp == rdp->mynode)
  1520 + __rcu_process_gp_end(rsp, rnp, rdp);
  1521 + nocb += rcu_future_gp_cleanup(rsp, rnp);
1394 1522 raw_spin_unlock_irq(&rnp->lock);
1395 1523 cond_resched();
1396 1524 }
1397 1525 rnp = rcu_get_root(rsp);
1398 1526 raw_spin_lock_irq(&rnp->lock);
  1527 + rcu_nocb_gp_set(rnp, nocb);
1399 1528  
1400 1529 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1401 1530 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1402 1531 rsp->fqs_state = RCU_GP_IDLE;
1403 1532 rdp = this_cpu_ptr(rsp->rda);
  1533 + rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
1404 1534 if (cpu_needs_another_gp(rsp, rdp))
1405 1535 rsp->gp_flags = 1;
1406 1536 raw_spin_unlock_irq(&rnp->lock);
1407 1537  
1408 1538  
1409 1539  
1410 1540  
1411 1541  
1412 1542  
1413 1543  
1414 1544  
... ... @@ -1476,57 +1606,62 @@
1476 1606 /*
1477 1607 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1478 1608 * in preparation for detecting the next grace period. The caller must hold
1479   - * the root node's ->lock, which is released before return. Hard irqs must
1480   - * be disabled.
  1609 + * the root node's ->lock and hard irqs must be disabled.
1481 1610 *
1482 1611 * Note that it is legal for a dying CPU (which is marked as offline) to
1483 1612 * invoke this function. This can happen when the dying CPU reports its
1484 1613 * quiescent state.
1485 1614 */
1486 1615 static void
1487   -rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1488   - __releases(rcu_get_root(rsp)->lock)
  1616 +rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
  1617 + struct rcu_data *rdp)
1489 1618 {
1490   - struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1491   - struct rcu_node *rnp = rcu_get_root(rsp);
1492   -
1493   - if (!rsp->gp_kthread ||
1494   - !cpu_needs_another_gp(rsp, rdp)) {
  1619 + if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
1495 1620 /*
1496 1621 * Either we have not yet spawned the grace-period
1497 1622 * task, this CPU does not need another grace period,
1498 1623 * or a grace period is already in progress.
1499 1624 * Either way, don't start a new grace period.
1500 1625 */
1501   - raw_spin_unlock_irqrestore(&rnp->lock, flags);
1502 1626 return;
1503 1627 }
1504   -
1505   - /*
1506   - * Because there is no grace period in progress right now,
1507   - * any callbacks we have up to this point will be satisfied
1508   - * by the next grace period. So this is a good place to
1509   - * assign a grace period number to recently posted callbacks.
1510   - */
1511   - rcu_accelerate_cbs(rsp, rnp, rdp);
1512   -
1513 1628 rsp->gp_flags = RCU_GP_FLAG_INIT;
1514   - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1515 1629  
1516   - /* Ensure that CPU is aware of completion of last grace period. */
1517   - rcu_process_gp_end(rsp, rdp);
1518   - local_irq_restore(flags);
1519   -
1520 1630 /* Wake up rcu_gp_kthread() to start the grace period. */
1521 1631 wake_up(&rsp->gp_wq);
1522 1632 }
1523 1633  
1524 1634 /*
  1635 + * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
  1636 + * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
  1637 + * is invoked indirectly from rcu_advance_cbs(), which would result in
  1638 + * endless recursion -- or would do so if it wasn't for the self-deadlock
  1639 + * that is encountered beforehand.
  1640 + */
  1641 +static void
  1642 +rcu_start_gp(struct rcu_state *rsp)
  1643 +{
  1644 + struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
  1645 + struct rcu_node *rnp = rcu_get_root(rsp);
  1646 +
  1647 + /*
  1648 + * If there is no grace period in progress right now, any
  1649 + * callbacks we have up to this point will be satisfied by the
  1650 + * next grace period. Also, advancing the callbacks reduces the
  1651 + * probability of false positives from cpu_needs_another_gp()
  1652 + * resulting in pointless grace periods. So, advance callbacks
  1653 + * then start the grace period!
  1654 + */
  1655 + rcu_advance_cbs(rsp, rnp, rdp);
  1656 + rcu_start_gp_advanced(rsp, rnp, rdp);
  1657 +}
  1658 +
  1659 +/*
1525 1660 * Report a full set of quiescent states to the specified rcu_state
1526 1661 * data structure. This involves cleaning up after the prior grace
1527 1662 * period and letting rcu_start_gp() start up the next grace period
1528   - * if one is needed. Note that the caller must hold rnp->lock, as
1529   - * required by rcu_start_gp(), which will release it.
  1663 + * if one is needed. Note that the caller must hold rnp->lock, which
  1664 + * is released before return.
1530 1665 */
1531 1666 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1532 1667 __releases(rcu_get_root(rsp)->lock)
... ... @@ -2124,7 +2259,8 @@
2124 2259 local_irq_save(flags);
2125 2260 if (cpu_needs_another_gp(rsp, rdp)) {
2126 2261 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2127   - rcu_start_gp(rsp, flags); /* releases above lock */
  2262 + rcu_start_gp(rsp);
  2263 + raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2128 2264 } else {
2129 2265 local_irq_restore(flags);
2130 2266 }
... ... @@ -2169,7 +2305,8 @@
2169 2305  
2170 2306 static void invoke_rcu_core(void)
2171 2307 {
2172   - raise_softirq(RCU_SOFTIRQ);
  2308 + if (cpu_online(smp_processor_id()))
  2309 + raise_softirq(RCU_SOFTIRQ);
2173 2310 }
2174 2311  
2175 2312 /*
2176 2313  
... ... @@ -2204,11 +2341,11 @@
2204 2341  
2205 2342 /* Start a new grace period if one not already started. */
2206 2343 if (!rcu_gp_in_progress(rsp)) {
2207   - unsigned long nestflag;
2208 2344 struct rcu_node *rnp_root = rcu_get_root(rsp);
2209 2345  
2210   - raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
2211   - rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
  2346 + raw_spin_lock(&rnp_root->lock);
  2347 + rcu_start_gp(rsp);
  2348 + raw_spin_unlock(&rnp_root->lock);
2212 2349 } else {
2213 2350 /* Give the grace period a kick. */
2214 2351 rdp->blimit = LONG_MAX;
2215 2352  
2216 2353  
2217 2354  
... ... @@ -2628,19 +2765,27 @@
2628 2765 }
2629 2766  
2630 2767 /*
2631   - * Check to see if any future RCU-related work will need to be done
2632   - * by the current CPU, even if none need be done immediately, returning
2633   - * 1 if so.
  2768 + * Return true if the specified CPU has any callback. If all_lazy is
  2769 + * non-NULL, store an indication of whether all callbacks are lazy.
  2770 + * (If there are no callbacks, all of them are deemed to be lazy.)
2634 2771 */
2635   -static int rcu_cpu_has_callbacks(int cpu)
  2772 +static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2636 2773 {
  2774 + bool al = true;
  2775 + bool hc = false;
  2776 + struct rcu_data *rdp;
2637 2777 struct rcu_state *rsp;
2638 2778  
2639   - /* RCU callbacks either ready or pending? */
2640   - for_each_rcu_flavor(rsp)
2641   - if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
2642   - return 1;
2643   - return 0;
  2779 + for_each_rcu_flavor(rsp) {
  2780 + rdp = per_cpu_ptr(rsp->rda, cpu);
  2781 + if (rdp->qlen != rdp->qlen_lazy)
  2782 + al = false;
  2783 + if (rdp->nxtlist)
  2784 + hc = true;
  2785 + }
  2786 + if (all_lazy)
  2787 + *all_lazy = al;
  2788 + return hc;
2644 2789 }
2645 2790  
2646 2791 /*
... ... @@ -2859,7 +3004,6 @@
2859 3004 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2860 3005 atomic_set(&rdp->dynticks->dynticks,
2861 3006 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2862   - rcu_prepare_for_idle_init(cpu);
2863 3007 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2864 3008  
2865 3009 /* Add CPU to rcu_node bitmasks. */
... ... @@ -2909,7 +3053,6 @@
2909 3053 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2910 3054 struct rcu_node *rnp = rdp->mynode;
2911 3055 struct rcu_state *rsp;
2912   - int ret = NOTIFY_OK;
2913 3056  
2914 3057 trace_rcu_utilization("Start CPU hotplug");
2915 3058 switch (action) {
2916 3059  
2917 3060  
... ... @@ -2923,21 +3066,12 @@
2923 3066 rcu_boost_kthread_setaffinity(rnp, -1);
2924 3067 break;
2925 3068 case CPU_DOWN_PREPARE:
2926   - if (nocb_cpu_expendable(cpu))
2927   - rcu_boost_kthread_setaffinity(rnp, cpu);
2928   - else
2929   - ret = NOTIFY_BAD;
  3069 + rcu_boost_kthread_setaffinity(rnp, cpu);
2930 3070 break;
2931 3071 case CPU_DYING:
2932 3072 case CPU_DYING_FROZEN:
2933   - /*
2934   - * The whole machine is "stopped" except this CPU, so we can
2935   - * touch any data without introducing corruption. We send the
2936   - * dying CPU's callbacks to an arbitrarily chosen online CPU.
2937   - */
2938 3073 for_each_rcu_flavor(rsp)
2939 3074 rcu_cleanup_dying_cpu(rsp);
2940   - rcu_cleanup_after_idle(cpu);
2941 3075 break;
2942 3076 case CPU_DEAD:
2943 3077 case CPU_DEAD_FROZEN:
... ... @@ -2950,7 +3084,7 @@
2950 3084 break;
2951 3085 }
2952 3086 trace_rcu_utilization("End CPU hotplug");
2953   - return ret;
  3087 + return NOTIFY_OK;
2954 3088 }
2955 3089  
2956 3090 /*
... ... @@ -3085,6 +3219,7 @@
3085 3219 }
3086 3220 rnp->level = i;
3087 3221 INIT_LIST_HEAD(&rnp->blkd_tasks);
  3222 + rcu_init_one_nocb(rnp);
3088 3223 }
3089 3224 }
3090 3225  
... ... @@ -3170,8 +3305,7 @@
3170 3305 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3171 3306 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3172 3307 __rcu_init_preempt();
3173   - rcu_init_nocb();
3174   - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  3308 + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3175 3309  
3176 3310 /*
3177 3311 * We don't need protection against CPU-hotplug here because
... ... @@ -88,18 +88,13 @@
88 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
89 89 atomic_t dynticks; /* Even value for idle, else odd. */
90 90 #ifdef CONFIG_RCU_FAST_NO_HZ
91   - int dyntick_drain; /* Prepare-for-idle state variable. */
92   - unsigned long dyntick_holdoff;
93   - /* No retries for the jiffy of failure. */
94   - struct timer_list idle_gp_timer;
95   - /* Wake up CPU sleeping with callbacks. */
96   - unsigned long idle_gp_timer_expires;
97   - /* When to wake up CPU (for repost). */
98   - bool idle_first_pass; /* First pass of attempt to go idle? */
  91 + bool all_lazy; /* Are all CPU's CBs lazy? */
99 92 unsigned long nonlazy_posted;
100 93 /* # times non-lazy CBs posted to CPU. */
101 94 unsigned long nonlazy_posted_snap;
102 95 /* idle-period nonlazy_posted snapshot. */
  96 + unsigned long last_accelerate;
  97 + /* Last jiffy CBs were accelerated. */
103 98 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104 99 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105 100 };
... ... @@ -134,9 +129,6 @@
134 129 /* elements that need to drain to allow the */
135 130 /* current expedited grace period to */
136 131 /* complete (only for TREE_PREEMPT_RCU). */
137   - atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
138   - /* Since this has meaning only for leaf */
139   - /* rcu_node structures, 32 bits suffices. */
140 132 unsigned long qsmaskinit;
141 133 /* Per-GP initial value for qsmask & expmask. */
142 134 unsigned long grpmask; /* Mask to apply to parent qsmask. */
... ... @@ -196,6 +188,12 @@
196 188 /* Refused to boost: not sure why, though. */
197 189 /* This can happen due to race conditions. */
198 190 #endif /* #ifdef CONFIG_RCU_BOOST */
  191 +#ifdef CONFIG_RCU_NOCB_CPU
  192 + wait_queue_head_t nocb_gp_wq[2];
  193 + /* Place for rcu_nocb_kthread() to wait GP. */
  194 +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
  195 + int need_future_gp[2];
  196 + /* Counts of upcoming no-CB GP requests. */
199 197 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
200 198 } ____cacheline_internodealigned_in_smp;
201 199  
... ... @@ -328,6 +326,11 @@
328 326 struct task_struct *nocb_kthread;
329 327 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
330 328  
  329 + /* 8) RCU CPU stall data. */
  330 +#ifdef CONFIG_RCU_CPU_STALL_INFO
  331 + unsigned int softirq_snap; /* Snapshot of softirq activity. */
  332 +#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
  333 +
331 334 int cpu;
332 335 struct rcu_state *rsp;
333 336 };
... ... @@ -375,12 +378,6 @@
375 378 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
376 379 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
377 380 void (*func)(struct rcu_head *head));
378   -#ifdef CONFIG_RCU_NOCB_CPU
379   - void (*call_remote)(struct rcu_head *head,
380   - void (*func)(struct rcu_head *head));
381   - /* call_rcu() flavor, but for */
382   - /* placing on remote CPU. */
383   -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
384 381  
385 382 /* The following fields are guarded by the root rcu_node's lock. */
386 383  
... ... @@ -443,6 +440,7 @@
443 440 unsigned long gp_max; /* Maximum GP duration in */
444 441 /* jiffies. */
445 442 char *name; /* Name of structure. */
  443 + char abbr; /* Abbreviated name. */
446 444 struct list_head flavors; /* List of RCU flavors. */
447 445 };
448 446  
... ... @@ -520,7 +518,6 @@
520 518 struct rcu_node *rnp);
521 519 #endif /* #ifdef CONFIG_RCU_BOOST */
522 520 static void __cpuinit rcu_prepare_kthreads(int cpu);
523   -static void rcu_prepare_for_idle_init(int cpu);
524 521 static void rcu_cleanup_after_idle(int cpu);
525 522 static void rcu_prepare_for_idle(int cpu);
526 523 static void rcu_idle_count_callbacks_posted(void);
527 524  
528 525  
... ... @@ -529,16 +526,18 @@
529 526 static void print_cpu_stall_info_end(void);
530 527 static void zero_cpu_stall_ticks(struct rcu_data *rdp);
531 528 static void increment_cpu_stall_ticks(void);
  529 +static int rcu_nocb_needs_gp(struct rcu_state *rsp);
  530 +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
  531 +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
  532 +static void rcu_init_one_nocb(struct rcu_node *rnp);
532 533 static bool is_nocb_cpu(int cpu);
533 534 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
534 535 bool lazy);
535 536 static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
536 537 struct rcu_data *rdp);
537   -static bool nocb_cpu_expendable(int cpu);
538 538 static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
539 539 static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
540   -static void init_nocb_callback_list(struct rcu_data *rdp);
541   -static void __init rcu_init_nocb(void);
  540 +static bool init_nocb_callback_list(struct rcu_data *rdp);
542 541  
543 542 #endif /* #ifndef RCU_TREE_NONCORE */
544 543  
kernel/rcutree_plugin.h
... ... @@ -85,11 +85,21 @@
85 85 if (nr_cpu_ids != NR_CPUS)
86 86 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
87 87 #ifdef CONFIG_RCU_NOCB_CPU
  88 +#ifndef CONFIG_RCU_NOCB_CPU_NONE
  89 + if (!have_rcu_nocb_mask) {
  90 + alloc_bootmem_cpumask_var(&rcu_nocb_mask);
  91 + have_rcu_nocb_mask = true;
  92 + }
  93 +#ifdef CONFIG_RCU_NOCB_CPU_ZERO
  94 + pr_info("\tExperimental no-CBs CPU 0\n");
  95 + cpumask_set_cpu(0, rcu_nocb_mask);
  96 +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
  97 +#ifdef CONFIG_RCU_NOCB_CPU_ALL
  98 + pr_info("\tExperimental no-CBs for all CPUs\n");
  99 + cpumask_setall(rcu_nocb_mask);
  100 +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
  101 +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
88 102 if (have_rcu_nocb_mask) {
89   - if (cpumask_test_cpu(0, rcu_nocb_mask)) {
90   - cpumask_clear_cpu(0, rcu_nocb_mask);
91   - pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
92   - }
93 103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
94 104 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
95 105 if (rcu_nocb_poll)
... ... @@ -101,7 +111,7 @@
101 111 #ifdef CONFIG_TREE_PREEMPT_RCU
102 112  
103 113 struct rcu_state rcu_preempt_state =
104   - RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
  114 + RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
105 115 DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
106 116 static struct rcu_state *rcu_state = &rcu_preempt_state;
107 117  
108 118  
... ... @@ -1533,17 +1543,10 @@
1533 1543 int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1534 1544 {
1535 1545 *delta_jiffies = ULONG_MAX;
1536   - return rcu_cpu_has_callbacks(cpu);
  1546 + return rcu_cpu_has_callbacks(cpu, NULL);
1537 1547 }
1538 1548  
1539 1549 /*
1540   - * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1541   - */
1542   -static void rcu_prepare_for_idle_init(int cpu)
1543   -{
1544   -}
1545   -
1546   -/*
1547 1550 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1548 1551 * after it.
1549 1552 */
... ... @@ -1577,16 +1580,6 @@
1577 1580 *
1578 1581 * The following three proprocessor symbols control this state machine:
1579 1582 *
1580   - * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
1581   - * to satisfy RCU. Beyond this point, it is better to incur a periodic
1582   - * scheduling-clock interrupt than to loop through the state machine
1583   - * at full power.
1584   - * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
1585   - * optional if RCU does not need anything immediately from this
1586   - * CPU, even if this CPU still has RCU callbacks queued. The first
1587   - * times through the state machine are mandatory: we need to give
1588   - * the state machine a chance to communicate a quiescent state
1589   - * to the RCU core.
1590 1583 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1591 1584 * to sleep in dyntick-idle mode with RCU callbacks pending. This
1592 1585 * is sized to be roughly one RCU grace period. Those energy-efficiency
1593 1586  
1594 1587  
1595 1588  
1596 1589  
1597 1590  
1598 1591  
1599 1592  
1600 1593  
1601 1594  
1602 1595  
1603 1596  
1604 1597  
1605 1598  
1606 1599  
1607 1600  
1608 1601  
1609 1602  
1610 1603  
1611 1604  
1612 1605  
1613 1606  
... ... @@ -1602,186 +1595,108 @@
1602 1595 * adjustment, they can be converted into kernel config parameters, though
1603 1596 * making the state machine smarter might be a better option.
1604 1597 */
1605   -#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
1606   -#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
1607 1598 #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1608 1599 #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1609 1600  
  1601 +static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
  1602 +module_param(rcu_idle_gp_delay, int, 0644);
  1603 +static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
  1604 +module_param(rcu_idle_lazy_gp_delay, int, 0644);
  1605 +
1610 1606 extern int tick_nohz_enabled;
1611 1607  
1612 1608 /*
1613   - * Does the specified flavor of RCU have non-lazy callbacks pending on
1614   - * the specified CPU? Both RCU flavor and CPU are specified by the
1615   - * rcu_data structure.
  1609 + * Try to advance callbacks for all flavors of RCU on the current CPU.
  1610 + * Afterwards, if there are any callbacks ready for immediate invocation,
  1611 + * return true.
1616 1612 */
1617   -static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
  1613 +static bool rcu_try_advance_all_cbs(void)
1618 1614 {
1619   - return rdp->qlen != rdp->qlen_lazy;
1620   -}
  1615 + bool cbs_ready = false;
  1616 + struct rcu_data *rdp;
  1617 + struct rcu_node *rnp;
  1618 + struct rcu_state *rsp;
1621 1619  
1622   -#ifdef CONFIG_TREE_PREEMPT_RCU
  1620 + for_each_rcu_flavor(rsp) {
  1621 + rdp = this_cpu_ptr(rsp->rda);
  1622 + rnp = rdp->mynode;
1623 1623  
1624   -/*
1625   - * Are there non-lazy RCU-preempt callbacks? (There cannot be if there
1626   - * is no RCU-preempt in the kernel.)
1627   - */
1628   -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
1629   -{
1630   - struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
  1624 + /*
  1625 + * Don't bother checking unless a grace period has
  1626 + * completed since we last checked and there are
  1627 + * callbacks not yet ready to invoke.
  1628 + */
  1629 + if (rdp->completed != rnp->completed &&
  1630 + rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
  1631 + rcu_process_gp_end(rsp, rdp);
1631 1632  
1632   - return __rcu_cpu_has_nonlazy_callbacks(rdp);
  1633 + if (cpu_has_callbacks_ready_to_invoke(rdp))
  1634 + cbs_ready = true;
  1635 + }
  1636 + return cbs_ready;
1633 1637 }
1634 1638  
1635   -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1636   -
1637   -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
1638   -{
1639   - return 0;
1640   -}
1641   -
1642   -#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */
1643   -
1644 1639 /*
1645   - * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
1646   - */
1647   -static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
1648   -{
1649   - return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
1650   - __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
1651   - rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
1652   -}
1653   -
1654   -/*
1655   - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1656   - * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1657   - * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1658   - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
1659   - * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1660   - * it is better to incur scheduling-clock interrupts than to spin
1661   - * continuously for the same time duration!
  1640 + * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
  1641 + * to invoke. If the CPU has callbacks, try to advance them. Tell the
  1642 + * caller to set the timeout based on whether or not there are non-lazy
  1643 + * callbacks.
1662 1644 *
1663   - * The delta_jiffies argument is used to store the time when RCU is
1664   - * going to need the CPU again if it still has callbacks. The reason
1665   - * for this is that rcu_prepare_for_idle() might need to post a timer,
1666   - * but if so, it will do so after tick_nohz_stop_sched_tick() has set
1667   - * the wakeup time for this CPU. This means that RCU's timer can be
1668   - * delayed until the wakeup time, which defeats the purpose of posting
1669   - * a timer.
  1645 + * The caller must have disabled interrupts.
1670 1646 */
1671   -int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
  1647 +int rcu_needs_cpu(int cpu, unsigned long *dj)
1672 1648 {
1673 1649 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1674 1650  
1675   - /* Flag a new idle sojourn to the idle-entry state machine. */
1676   - rdtp->idle_first_pass = 1;
  1651 + /* Snapshot to detect later posting of non-lazy callback. */
  1652 + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
  1653 +
1677 1654 /* If no callbacks, RCU doesn't need the CPU. */
1678   - if (!rcu_cpu_has_callbacks(cpu)) {
1679   - *delta_jiffies = ULONG_MAX;
  1655 + if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
  1656 + *dj = ULONG_MAX;
1680 1657 return 0;
1681 1658 }
1682   - if (rdtp->dyntick_holdoff == jiffies) {
1683   - /* RCU recently tried and failed, so don't try again. */
1684   - *delta_jiffies = 1;
  1659 +
  1660 + /* Attempt to advance callbacks. */
  1661 + if (rcu_try_advance_all_cbs()) {
  1662 + /* Some ready to invoke, so initiate later invocation. */
  1663 + invoke_rcu_core();
1685 1664 return 1;
1686 1665 }
1687   - /* Set up for the possibility that RCU will post a timer. */
1688   - if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1689   - *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
1690   - RCU_IDLE_GP_DELAY) - jiffies;
  1666 + rdtp->last_accelerate = jiffies;
  1667 +
  1668 + /* Request timer delay depending on laziness, and round. */
  1669 + if (!rdtp->all_lazy) {
  1670 + *dj = round_up(rcu_idle_gp_delay + jiffies,
  1671 + rcu_idle_gp_delay) - jiffies;
1691 1672 } else {
1692   - *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
1693   - *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
  1673 + *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
1694 1674 }
1695 1675 return 0;
1696 1676 }
1697 1677  
1698 1678 /*
1699   - * Handler for smp_call_function_single(). The only point of this
1700   - * handler is to wake the CPU up, so the handler does only tracing.
1701   - */
1702   -void rcu_idle_demigrate(void *unused)
1703   -{
1704   - trace_rcu_prep_idle("Demigrate");
1705   -}
1706   -
1707   -/*
1708   - * Timer handler used to force CPU to start pushing its remaining RCU
1709   - * callbacks in the case where it entered dyntick-idle mode with callbacks
1710   - * pending. The hander doesn't really need to do anything because the
1711   - * real work is done upon re-entry to idle, or by the next scheduling-clock
1712   - * interrupt should idle not be re-entered.
  1679 + * Prepare a CPU for idle from an RCU perspective. The first major task
  1680 + * is to sense whether nohz mode has been enabled or disabled via sysfs.
  1681 + * The second major task is to check to see if a non-lazy callback has
  1682 + * arrived at a CPU that previously had only lazy callbacks. The third
  1683 + * major task is to accelerate (that is, assign grace-period numbers to)
  1684 + * any recently arrived callbacks.
1713 1685 *
1714   - * One special case: the timer gets migrated without awakening the CPU
1715   - * on which the timer was scheduled on. In this case, we must wake up
1716   - * that CPU. We do so with smp_call_function_single().
1717   - */
1718   -static void rcu_idle_gp_timer_func(unsigned long cpu_in)
1719   -{
1720   - int cpu = (int)cpu_in;
1721   -
1722   - trace_rcu_prep_idle("Timer");
1723   - if (cpu != smp_processor_id())
1724   - smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
1725   - else
1726   - WARN_ON_ONCE(1); /* Getting here can hang the system... */
1727   -}
1728   -
1729   -/*
1730   - * Initialize the timer used to pull CPUs out of dyntick-idle mode.
1731   - */
1732   -static void rcu_prepare_for_idle_init(int cpu)
1733   -{
1734   - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1735   -
1736   - rdtp->dyntick_holdoff = jiffies - 1;
1737   - setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
1738   - rdtp->idle_gp_timer_expires = jiffies - 1;
1739   - rdtp->idle_first_pass = 1;
1740   -}
1741   -
1742   -/*
1743   - * Clean up for exit from idle. Because we are exiting from idle, there
1744   - * is no longer any point to ->idle_gp_timer, so cancel it. This will
1745   - * do nothing if this timer is not active, so just cancel it unconditionally.
1746   - */
1747   -static void rcu_cleanup_after_idle(int cpu)
1748   -{
1749   - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1750   -
1751   - del_timer(&rdtp->idle_gp_timer);
1752   - trace_rcu_prep_idle("Cleanup after idle");
1753   - rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
1754   -}
1755   -
1756   -/*
1757   - * Check to see if any RCU-related work can be done by the current CPU,
1758   - * and if so, schedule a softirq to get it done. This function is part
1759   - * of the RCU implementation; it is -not- an exported member of the RCU API.
1760   - *
1761   - * The idea is for the current CPU to clear out all work required by the
1762   - * RCU core for the current grace period, so that this CPU can be permitted
1763   - * to enter dyntick-idle mode. In some cases, it will need to be awakened
1764   - * at the end of the grace period by whatever CPU ends the grace period.
1765   - * This allows CPUs to go dyntick-idle more quickly, and to reduce the
1766   - * number of wakeups by a modest integer factor.
1767   - *
1768   - * Because it is not legal to invoke rcu_process_callbacks() with irqs
1769   - * disabled, we do one pass of force_quiescent_state(), then do a
1770   - * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1771   - * later. The ->dyntick_drain field controls the sequencing.
1772   - *
1773 1686 * The caller must have disabled interrupts.
1774 1687 */
1775 1688 static void rcu_prepare_for_idle(int cpu)
1776 1689 {
1777   - struct timer_list *tp;
  1690 + struct rcu_data *rdp;
1778 1691 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
  1692 + struct rcu_node *rnp;
  1693 + struct rcu_state *rsp;
1779 1694 int tne;
1780 1695  
1781 1696 /* Handle nohz enablement switches conservatively. */
1782 1697 tne = ACCESS_ONCE(tick_nohz_enabled);
1783 1698 if (tne != rdtp->tick_nohz_enabled_snap) {
1784   - if (rcu_cpu_has_callbacks(cpu))
  1699 + if (rcu_cpu_has_callbacks(cpu, NULL))
1785 1700 invoke_rcu_core(); /* force nohz to see update. */
1786 1701 rdtp->tick_nohz_enabled_snap = tne;
1787 1702 return;
1788 1703  
1789 1704  
1790 1705  
1791 1706  
1792 1707  
1793 1708  
1794 1709  
1795 1710  
1796 1711  
1797 1712  
1798 1713  
1799 1714  
... ... @@ -1789,126 +1704,57 @@
1789 1704 if (!tne)
1790 1705 return;
1791 1706  
1792   - /* Adaptive-tick mode, where usermode execution is idle to RCU. */
1793   - if (!is_idle_task(current)) {
1794   - rdtp->dyntick_holdoff = jiffies - 1;
1795   - if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1796   - trace_rcu_prep_idle("User dyntick with callbacks");
1797   - rdtp->idle_gp_timer_expires =
1798   - round_up(jiffies + RCU_IDLE_GP_DELAY,
1799   - RCU_IDLE_GP_DELAY);
1800   - } else if (rcu_cpu_has_callbacks(cpu)) {
1801   - rdtp->idle_gp_timer_expires =
1802   - round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1803   - trace_rcu_prep_idle("User dyntick with lazy callbacks");
1804   - } else {
1805   - return;
1806   - }
1807   - tp = &rdtp->idle_gp_timer;
1808   - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
  1707 + /* If this is a no-CBs CPU, no callbacks, just return. */
  1708 + if (is_nocb_cpu(cpu))
1809 1709 return;
1810   - }
1811 1710  
1812 1711 /*
1813   - * If this is an idle re-entry, for example, due to use of
1814   - * RCU_NONIDLE() or the new idle-loop tracing API within the idle
1815   - * loop, then don't take any state-machine actions, unless the
1816   - * momentary exit from idle queued additional non-lazy callbacks.
1817   - * Instead, repost the ->idle_gp_timer if this CPU has callbacks
1818   - * pending.
  1712 + * If a non-lazy callback arrived at a CPU having only lazy
  1713 + * callbacks, invoke RCU core for the side-effect of recalculating
  1714 + * idle duration on re-entry to idle.
1819 1715 */
1820   - if (!rdtp->idle_first_pass &&
1821   - (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
1822   - if (rcu_cpu_has_callbacks(cpu)) {
1823   - tp = &rdtp->idle_gp_timer;
1824   - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1825   - }
  1716 + if (rdtp->all_lazy &&
  1717 + rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
  1718 + invoke_rcu_core();
1826 1719 return;
1827 1720 }
1828   - rdtp->idle_first_pass = 0;
1829   - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
1830 1721  
1831 1722 /*
1832   - * If there are no callbacks on this CPU, enter dyntick-idle mode.
1833   - * Also reset state to avoid prejudicing later attempts.
  1723 + * If we have not yet accelerated this jiffy, accelerate all
  1724 + * callbacks on this CPU.
1834 1725 */
1835   - if (!rcu_cpu_has_callbacks(cpu)) {
1836   - rdtp->dyntick_holdoff = jiffies - 1;
1837   - rdtp->dyntick_drain = 0;
1838   - trace_rcu_prep_idle("No callbacks");
  1726 + if (rdtp->last_accelerate == jiffies)
1839 1727 return;
  1728 + rdtp->last_accelerate = jiffies;
  1729 + for_each_rcu_flavor(rsp) {
  1730 + rdp = per_cpu_ptr(rsp->rda, cpu);
  1731 + if (!*rdp->nxttail[RCU_DONE_TAIL])
  1732 + continue;
  1733 + rnp = rdp->mynode;
  1734 + raw_spin_lock(&rnp->lock); /* irqs already disabled. */
  1735 + rcu_accelerate_cbs(rsp, rnp, rdp);
  1736 + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1840 1737 }
  1738 +}
1841 1739  
1842   - /*
1843   - * If in holdoff mode, just return. We will presumably have
1844   - * refrained from disabling the scheduling-clock tick.
1845   - */
1846   - if (rdtp->dyntick_holdoff == jiffies) {
1847   - trace_rcu_prep_idle("In holdoff");
1848   - return;
1849   - }
  1740 +/*
  1741 + * Clean up for exit from idle. Attempt to advance callbacks based on
  1742 + * any grace periods that elapsed while the CPU was idle, and if any
  1743 + * callbacks are now ready to invoke, initiate invocation.
  1744 + */
  1745 +static void rcu_cleanup_after_idle(int cpu)
  1746 +{
  1747 + struct rcu_data *rdp;
  1748 + struct rcu_state *rsp;
1850 1749  
1851   - /* Check and update the ->dyntick_drain sequencing. */
1852   - if (rdtp->dyntick_drain <= 0) {
1853   - /* First time through, initialize the counter. */
1854   - rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
1855   - } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
1856   - !rcu_pending(cpu) &&
1857   - !local_softirq_pending()) {
1858   - /* Can we go dyntick-idle despite still having callbacks? */
1859   - rdtp->dyntick_drain = 0;
1860   - rdtp->dyntick_holdoff = jiffies;
1861   - if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1862   - trace_rcu_prep_idle("Dyntick with callbacks");
1863   - rdtp->idle_gp_timer_expires =
1864   - round_up(jiffies + RCU_IDLE_GP_DELAY,
1865   - RCU_IDLE_GP_DELAY);
1866   - } else {
1867   - rdtp->idle_gp_timer_expires =
1868   - round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1869   - trace_rcu_prep_idle("Dyntick with lazy callbacks");
1870   - }
1871   - tp = &rdtp->idle_gp_timer;
1872   - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1873   - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1874   - return; /* Nothing more to do immediately. */
1875   - } else if (--(rdtp->dyntick_drain) <= 0) {
1876   - /* We have hit the limit, so time to give up. */
1877   - rdtp->dyntick_holdoff = jiffies;
1878   - trace_rcu_prep_idle("Begin holdoff");
1879   - invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
  1750 + if (is_nocb_cpu(cpu))
1880 1751 return;
  1752 + rcu_try_advance_all_cbs();
  1753 + for_each_rcu_flavor(rsp) {
  1754 + rdp = per_cpu_ptr(rsp->rda, cpu);
  1755 + if (cpu_has_callbacks_ready_to_invoke(rdp))
  1756 + invoke_rcu_core();
1881 1757 }
1882   -
1883   - /*
1884   - * Do one step of pushing the remaining RCU callbacks through
1885   - * the RCU core state machine.
1886   - */
1887   -#ifdef CONFIG_TREE_PREEMPT_RCU
1888   - if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
1889   - rcu_preempt_qs(cpu);
1890   - force_quiescent_state(&rcu_preempt_state);
1891   - }
1892   -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1893   - if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1894   - rcu_sched_qs(cpu);
1895   - force_quiescent_state(&rcu_sched_state);
1896   - }
1897   - if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1898   - rcu_bh_qs(cpu);
1899   - force_quiescent_state(&rcu_bh_state);
1900   - }
1901   -
1902   - /*
1903   - * If RCU callbacks are still pending, RCU still needs this CPU.
1904   - * So try forcing the callbacks through the grace period.
1905   - */
1906   - if (rcu_cpu_has_callbacks(cpu)) {
1907   - trace_rcu_prep_idle("More callbacks");
1908   - invoke_rcu_core();
1909   - } else {
1910   - trace_rcu_prep_idle("Callbacks drained");
1911   - }
1912 1758 }
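
On idle exit, rcu_cleanup_after_idle() tries to advance callbacks past any grace periods that ended while the CPU slept, then pokes the RCU core for each flavor with callbacks ready to invoke. A rough userspace model of that scan-and-kick shape, with invented flavor names and a made-up "ready" flag:

#include <stdbool.h>
#include <stdio.h>

/* Minimal stand-in for one flavor's per-CPU callback state. */
struct flavor {
	const char *name;
	bool cbs_ready;		/* callbacks advanced past the done tail? */
};

static struct flavor flavors[] = {
	{ "rcu_sched", false },
	{ "rcu_bh",    true  },
};

/* Model of invoke_rcu_core(): just record that processing was requested. */
static void kick_core(const char *why)
{
	printf("RCU core kicked: %s has ready callbacks\n", why);
}

/* Model of the idle-exit scan: check every flavor, kick for each ready one. */
static void cleanup_after_idle(void)
{
	for (unsigned int i = 0; i < sizeof(flavors) / sizeof(flavors[0]); i++)
		if (flavors[i].cbs_ready)
			kick_core(flavors[i].name);
}

int main(void)
{
	cleanup_after_idle();
	return 0;
}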
1913 1759  
1914 1760 /*
1915 1761  
... ... @@ -2015,16 +1861,13 @@
2015 1861 static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2016 1862 {
2017 1863 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2018   - struct timer_list *tltp = &rdtp->idle_gp_timer;
2019   - char c;
  1864 + unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
2020 1865  
2021   - c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
2022   - if (timer_pending(tltp))
2023   - sprintf(cp, "drain=%d %c timer=%lu",
2024   - rdtp->dyntick_drain, c, tltp->expires - jiffies);
2025   - else
2026   - sprintf(cp, "drain=%d %c timer not pending",
2027   - rdtp->dyntick_drain, c);
  1866 + sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
  1867 + rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
  1868 + ulong2long(nlpd),
  1869 + rdtp->all_lazy ? 'L' : '.',
  1870 + rdtp->tick_nohz_enabled_snap ? '.' : 'D');
2028 1871 }
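
The new stall-warning fragment is built from the low-order 16 bits of the two jiffies stamps, the non-lazy callback delta, and the 'L'/'D' flag characters. A small self-contained sketch that produces a line in the same format from made-up sample values; ulong2long is modeled here as a plain cast, and the variables stand in for the rcu_dynticks fields.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	/* Sample values standing in for the per-CPU rcu_dynticks fields. */
	unsigned long last_accelerate = 0x1b2c0;	/* jiffies of last acceleration */
	unsigned long jiffies_now     = 0x1d9f4;	/* current jiffies */
	unsigned long nonlazy_posted      = 117;
	unsigned long nonlazy_posted_snap = 100;
	bool all_lazy = false;
	bool tick_nohz_enabled = true;
	char buf[80];

	/* Same format as the stall warning: low 16 bits of each stamp in hex. */
	snprintf(buf, sizeof(buf),
		 "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
		 last_accelerate & 0xffff, jiffies_now & 0xffff,
		 (long)(nonlazy_posted - nonlazy_posted_snap),
		 all_lazy ? 'L' : '.',
		 tick_nohz_enabled ? '.' : 'D');
	printf("%s\n", buf);	/* -> "last_accelerate: b2c0/d9f4, nonlazy_posted: 17, .." */
	return 0;
}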
2029 1872  
2030 1873 #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
2031 1874  
... ... @@ -2070,10 +1913,11 @@
2070 1913 ticks_value = rsp->gpnum - rdp->gpnum;
2071 1914 }
2072 1915 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2073   - printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
  1916 + printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
2074 1917 cpu, ticks_value, ticks_title,
2075 1918 atomic_read(&rdtp->dynticks) & 0xfff,
2076 1919 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
  1920 + rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
2077 1921 fast_no_hz);
2078 1922 }

2079 1923  
... ... @@ -2087,6 +1931,7 @@
2087 1931 static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2088 1932 {
2089 1933 rdp->ticks_this_gp = 0;
  1934 + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
2090 1935 }
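
Snapshotting the RCU_SOFTIRQ count when a new grace period is noticed lets the stall warning show whether softirq handlers have run since then. A minimal model of the snapshot-and-compare idea, with illustrative counter names:

#include <stdio.h>

/* Stand-in for kstat_softirqs_cpu(RCU_SOFTIRQ, cpu): handlers run so far. */
static unsigned int softirqs_run;

/* Snapshot taken when this CPU last noticed a new grace period. */
static unsigned int softirq_snap;

static void note_new_gp(void)
{
	softirq_snap = softirqs_run;	/* as zero_cpu_stall_ticks() now does */
}

static void print_stall_info(void)
{
	/* Matches the "softirq=<snap>/<now>" portion of the stall warning. */
	printf("softirq=%u/%u\n", softirq_snap, softirqs_run);
}

int main(void)
{
	softirqs_run = 17;
	note_new_gp();		/* snapshot 17 at grace-period start */
	softirqs_run = 240;	/* handlers kept executing afterward */
	print_stall_info();	/* "softirq=17/240": softirqs still running */
	/* If the second number stopped increasing across repeated warnings,
	 * RCU's softirq handler would no longer be getting to run. */
	return 0;
}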
2091 1936  
2092 1937 /* Increment ->ticks_this_gp for all flavors of RCU. */
... ... @@ -2165,6 +2010,47 @@
2165 2010 }
2166 2011 early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2167 2012  
  2013 +/*
  2014 + * Do any no-CBs CPUs need another grace period?
  2015 + *
  2016 + * Interrupts must be disabled. If the caller does not hold the root
  2017 + * rcu_node structure's ->lock, the results are advisory only.
  2018 + */
  2019 +static int rcu_nocb_needs_gp(struct rcu_state *rsp)
  2020 +{
  2021 + struct rcu_node *rnp = rcu_get_root(rsp);
  2022 +
  2023 + return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
  2024 +}
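
The ->need_future_gp[] bookkeeping is a two-slot array indexed by the low bit of the grace-period number a request applies to, so requests for the next grace period and cleanup of the one that just ended never collide. A compact userspace model of that parity indexing; the helper names below are invented for the sketch.

#include <stdio.h>

/* Model of the root rcu_node fields used by the no-CBs code. */
static unsigned long completed;		/* number of the last completed GP */
static int need_future_gp[2];		/* requests, indexed by GP number & 0x1 */

/* A no-CBs kthread asks for the grace period after the current one. */
static void request_future_gp(void)
{
	need_future_gp[(completed + 1) & 0x1]++;
}

/* Like rcu_nocb_needs_gp(): is anyone waiting on the next grace period? */
static int nocb_needs_gp(void)
{
	return need_future_gp[(completed + 1) & 0x1];
}

/* Like grace-period cleanup: retire the slot for the GP that just ended. */
static void complete_gp(void)
{
	completed++;
	need_future_gp[completed & 0x1] = 0;	/* those requests are now satisfied */
}

int main(void)
{
	request_future_gp();
	printf("needs GP? %d\n", nocb_needs_gp());	/* 1: a request is pending */
	complete_gp();
	printf("needs GP? %d\n", nocb_needs_gp());	/* 0: slot was retired */
	return 0;
}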
  2025 +
  2026 +/*
  2027 + * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
  2028 + * grace period.
  2029 + */
  2030 +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
  2031 +{
  2032 + wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
  2033 +}
  2034 +
  2035 +/*
  2036 + * Set the root rcu_node structure's ->need_future_gp field
  2037 + * based on the sum of those of all rcu_node structures. This does
  2038 + * double-count the root rcu_node structure's requests, but this
  2039 + * is necessary to handle the possibility of a rcu_nocb_kthread()
  2040 + * having awakened during the time that the rcu_node structures
  2041 + * were being updated for the end of the previous grace period.
  2042 + */
  2043 +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
  2044 +{
  2045 + rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
  2046 +}
  2047 +
  2048 +static void rcu_init_one_nocb(struct rcu_node *rnp)
  2049 +{
  2050 + init_waitqueue_head(&rnp->nocb_gp_wq[0]);
  2051 + init_waitqueue_head(&rnp->nocb_gp_wq[1]);
  2052 +}
  2053 +
2168 2054 /* Is the specified CPU a no-CBs CPU? */
2169 2055 static bool is_nocb_cpu(int cpu)
2170 2056 {
... ... @@ -2227,6 +2113,13 @@
2227 2113 if (!is_nocb_cpu(rdp->cpu))
2228 2114 return 0;
2229 2115 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
  2116 + if (__is_kfree_rcu_offset((unsigned long)rhp->func))
  2117 + trace_rcu_kfree_callback(rdp->rsp->name, rhp,
  2118 + (unsigned long)rhp->func,
  2119 + rdp->qlen_lazy, rdp->qlen);
  2120 + else
  2121 + trace_rcu_callback(rdp->rsp->name, rhp,
  2122 + rdp->qlen_lazy, rdp->qlen);
2230 2123 return 1;
2231 2124 }
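
The tracing added here distinguishes kfree_rcu() callbacks, whose ->func field carries the byte offset of the rcu_head within the enclosing object rather than a function address; __is_kfree_rcu_offset() treats sufficiently small values as such offsets. A userspace sketch of that encoding trick, with invented type names and the offset bound assumed (here taken as 4096):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Model of an RCU callback: either a real function or a small encoded offset. */
struct cb {
	void (*func)(struct cb *cb);
};

struct item {
	int payload;
	struct cb head;		/* embedded callback, as with rcu_head */
};

static void item_free_cb(struct cb *cb)
{
	printf("ordinary callback invoked\n");
}

/* Values below this bound cannot be valid function addresses in this model,
 * mirroring the kernel's __is_kfree_rcu_offset() test (assumed bound: 4096). */
#define KFREE_OFFSET_BOUND 4096

static int is_kfree_offset(uintptr_t v)
{
	return v < KFREE_OFFSET_BOUND;
}

int main(void)
{
	struct cb normal = { .func = item_free_cb };
	struct cb byoffset = {
		.func = (void (*)(struct cb *))(uintptr_t)offsetof(struct item, head)
	};

	printf("normal: kfree-style? %d\n",
	       is_kfree_offset((uintptr_t)normal.func));	/* 0: real function */
	printf("byoffset: kfree-style? %d\n",
	       is_kfree_offset((uintptr_t)byoffset.func));	/* 1: encoded offset */
	return 0;
}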
2232 2125  
... ... @@ -2265,98 +2158,39 @@
2265 2158 }
2266 2159  
2267 2160 /*
2268   - * There must be at least one non-no-CBs CPU in operation at any given
2269   - * time, because no-CBs CPUs are not capable of initiating grace periods
2270   - * independently. This function therefore complains if the specified
2271   - * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2272   - * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2273   - * but you have to have a base case!)
  2161 + * If necessary, kick off a new grace period, and either way wait
  2162 + * for a subsequent grace period to complete.
2274 2163 */
2275   -static bool nocb_cpu_expendable(int cpu)
  2164 +static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2276 2165 {
2277   - cpumask_var_t non_nocb_cpus;
2278   - int ret;
  2166 + unsigned long c;
  2167 + bool d;
  2168 + unsigned long flags;
  2169 + struct rcu_node *rnp = rdp->mynode;
2279 2170  
  2171 + raw_spin_lock_irqsave(&rnp->lock, flags);
  2172 + c = rcu_start_future_gp(rnp, rdp);
  2173 + raw_spin_unlock_irqrestore(&rnp->lock, flags);
  2174 +
2280 2175 /*
2281   - * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
2282   - * then offlining this CPU is harmless. Let it happen.
  2176 + * Wait for the grace period. Do so interruptibly to avoid messing
  2177 + * up the load average.
2283 2178 */
2284   - if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
2285   - return 1;
2286   -
2287   - /* If no memory, play it safe and keep the CPU around. */
2288   - if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
2289   - return 0;
2290   - cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
2291   - cpumask_clear_cpu(cpu, non_nocb_cpus);
2292   - ret = !cpumask_empty(non_nocb_cpus);
2293   - free_cpumask_var(non_nocb_cpus);
2294   - return ret;
  2179 + trace_rcu_future_gp(rnp, rdp, c, "StartWait");
  2180 + for (;;) {
  2181 + wait_event_interruptible(
  2182 + rnp->nocb_gp_wq[c & 0x1],
  2183 + (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
  2184 + if (likely(d))
  2185 + break;
  2186 + flush_signals(current);
  2187 + trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
  2188 + }
  2189 + trace_rcu_future_gp(rnp, rdp, c, "EndWait");
  2190 + smp_mb(); /* Ensure that CB invocation happens after GP end. */
2295 2191 }
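
rcu_nocb_wait_gp() sleeps until the root rcu_node's ->completed value catches up with the requested grace-period number, using ULONG_CMP_GE() so the test stays correct if the counter wraps. A standalone sketch of that wraparound-safe comparison; the macro is re-derived here for illustration rather than copied from the kernel headers.

#include <limits.h>
#include <stdio.h>

/* Wraparound-safe "a >= b" for free-running counters, modeled on the
 * kernel's ULONG_CMP_GE(): the difference a - b is small iff a is not
 * behind b. */
#define CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long completed = ULONG_MAX - 1;	/* GP counter about to wrap */
	unsigned long c = completed + 3;		/* requested GP, past the wrap */

	printf("done yet? %d\n", CMP_GE(completed, c));	/* 0: keep waiting */

	completed += 3;					/* three grace periods elapse */
	printf("done yet? %d\n", CMP_GE(completed, c));	/* 1: wait is over */

	/* A naive "completed >= c" would have claimed completion immediately,
	 * because the wrapped target c is numerically tiny. */
	return 0;
}

The half-range test relies on the two counters never drifting more than half the counter space apart, which is the usual assumption for grace-period numbers handed to waiters.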
2296 2192  
2297 2193 /*
2298   - * Helper structure for remote registry of RCU callbacks.
2299   - * This is needed for when a no-CBs CPU needs to start a grace period.
2300   - * If it just invokes call_rcu(), the resulting callback will be queued,
2301   - * which can result in deadlock.
2302   - */
2303   -struct rcu_head_remote {
2304   - struct rcu_head *rhp;
2305   - call_rcu_func_t *crf;
2306   - void (*func)(struct rcu_head *rhp);
2307   -};
2308   -
2309   -/*
2310   - * Register a callback as specified by the rcu_head_remote struct.
2311   - * This function is intended to be invoked via smp_call_function_single().
2312   - */
2313   -static void call_rcu_local(void *arg)
2314   -{
2315   - struct rcu_head_remote *rhrp =
2316   - container_of(arg, struct rcu_head_remote, rhp);
2317   -
2318   - rhrp->crf(rhrp->rhp, rhrp->func);
2319   -}
2320   -
2321   -/*
2322   - * Set up an rcu_head_remote structure and then invoke call_rcu_local()
2323   - * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2324   - * smp_call_function_single().
2325   - */
2326   -static void invoke_crf_remote(struct rcu_head *rhp,
2327   - void (*func)(struct rcu_head *rhp),
2328   - call_rcu_func_t crf)
2329   -{
2330   - struct rcu_head_remote rhr;
2331   -
2332   - rhr.rhp = rhp;
2333   - rhr.crf = crf;
2334   - rhr.func = func;
2335   - smp_call_function_single(0, call_rcu_local, &rhr, 1);
2336   -}
2337   -
2338   -/*
2339   - * Helper functions to be passed to wait_rcu_gp(), each of which
2340   - * invokes invoke_crf_remote() to register a callback appropriately.
2341   - */
2342   -static void __maybe_unused
2343   -call_rcu_preempt_remote(struct rcu_head *rhp,
2344   - void (*func)(struct rcu_head *rhp))
2345   -{
2346   - invoke_crf_remote(rhp, func, call_rcu);
2347   -}
2348   -static void call_rcu_bh_remote(struct rcu_head *rhp,
2349   - void (*func)(struct rcu_head *rhp))
2350   -{
2351   - invoke_crf_remote(rhp, func, call_rcu_bh);
2352   -}
2353   -static void call_rcu_sched_remote(struct rcu_head *rhp,
2354   - void (*func)(struct rcu_head *rhp))
2355   -{
2356   - invoke_crf_remote(rhp, func, call_rcu_sched);
2357   -}
2358   -
2359   -/*
2360 2194 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2361 2195 * callbacks queued by the corresponding no-CBs CPU.
2362 2196 */
... ... @@ -2390,7 +2224,7 @@
2390 2224 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2391 2225 ACCESS_ONCE(rdp->nocb_p_count) += c;
2392 2226 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2393   - wait_rcu_gp(rdp->rsp->call_remote);
  2227 + rcu_nocb_wait_gp(rdp);
2394 2228  
2395 2229 /* Each pass through the following loop invokes a callback. */
2396 2230 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
2397 2231  
... ... @@ -2436,33 +2270,42 @@
2436 2270 return;
2437 2271 for_each_cpu(cpu, rcu_nocb_mask) {
2438 2272 rdp = per_cpu_ptr(rsp->rda, cpu);
2439   - t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
  2273 + t = kthread_run(rcu_nocb_kthread, rdp,
  2274 + "rcuo%c/%d", rsp->abbr, cpu);
2440 2275 BUG_ON(IS_ERR(t));
2441 2276 ACCESS_ONCE(rdp->nocb_kthread) = t;
2442 2277 }
2443 2278 }
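
With the "%c" added to the kthread name, each no-CBs kthread is tagged with its flavor's one-character abbreviation as well as its CPU number. A trivial sketch of the resulting names; the abbreviation characters ('p', 's', 'b') are an assumption here, since this hunk only shows rsp->abbr being passed through.

#include <stdio.h>

int main(void)
{
	/* Assumed flavor abbreviations; the diff only shows rsp->abbr in use. */
	char abbrs[] = { 'p', 's', 'b' };

	for (unsigned int f = 0; f < sizeof(abbrs); f++)
		for (int cpu = 0; cpu < 2; cpu++) {
			char comm[16];

			/* Same format string as the kthread_run() call above. */
			snprintf(comm, sizeof(comm), "rcuo%c/%d", abbrs[f], cpu);
			printf("%s\n", comm);	/* e.g. "rcuop/0", "rcuos/1" */
		}
	return 0;
}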
2444 2279  
2445 2280 /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2446   -static void init_nocb_callback_list(struct rcu_data *rdp)
  2281 +static bool init_nocb_callback_list(struct rcu_data *rdp)
2447 2282 {
2448 2283 if (rcu_nocb_mask == NULL ||
2449 2284 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2450   - return;
  2285 + return false;
2451 2286 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
  2287 + return true;
2452 2288 }
2453 2289  
2454   -/* Initialize the ->call_remote fields in the rcu_state structures. */
2455   -static void __init rcu_init_nocb(void)
  2290 +#else /* #ifdef CONFIG_RCU_NOCB_CPU */
  2291 +
  2292 +static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2456 2293 {
2457   -#ifdef CONFIG_PREEMPT_RCU
2458   - rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2459   -#endif /* #ifdef CONFIG_PREEMPT_RCU */
2460   - rcu_bh_state.call_remote = call_rcu_bh_remote;
2461   - rcu_sched_state.call_remote = call_rcu_sched_remote;
  2294 + return 0;
2462 2295 }
2463 2296  
2464   -#else /* #ifdef CONFIG_RCU_NOCB_CPU */
  2297 +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
  2298 +{
  2299 +}
2465 2300  
  2301 +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
  2302 +{
  2303 +}
  2304 +
  2305 +static void rcu_init_one_nocb(struct rcu_node *rnp)
  2306 +{
  2307 +}
  2308 +
2466 2309 static bool is_nocb_cpu(int cpu)
2467 2310 {
2468 2311 return false;
... ... @@ -2480,11 +2323,6 @@
2480 2323 return 0;
2481 2324 }
2482 2325  
2483   -static bool nocb_cpu_expendable(int cpu)
2484   -{
2485   - return 1;
2486   -}
2487   -
2488 2326 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2489 2327 {
2490 2328 }
2491 2329  
... ... @@ -2493,12 +2331,9 @@
2493 2331 {
2494 2332 }
2495 2333  
2496   -static void init_nocb_callback_list(struct rcu_data *rdp)
  2334 +static bool init_nocb_callback_list(struct rcu_data *rdp)
2497 2335 {
2498   -}
2499   -
2500   -static void __init rcu_init_nocb(void)
2501   -{
  2336 + return false;
2502 2337 }
2503 2338  
2504 2339 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
kernel/rcutree_trace.c
... ... @@ -46,8 +46,6 @@
46 46 #define RCU_TREE_NONCORE
47 47 #include "rcutree.h"
48 48  
49   -#define ulong2long(a) (*(long *)(&(a)))
50   -
51 49 static int r_open(struct inode *inode, struct file *file,
52 50 const struct seq_operations *op)
53 51 {