Commit 4446a36ff8c74ac3b32feb009b651048e129c6af

Authored by Paul E. McKenney
Committed by Ingo Molnar
1 parent 8b09dee67f

rcu: add call_rcu_sched()

Fourth cut of the patch to provide call_rcu_sched().  As before,
call_rcu_sched() is to synchronize_sched() as call_rcu() is to
synchronize_rcu().
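
For illustration, a minimal sketch of the intended pairing.  The
struct foo, free_foo_rcu(), and retire_foo() names below are
hypothetical and not part of this patch; only call_rcu_sched() and
synchronize_sched() are the real API:

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct foo {
		struct rcu_head rcu;
		int data;
	};

	/* Runs once all pre-existing preempt-disabled code has finished. */
	static void free_foo_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct foo, rcu));
	}

	/* Asynchronous: queue the callback and return immediately. */
	static void retire_foo(struct foo *old)
	{
		call_rcu_sched(&old->rcu, free_foo_rcu);
	}

	/* Synchronous equivalent: block for a full sched grace period. */
	static void retire_foo_sync(struct foo *old)
	{
		synchronize_sched();
		kfree(old);
	}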

Should be fine for experimental and -rt use, but not ready for inclusion.
With some luck, I will be able to tell Andrew to come out of hiding on
the next round.

Passes multi-day rcutorture sessions with concurrent CPU hotplugging.

Changes since the first version include a fix for a bug that could
result in indefinite blocking (spotted by Gautham Shenoy), better
resiliency against CPU-hotplug operations, and other minor fixes.

Changes since the second version include a rework of grace-period
detection to avoid deadlocks that could occur when running concurrently
with CPU hotplug, Mathieu's fix to avoid the softlockup messages, and
Mathieu's fix to allow use earlier in boot.

Changes since the third version include a fix for a wrong-CPU bug
spotted by Andrew, removal of the obsolete synchronize_kernel() API
that somehow snuck back in, merging of spin_unlock() and
local_irq_restore() into spin_unlock_irqrestore() in a few places,
comments added to the code that checks for quiescent states based on
interrupts taken from user-mode execution or the idle loop, removal of
some inline attributes, and some code-style changes.
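
The spin_unlock()/local_irq_restore() merge mentioned above is just the
usual combined primitive, as in the call_rcu() hunk in
kernel/rcupreempt.c below; roughly:

	/* Before: two calls to end the irq-disabled critical section. */
	spin_unlock(&rdp->lock);
	local_irq_restore(flags);

	/* After: one combined call with the same semantics. */
	spin_unlock_irqrestore(&rdp->lock, flags);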

Known/suspected shortcomings:

o	I still do not entirely trust the sleep/wakeup logic.  Next step
	will be to use a private snapshot of the CPU online mask in
	rcu_sched_grace_period() -- if the CPU wasn't there at the start
	of the grace period, we don't need to hear from it.  And the
	bit about accounting for changes in online CPUs inside of
	rcu_sched_grace_period() is ugly anyway.

o	It might be good for rcu_sched_grace_period() to invoke
	resched_cpu() when a given CPU wasn't responding quickly,
	but resched_cpu() is declared static...

This patch also fixes a long-standing bug in the earlier preemptable-RCU
implementation of synchronize_sched() that could result in loss of
concurrent external changes to a task's CPU affinity mask.  I still
cannot remember who reported this...
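
The race is easiest to see in the code this patch removes (the
__synchronize_sched() hunk in kernel/rcupreempt.c below), shown here
slightly simplified and with annotations added:

	cpumask_t oldmask;
	int cpu;

	if (sched_getaffinity(0, &oldmask) < 0)	/* snapshot caller's mask */
		oldmask = cpu_possible_map;
	for_each_online_cpu(cpu) {
		sched_setaffinity(0, &cpumask_of_cpu(cpu));
		schedule();
	}
	/*
	 * If some other task changed this task's affinity while the loop
	 * above was running, the restore below silently overwrites that
	 * change with the stale snapshot taken at entry.
	 */
	sched_setaffinity(0, &oldmask);

The replacement __synchronize_sched(), generated by synchronize_rcu_xxx()
from call_rcu_sched() and a completion, never touches the caller's
affinity mask, so this window is gone.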

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Showing 6 changed files with 434 additions and 68 deletions

include/linux/rcuclassic.h
... ... @@ -151,7 +151,10 @@
151 151  
152 152 #define __synchronize_sched() synchronize_rcu()
153 153  
  154 +#define call_rcu_sched(head, func) call_rcu(head, func)
  155 +
154 156 extern void __rcu_init(void);
  157 +#define rcu_init_sched() do { } while (0)
155 158 extern void rcu_check_callbacks(int cpu, int user);
156 159 extern void rcu_restart_cpu(int cpu);
157 160  
include/linux/rcupdate.h
... ... @@ -40,6 +40,7 @@
40 40 #include <linux/cpumask.h>
41 41 #include <linux/seqlock.h>
42 42 #include <linux/lockdep.h>
  43 +#include <linux/completion.h>
43 44  
44 45 /**
45 46 * struct rcu_head - callback structure for use with RCU
... ... @@ -167,6 +168,27 @@
167 168 smp_wmb(); \
168 169 (p) = (v); \
169 170 })
  171 +
  172 +/* Infrastructure to implement the synchronize_() primitives. */
  173 +
  174 +struct rcu_synchronize {
  175 + struct rcu_head head;
  176 + struct completion completion;
  177 +};
  178 +
  179 +extern void wakeme_after_rcu(struct rcu_head *head);
  180 +
  181 +#define synchronize_rcu_xxx(name, func) \
  182 +void name(void) \
  183 +{ \
  184 + struct rcu_synchronize rcu; \
  185 + \
  186 + init_completion(&rcu.completion); \
  187 + /* Will wake me after RCU finished. */ \
  188 + func(&rcu.head, wakeme_after_rcu); \
  189 + /* Wait for it. */ \
  190 + wait_for_completion(&rcu.completion); \
  191 +}
170 192  
171 193 /**
172 194 * synchronize_sched - block until all CPUs have exited any non-preemptive
include/linux/rcupreempt.h
... ... @@ -40,10 +40,39 @@
40 40 #include <linux/cpumask.h>
41 41 #include <linux/seqlock.h>
42 42  
43   -#define rcu_qsctr_inc(cpu)
  43 +struct rcu_dyntick_sched {
  44 + int dynticks;
  45 + int dynticks_snap;
  46 + int sched_qs;
  47 + int sched_qs_snap;
  48 + int sched_dynticks_snap;
  49 +};
  50 +
  51 +DECLARE_PER_CPU(struct rcu_dyntick_sched, rcu_dyntick_sched);
  52 +
  53 +static inline void rcu_qsctr_inc(int cpu)
  54 +{
  55 + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  56 +
  57 + rdssp->sched_qs++;
  58 +}
44 59 #define rcu_bh_qsctr_inc(cpu)
45 60 #define call_rcu_bh(head, rcu) call_rcu(head, rcu)
46 61  
  62 +/**
  63 + * call_rcu_sched - Queue RCU callback for invocation after sched grace period.
  64 + * @head: structure to be used for queueing the RCU updates.
  65 + * @func: actual update function to be invoked after the grace period
  66 + *
  67 + * The update function will be invoked some time after a full
  68 + * synchronize_sched()-style grace period elapses, in other words after
  69 + * all currently executing preempt-disabled sections of code (including
  70 + * hardirq handlers, NMI handlers, and local_irq_save() blocks) have
  71 + * completed.
  72 + */
  73 +extern void call_rcu_sched(struct rcu_head *head,
  74 + void (*func)(struct rcu_head *head));
  75 +
47 76 extern void __rcu_read_lock(void) __acquires(RCU);
48 77 extern void __rcu_read_unlock(void) __releases(RCU);
49 78 extern int rcu_pending(int cpu);
... ... @@ -55,6 +84,7 @@
55 84 extern void __synchronize_sched(void);
56 85  
57 86 extern void __rcu_init(void);
  87 +extern void rcu_init_sched(void);
58 88 extern void rcu_check_callbacks(int cpu, int user);
59 89 extern void rcu_restart_cpu(int cpu);
60 90 extern long rcu_batches_completed(void);
61 91  
62 92  
63 93  
... ... @@ -81,20 +111,20 @@
81 111 struct softirq_action;
82 112  
83 113 #ifdef CONFIG_NO_HZ
84   -DECLARE_PER_CPU(long, dynticks_progress_counter);
  114 +DECLARE_PER_CPU(struct rcu_dyntick_sched, rcu_dyntick_sched);
85 115  
86 116 static inline void rcu_enter_nohz(void)
87 117 {
88 118 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
89   - __get_cpu_var(dynticks_progress_counter)++;
90   - WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1);
  119 + __get_cpu_var(rcu_dyntick_sched).dynticks++;
  120 + WARN_ON(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1);
91 121 }
92 122  
93 123 static inline void rcu_exit_nohz(void)
94 124 {
95   - __get_cpu_var(dynticks_progress_counter)++;
96 125 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
97   - WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1));
  126 + __get_cpu_var(rcu_dyntick_sched).dynticks++;
  127 + WARN_ON(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1));
98 128 }
99 129  
100 130 #else /* CONFIG_NO_HZ */
init/main.c
... ... @@ -758,6 +758,7 @@
758 758 */
759 759 static void __init do_basic_setup(void)
760 760 {
  761 + rcu_init_sched(); /* needed by module_init stage. */
761 762 /* drivers will send hotplug events */
762 763 init_workqueues();
763 764 usermodehelper_init();
kernel/rcupdate.c
... ... @@ -39,18 +39,12 @@
39 39 #include <linux/sched.h>
40 40 #include <asm/atomic.h>
41 41 #include <linux/bitops.h>
42   -#include <linux/completion.h>
43 42 #include <linux/percpu.h>
44 43 #include <linux/notifier.h>
45 44 #include <linux/cpu.h>
46 45 #include <linux/mutex.h>
47 46 #include <linux/module.h>
48 47  
49   -struct rcu_synchronize {
50   - struct rcu_head head;
51   - struct completion completion;
52   -};
53   -
54 48 static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
55 49 static atomic_t rcu_barrier_cpu_count;
56 50 static DEFINE_MUTEX(rcu_barrier_mutex);
... ... @@ -60,7 +54,7 @@
60 54 * Awaken the corresponding synchronize_rcu() instance now that a
61 55 * grace period has elapsed.
62 56 */
63   -static void wakeme_after_rcu(struct rcu_head *head)
  57 +void wakeme_after_rcu(struct rcu_head *head)
64 58 {
65 59 struct rcu_synchronize *rcu;
66 60  
... ... @@ -77,17 +71,7 @@
77 71 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
78 72 * and may be nested.
79 73 */
80   -void synchronize_rcu(void)
81   -{
82   - struct rcu_synchronize rcu;
83   -
84   - init_completion(&rcu.completion);
85   - /* Will wake me after RCU finished */
86   - call_rcu(&rcu.head, wakeme_after_rcu);
87   -
88   - /* Wait for it */
89   - wait_for_completion(&rcu.completion);
90   -}
  74 +synchronize_rcu_xxx(synchronize_rcu, call_rcu)
91 75 EXPORT_SYMBOL_GPL(synchronize_rcu);
92 76  
93 77 static void rcu_barrier_callback(struct rcu_head *notused)
kernel/rcupreempt.c
... ... @@ -46,6 +46,7 @@
46 46 #include <asm/atomic.h>
47 47 #include <linux/bitops.h>
48 48 #include <linux/module.h>
  49 +#include <linux/kthread.h>
49 50 #include <linux/completion.h>
50 51 #include <linux/moduleparam.h>
51 52 #include <linux/percpu.h>
52 53  
... ... @@ -87,9 +88,14 @@
87 88 struct rcu_head **nexttail;
88 89 struct rcu_head *waitlist[GP_STAGES];
89 90 struct rcu_head **waittail[GP_STAGES];
90   - struct rcu_head *donelist;
  91 + struct rcu_head *donelist; /* from waitlist & waitschedlist */
91 92 struct rcu_head **donetail;
92 93 long rcu_flipctr[2];
  94 + struct rcu_head *nextschedlist;
  95 + struct rcu_head **nextschedtail;
  96 + struct rcu_head *waitschedlist;
  97 + struct rcu_head **waitschedtail;
  98 + int rcu_sched_sleeping;
93 99 #ifdef CONFIG_RCU_TRACE
94 100 struct rcupreempt_trace trace;
95 101 #endif /* #ifdef CONFIG_RCU_TRACE */
96 102  
... ... @@ -131,11 +137,24 @@
131 137 rcu_try_flip_waitmb_state,
132 138 };
133 139  
  140 +/*
  141 + * States for rcu_ctrlblk.rcu_sched_sleep.
  142 + */
  143 +
  144 +enum rcu_sched_sleep_states {
  145 + rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
  146 + rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
  147 + rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
  148 +};
  149 +
134 150 struct rcu_ctrlblk {
135 151 spinlock_t fliplock; /* Protect state-machine transitions. */
136 152 long completed; /* Number of last completed batch. */
137 153 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
138 154 the rcu state machine */
  155 + spinlock_t schedlock; /* Protect rcu_sched sleep state. */
  156 + enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
  157 + wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
139 158 };
140 159  
141 160 static DEFINE_PER_CPU(struct rcu_data, rcu_data);
142 161  
... ... @@ -143,8 +162,12 @@
143 162 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
144 163 .completed = 0,
145 164 .rcu_try_flip_state = rcu_try_flip_idle_state,
  165 + .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
  166 + .sched_sleep = rcu_sched_not_sleeping,
  167 + .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
146 168 };
147 169  
  170 +static struct task_struct *rcu_sched_grace_period_task;
148 171  
149 172 #ifdef CONFIG_RCU_TRACE
150 173 static char *rcu_try_flip_state_names[] =
... ... @@ -207,6 +230,8 @@
207 230 */
208 231 #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
209 232  
  233 +#define RCU_SCHED_BATCH_TIME (HZ / 50)
  234 +
210 235 /*
211 236 * Return the number of RCU batches processed thus far. Useful
212 237 * for debug and statistics.
213 238  
214 239  
215 240  
216 241  
217 242  
... ... @@ -411,32 +436,34 @@
411 436 }
412 437 }
413 438  
414   -#ifdef CONFIG_NO_HZ
  439 +DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
  440 + .dynticks = 1,
  441 +};
415 442  
416   -DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
417   -static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
  443 +#ifdef CONFIG_NO_HZ
418 444 static DEFINE_PER_CPU(int, rcu_update_flag);
419 445  
420 446 /**
421 447 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
422 448 *
423 449 * If the CPU was idle with dynamic ticks active, this updates the
424   - * dynticks_progress_counter to let the RCU handling know that the
  450 + * rcu_dyntick_sched.dynticks to let the RCU handling know that the
425 451 * CPU is active.
426 452 */
427 453 void rcu_irq_enter(void)
428 454 {
429 455 int cpu = smp_processor_id();
  456 + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
430 457  
431 458 if (per_cpu(rcu_update_flag, cpu))
432 459 per_cpu(rcu_update_flag, cpu)++;
433 460  
434 461 /*
435 462 * Only update if we are coming from a stopped ticks mode
436   - * (dynticks_progress_counter is even).
  463 + * (rcu_dyntick_sched.dynticks is even).
437 464 */
438 465 if (!in_interrupt() &&
439   - (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
  466 + (rdssp->dynticks & 0x1) == 0) {
440 467 /*
441 468 * The following might seem like we could have a race
442 469 * with NMI/SMIs. But this really isn't a problem.
443 470  
... ... @@ -459,12 +486,12 @@
459 486 * RCU read-side critical sections on this CPU would
460 487 * have already completed.
461 488 */
462   - per_cpu(dynticks_progress_counter, cpu)++;
  489 + rdssp->dynticks++;
463 490 /*
464 491 * The following memory barrier ensures that any
465 492 * rcu_read_lock() primitives in the irq handler
466 493 * are seen by other CPUs to follow the above
467   - * increment to dynticks_progress_counter. This is
  494 + * increment to rcu_dyntick_sched.dynticks. This is
468 495 * required in order for other CPUs to correctly
469 496 * determine when it is safe to advance the RCU
470 497 * grace-period state machine.
... ... @@ -472,7 +499,7 @@
472 499 smp_mb(); /* see above block comment. */
473 500 /*
474 501 * Since we can't determine the dynamic tick mode from
475   - * the dynticks_progress_counter after this routine,
  502 + * the rcu_dyntick_sched.dynticks after this routine,
476 503 * we use a second flag to acknowledge that we came
477 504 * from an idle state with ticks stopped.
478 505 */
... ... @@ -480,7 +507,7 @@
480 507 /*
481 508 * If we take an NMI/SMI now, they will also increment
482 509 * the rcu_update_flag, and will not update the
483   - * dynticks_progress_counter on exit. That is for
  510 + * rcu_dyntick_sched.dynticks on exit. That is for
484 511 * this IRQ to do.
485 512 */
486 513 }
487 514  
... ... @@ -490,12 +517,13 @@
490 517 * rcu_irq_exit - Called from exiting Hard irq context.
491 518 *
492 519 * If the CPU was idle with dynamic ticks active, update the
493   - * dynticks_progress_counter to put let the RCU handling be
  520 + * rcu_dyntick_sched.dynticks to put let the RCU handling be
494 521 * aware that the CPU is going back to idle with no ticks.
495 522 */
496 523 void rcu_irq_exit(void)
497 524 {
498 525 int cpu = smp_processor_id();
  526 + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
499 527  
500 528 /*
501 529 * rcu_update_flag is set if we interrupted the CPU
... ... @@ -503,7 +531,7 @@
503 531 * Once this occurs, we keep track of interrupt nesting
504 532 * because a NMI/SMI could also come in, and we still
505 533 * only want the IRQ that started the increment of the
506   - * dynticks_progress_counter to be the one that modifies
  534 + * rcu_dyntick_sched.dynticks to be the one that modifies
507 535 * it on exit.
508 536 */
509 537 if (per_cpu(rcu_update_flag, cpu)) {
510 538  
511 539  
512 540  
... ... @@ -515,28 +543,29 @@
515 543  
516 544 /*
517 545 * If an NMI/SMI happens now we are still
518   - * protected by the dynticks_progress_counter being odd.
  546 + * protected by the rcu_dyntick_sched.dynticks being odd.
519 547 */
520 548  
521 549 /*
522 550 * The following memory barrier ensures that any
523 551 * rcu_read_unlock() primitives in the irq handler
524 552 * are seen by other CPUs to preceed the following
525   - * increment to dynticks_progress_counter. This
  553 + * increment to rcu_dyntick_sched.dynticks. This
526 554 * is required in order for other CPUs to determine
527 555 * when it is safe to advance the RCU grace-period
528 556 * state machine.
529 557 */
530 558 smp_mb(); /* see above block comment. */
531   - per_cpu(dynticks_progress_counter, cpu)++;
532   - WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
  559 + rdssp->dynticks++;
  560 + WARN_ON(rdssp->dynticks & 0x1);
533 561 }
534 562 }
535 563  
536 564 static void dyntick_save_progress_counter(int cpu)
537 565 {
538   - per_cpu(rcu_dyntick_snapshot, cpu) =
539   - per_cpu(dynticks_progress_counter, cpu);
  566 + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  567 +
  568 + rdssp->dynticks_snap = rdssp->dynticks;
540 569 }
541 570  
542 571 static inline int
543 572  
... ... @@ -544,9 +573,10 @@
544 573 {
545 574 long curr;
546 575 long snap;
  576 + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
547 577  
548   - curr = per_cpu(dynticks_progress_counter, cpu);
549   - snap = per_cpu(rcu_dyntick_snapshot, cpu);
  578 + curr = rdssp->dynticks;
  579 + snap = rdssp->dynticks_snap;
550 580 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
551 581  
552 582 /*
553 583  
... ... @@ -580,9 +610,10 @@
580 610 {
581 611 long curr;
582 612 long snap;
  613 + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
583 614  
584   - curr = per_cpu(dynticks_progress_counter, cpu);
585   - snap = per_cpu(rcu_dyntick_snapshot, cpu);
  615 + curr = rdssp->dynticks;
  616 + snap = rdssp->dynticks_snap;
586 617 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
587 618  
588 619 /*
589 620  
590 621  
591 622  
... ... @@ -609,14 +640,86 @@
609 640 return 1;
610 641 }
611 642  
  643 +static void dyntick_save_progress_counter_sched(int cpu)
  644 +{
  645 + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  646 +
  647 + rdssp->sched_dynticks_snap = rdssp->dynticks;
  648 +}
  649 +
  650 +static int rcu_qsctr_inc_needed_dyntick(int cpu)
  651 +{
  652 + long curr;
  653 + long snap;
  654 + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  655 +
  656 + curr = rdssp->dynticks;
  657 + snap = rdssp->sched_dynticks_snap;
  658 + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  659 +
  660 + /*
  661 + * If the CPU remained in dynticks mode for the entire time
  662 + * and didn't take any interrupts, NMIs, SMIs, or whatever,
  663 + * then it cannot be in the middle of an rcu_read_lock(), so
  664 + * the next rcu_read_lock() it executes must use the new value
  665 + * of the counter. Therefore, this CPU has been in a quiescent
  666 + * state the entire time, and we don't need to wait for it.
  667 + */
  668 +
  669 + if ((curr == snap) && ((curr & 0x1) == 0))
  670 + return 0;
  671 +
  672 + /*
  673 + * If the CPU passed through or entered a dynticks idle phase with
  674 + * no active irq handlers, then, as above, this CPU has already
  675 + * passed through a quiescent state.
  676 + */
  677 +
  678 + if ((curr - snap) > 2 || (snap & 0x1) == 0)
  679 + return 0;
  680 +
  681 + /* We need this CPU to go through a quiescent state. */
  682 +
  683 + return 1;
  684 +}
  685 +
612 686 #else /* !CONFIG_NO_HZ */
613 687  
614   -# define dyntick_save_progress_counter(cpu) do { } while (0)
615   -# define rcu_try_flip_waitack_needed(cpu) (1)
616   -# define rcu_try_flip_waitmb_needed(cpu) (1)
  688 +# define dyntick_save_progress_counter(cpu) do { } while (0)
  689 +# define rcu_try_flip_waitack_needed(cpu) (1)
  690 +# define rcu_try_flip_waitmb_needed(cpu) (1)
617 691  
  692 +# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
  693 +# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
  694 +
618 695 #endif /* CONFIG_NO_HZ */
619 696  
  697 +static void save_qsctr_sched(int cpu)
  698 +{
  699 + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  700 +
  701 + rdssp->sched_qs_snap = rdssp->sched_qs;
  702 +}
  703 +
  704 +static inline int rcu_qsctr_inc_needed(int cpu)
  705 +{
  706 + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  707 +
  708 + /*
  709 + * If there has been a quiescent state, no more need to wait
  710 + * on this CPU.
  711 + */
  712 +
  713 + if (rdssp->sched_qs != rdssp->sched_qs_snap) {
  714 + smp_mb(); /* force ordering with cpu entering schedule(). */
  715 + return 0;
  716 + }
  717 +
  718 + /* We need this CPU to go through a quiescent state. */
  719 +
  720 + return 1;
  721 +}
  722 +
620 723 /*
621 724 * Get here when RCU is idle. Decide whether we need to
622 725 * move out of idle state, and return non-zero if so.
... ... @@ -819,6 +922,26 @@
819 922 unsigned long flags;
820 923 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
821 924  
  925 + /*
  926 + * If this CPU took its interrupt from user mode or from the
  927 + * idle loop, and this is not a nested interrupt, then
  928 + * this CPU has to have exited all prior preept-disable
  929 + * sections of code. So increment the counter to note this.
  930 + *
  931 + * The memory barrier is needed to handle the case where
  932 + * writes from a preempt-disable section of code get reordered
  933 + * into schedule() by this CPU's write buffer. So the memory
  934 + * barrier makes sure that the rcu_qsctr_inc() is seen by other
  935 + * CPUs to happen after any such write.
  936 + */
  937 +
  938 + if (user ||
  939 + (idle_cpu(cpu) && !in_softirq() &&
  940 + hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
  941 + smp_mb(); /* Guard against aggressive schedule(). */
  942 + rcu_qsctr_inc(cpu);
  943 + }
  944 +
822 945 rcu_check_mb(cpu);
823 946 if (rcu_ctrlblk.completed == rdp->completed)
824 947 rcu_try_flip();
... ... @@ -869,6 +992,8 @@
869 992 struct rcu_head *list = NULL;
870 993 unsigned long flags;
871 994 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  995 + struct rcu_head *schedlist = NULL;
  996 + struct rcu_head **schedtail = &schedlist;
872 997 struct rcu_head **tail = &list;
873 998  
874 999 /*
... ... @@ -882,6 +1007,11 @@
882 1007 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
883 1008 list, tail);
884 1009 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
  1010 + rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
  1011 + schedlist, schedtail);
  1012 + rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
  1013 + schedlist, schedtail);
  1014 + rdp->rcu_sched_sleeping = 0;
885 1015 spin_unlock_irqrestore(&rdp->lock, flags);
886 1016 rdp->waitlistcount = 0;
887 1017  
888 1018  
889 1019  
890 1020  
... ... @@ -916,22 +1046,40 @@
916 1046 * fix.
917 1047 */
918 1048  
919   - local_irq_save(flags);
  1049 + local_irq_save(flags); /* disable preempt till we know what lock. */
920 1050 rdp = RCU_DATA_ME();
921 1051 spin_lock(&rdp->lock);
922 1052 *rdp->nexttail = list;
923 1053 if (list)
924 1054 rdp->nexttail = tail;
  1055 + *rdp->nextschedtail = schedlist;
  1056 + if (schedlist)
  1057 + rdp->nextschedtail = schedtail;
925 1058 spin_unlock_irqrestore(&rdp->lock, flags);
926 1059 }
927 1060  
928 1061 void __devinit rcu_online_cpu(int cpu)
929 1062 {
930 1063 unsigned long flags;
  1064 + struct rcu_data *rdp;
931 1065  
932 1066 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
933 1067 cpu_set(cpu, rcu_cpu_online_map);
934 1068 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
  1069 +
  1070 + /*
  1071 + * The rcu_sched grace-period processing might have bypassed
  1072 + * this CPU, given that it was not in the rcu_cpu_online_map
  1073 + * when the grace-period scan started. This means that the
  1074 + * grace-period task might sleep. So make sure that if this
  1075 + * should happen, the first callback posted to this CPU will
  1076 + * wake up the grace-period task if need be.
  1077 + */
  1078 +
  1079 + rdp = RCU_DATA_CPU(cpu);
  1080 + spin_lock_irqsave(&rdp->lock, flags);
  1081 + rdp->rcu_sched_sleeping = 1;
  1082 + spin_unlock_irqrestore(&rdp->lock, flags);
935 1083 }
936 1084  
937 1085 #else /* #ifdef CONFIG_HOTPLUG_CPU */
938 1086  
939 1087  
940 1088  
941 1089  
942 1090  
943 1091  
... ... @@ -986,31 +1134,196 @@
986 1134 *rdp->nexttail = head;
987 1135 rdp->nexttail = &head->next;
988 1136 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
989   - spin_unlock(&rdp->lock);
990   - local_irq_restore(flags);
  1137 + spin_unlock_irqrestore(&rdp->lock, flags);
991 1138 }
992 1139 EXPORT_SYMBOL_GPL(call_rcu);
993 1140  
  1141 +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
  1142 +{
  1143 + unsigned long flags;
  1144 + struct rcu_data *rdp;
  1145 + int wake_gp = 0;
  1146 +
  1147 + head->func = func;
  1148 + head->next = NULL;
  1149 + local_irq_save(flags);
  1150 + rdp = RCU_DATA_ME();
  1151 + spin_lock(&rdp->lock);
  1152 + *rdp->nextschedtail = head;
  1153 + rdp->nextschedtail = &head->next;
  1154 + if (rdp->rcu_sched_sleeping) {
  1155 +
  1156 + /* Grace-period processing might be sleeping... */
  1157 +
  1158 + rdp->rcu_sched_sleeping = 0;
  1159 + wake_gp = 1;
  1160 + }
  1161 + spin_unlock_irqrestore(&rdp->lock, flags);
  1162 + if (wake_gp) {
  1163 +
  1164 + /* Wake up grace-period processing, unless someone beat us. */
  1165 +
  1166 + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
  1167 + if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
  1168 + wake_gp = 0;
  1169 + rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
  1170 + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
  1171 + if (wake_gp)
  1172 + wake_up_interruptible(&rcu_ctrlblk.sched_wq);
  1173 + }
  1174 +}
  1175 +EXPORT_SYMBOL_GPL(call_rcu_sched);
  1176 +
994 1177 /*
995 1178 * Wait until all currently running preempt_disable() code segments
996 1179 * (including hardware-irq-disable segments) complete. Note that
997 1180 * in -rt this does -not- necessarily result in all currently executing
998 1181 * interrupt -handlers- having completed.
999 1182 */
1000   -void __synchronize_sched(void)
  1183 +synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched)
  1184 +EXPORT_SYMBOL_GPL(__synchronize_sched);
  1185 +
  1186 +/*
  1187 + * kthread function that manages call_rcu_sched grace periods.
  1188 + */
  1189 +static int rcu_sched_grace_period(void *arg)
1001 1190 {
1002   - cpumask_t oldmask;
  1191 + int couldsleep; /* might sleep after current pass. */
  1192 + int couldsleepnext = 0; /* might sleep after next pass. */
1003 1193 int cpu;
  1194 + unsigned long flags;
  1195 + struct rcu_data *rdp;
  1196 + int ret;
1004 1197  
1005   - if (sched_getaffinity(0, &oldmask) < 0)
1006   - oldmask = cpu_possible_map;
1007   - for_each_online_cpu(cpu) {
1008   - sched_setaffinity(0, &cpumask_of_cpu(cpu));
1009   - schedule();
1010   - }
1011   - sched_setaffinity(0, &oldmask);
  1198 + /*
  1199 + * Each pass through the following loop handles one
  1200 + * rcu_sched grace period cycle.
  1201 + */
  1202 + do {
  1203 + /* Save each CPU's current state. */
  1204 +
  1205 + for_each_online_cpu(cpu) {
  1206 + dyntick_save_progress_counter_sched(cpu);
  1207 + save_qsctr_sched(cpu);
  1208 + }
  1209 +
  1210 + /*
  1211 + * Sleep for about an RCU grace-period's worth to
  1212 + * allow better batching and to consume less CPU.
  1213 + */
  1214 + schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
  1215 +
  1216 + /*
  1217 + * If there was nothing to do last time, prepare to
  1218 + * sleep at the end of the current grace period cycle.
  1219 + */
  1220 + couldsleep = couldsleepnext;
  1221 + couldsleepnext = 1;
  1222 + if (couldsleep) {
  1223 + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
  1224 + rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
  1225 + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
  1226 + }
  1227 +
  1228 + /*
  1229 + * Wait on each CPU in turn to have either visited
  1230 + * a quiescent state or been in dynticks-idle mode.
  1231 + */
  1232 + for_each_online_cpu(cpu) {
  1233 + while (rcu_qsctr_inc_needed(cpu) &&
  1234 + rcu_qsctr_inc_needed_dyntick(cpu)) {
  1235 + /* resched_cpu(cpu); @@@ */
  1236 + schedule_timeout_interruptible(1);
  1237 + }
  1238 + }
  1239 +
  1240 + /* Advance callbacks for each CPU. */
  1241 +
  1242 + for_each_online_cpu(cpu) {
  1243 +
  1244 + rdp = RCU_DATA_CPU(cpu);
  1245 + spin_lock_irqsave(&rdp->lock, flags);
  1246 +
  1247 + /*
  1248 + * We are running on this CPU irq-disabled, so no
  1249 + * CPU can go offline until we re-enable irqs.
  1250 + * The current CPU might have already gone
  1251 + * offline (between the for_each_offline_cpu and
  1252 + * the spin_lock_irqsave), but in that case all its
  1253 + * callback lists will be empty, so no harm done.
  1254 + *
  1255 + * Advance the callbacks! We share normal RCU's
  1256 + * donelist, since callbacks are invoked the
  1257 + * same way in either case.
  1258 + */
  1259 + if (rdp->waitschedlist != NULL) {
  1260 + *rdp->donetail = rdp->waitschedlist;
  1261 + rdp->donetail = rdp->waitschedtail;
  1262 +
  1263 + /*
  1264 + * Next rcu_check_callbacks() will
  1265 + * do the required raise_softirq().
  1266 + */
  1267 + }
  1268 + if (rdp->nextschedlist != NULL) {
  1269 + rdp->waitschedlist = rdp->nextschedlist;
  1270 + rdp->waitschedtail = rdp->nextschedtail;
  1271 + couldsleep = 0;
  1272 + couldsleepnext = 0;
  1273 + } else {
  1274 + rdp->waitschedlist = NULL;
  1275 + rdp->waitschedtail = &rdp->waitschedlist;
  1276 + }
  1277 + rdp->nextschedlist = NULL;
  1278 + rdp->nextschedtail = &rdp->nextschedlist;
  1279 +
  1280 + /* Mark sleep intention. */
  1281 +
  1282 + rdp->rcu_sched_sleeping = couldsleep;
  1283 +
  1284 + spin_unlock_irqrestore(&rdp->lock, flags);
  1285 + }
  1286 +
  1287 + /* If we saw callbacks on the last scan, go deal with them. */
  1288 +
  1289 + if (!couldsleep)
  1290 + continue;
  1291 +
  1292 + /* Attempt to block... */
  1293 +
  1294 + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
  1295 + if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
  1296 +
  1297 + /*
  1298 + * Someone posted a callback after we scanned.
  1299 + * Go take care of it.
  1300 + */
  1301 + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
  1302 + couldsleepnext = 0;
  1303 + continue;
  1304 + }
  1305 +
  1306 + /* Block until the next person posts a callback. */
  1307 +
  1308 + rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
  1309 + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
  1310 + ret = 0;
  1311 + __wait_event_interruptible(rcu_ctrlblk.sched_wq,
  1312 + rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
  1313 + ret);
  1314 +
  1315 + /*
  1316 + * Signals would prevent us from sleeping, and we cannot
  1317 + * do much with them in any case. So flush them.
  1318 + */
  1319 + if (ret)
  1320 + flush_signals(current);
  1321 + couldsleepnext = 0;
  1322 +
  1323 + } while (!kthread_should_stop());
  1324 +
  1325 + return (0);
1012 1326 }
1013   -EXPORT_SYMBOL_GPL(__synchronize_sched);
1014 1327  
1015 1328 /*
1016 1329 * Check to see if any future RCU-related work will need to be done
... ... @@ -1027,7 +1340,9 @@
1027 1340  
1028 1341 return (rdp->donelist != NULL ||
1029 1342 !!rdp->waitlistcount ||
1030   - rdp->nextlist != NULL);
  1343 + rdp->nextlist != NULL ||
  1344 + rdp->nextschedlist != NULL ||
  1345 + rdp->waitschedlist != NULL);
1031 1346 }
1032 1347  
1033 1348 int rcu_pending(int cpu)
... ... @@ -1038,7 +1353,9 @@
1038 1353  
1039 1354 if (rdp->donelist != NULL ||
1040 1355 !!rdp->waitlistcount ||
1041   - rdp->nextlist != NULL)
  1356 + rdp->nextlist != NULL ||
  1357 + rdp->nextschedlist != NULL ||
  1358 + rdp->waitschedlist != NULL)
1042 1359 return 1;
1043 1360  
1044 1361 /* The RCU core needs an acknowledgement from this CPU. */
... ... @@ -1105,6 +1422,11 @@
1105 1422 rdp->donetail = &rdp->donelist;
1106 1423 rdp->rcu_flipctr[0] = 0;
1107 1424 rdp->rcu_flipctr[1] = 0;
  1425 + rdp->nextschedlist = NULL;
  1426 + rdp->nextschedtail = &rdp->nextschedlist;
  1427 + rdp->waitschedlist = NULL;
  1428 + rdp->waitschedtail = &rdp->waitschedlist;
  1429 + rdp->rcu_sched_sleeping = 0;
1108 1430 }
1109 1431 register_cpu_notifier(&rcu_nb);
1110 1432  
1111 1433  
1112 1434  
... ... @@ -1127,11 +1449,15 @@
1127 1449 }
1128 1450  
1129 1451 /*
1130   - * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
  1452 + * Late-boot-time RCU initialization that must wait until after scheduler
  1453 + * has been initialized.
1131 1454 */
1132   -void synchronize_kernel(void)
  1455 +void __init rcu_init_sched(void)
1133 1456 {
1134   - synchronize_rcu();
  1457 + rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
  1458 + NULL,
  1459 + "rcu_sched_grace_period");
  1460 + WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1135 1461 }
1136 1462  
1137 1463 #ifdef CONFIG_RCU_TRACE