Commit 2232c2d8e0a6a31061dec311f3d1cf7624bc14f1

Authored by Steven Rostedt
Committed by Ingo Molnar
1 parent c0f4133b8f

rcu: add support for dynamic ticks and preempt rcu

PREEMPT-RCU can get stuck if a CPU goes idle while NO_HZ is set. The
idle CPU does not push RCU through its grace period, so a
synchronize_rcu() may block forever. Without this patch I have a box that
will not boot when PREEMPT_RCU and NO_HZ are set; that same box boots
fine with this patch.

This patch comes from the -rt kernel, where it has been tested for
several months.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 6 changed files with 259 additions and 4 deletions

include/linux/hardirq.h
... ... @@ -109,6 +109,14 @@
109 109 }
110 110 #endif
111 111  
  112 +#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
  113 +extern void rcu_irq_enter(void);
  114 +extern void rcu_irq_exit(void);
  115 +#else
  116 +# define rcu_irq_enter() do { } while (0)
  117 +# define rcu_irq_exit() do { } while (0)
  118 +#endif /* CONFIG_PREEMPT_RCU */
  119 +
112 120 /*
113 121 * It is safe to do non-atomic ops on ->hardirq_context,
114 122 * because NMI handlers may not preempt and the ops are
... ... @@ -117,6 +125,7 @@
117 125 */
118 126 #define __irq_enter() \
119 127 do { \
  128 + rcu_irq_enter(); \
120 129 account_system_vtime(current); \
121 130 add_preempt_count(HARDIRQ_OFFSET); \
122 131 trace_hardirq_enter(); \
... ... @@ -135,6 +144,7 @@
135 144 trace_hardirq_exit(); \
136 145 account_system_vtime(current); \
137 146 sub_preempt_count(HARDIRQ_OFFSET); \
  147 + rcu_irq_exit(); \
138 148 } while (0)
139 149  
140 150 /*
include/linux/rcuclassic.h
... ... @@ -160,6 +160,9 @@
160 160 extern long rcu_batches_completed(void);
161 161 extern long rcu_batches_completed_bh(void);
162 162  
  163 +#define rcu_enter_nohz() do { } while (0)
  164 +#define rcu_exit_nohz() do { } while (0)
  165 +
163 166 #endif /* __KERNEL__ */
164 167 #endif /* __LINUX_RCUCLASSIC_H */
include/linux/rcupreempt.h
... ... @@ -82,6 +82,28 @@
82 82  
83 83 struct softirq_action;
84 84  
  85 +#ifdef CONFIG_NO_HZ
  86 +DECLARE_PER_CPU(long, dynticks_progress_counter);
  87 +
  88 +static inline void rcu_enter_nohz(void)
  89 +{
  90 + __get_cpu_var(dynticks_progress_counter)++;
  91 + WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1);
  92 + mb();
  93 +}
  94 +
  95 +static inline void rcu_exit_nohz(void)
  96 +{
  97 + mb();
  98 + __get_cpu_var(dynticks_progress_counter)++;
  99 + WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1));
  100 +}
  101 +
  102 +#else /* CONFIG_NO_HZ */
  103 +#define rcu_enter_nohz() do { } while (0)
  104 +#define rcu_exit_nohz() do { } while (0)
  105 +#endif /* CONFIG_NO_HZ */
  106 +
85 107 #endif /* __KERNEL__ */
86 108 #endif /* __LINUX_RCUPREEMPT_H */
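
The even/odd discipline encoded by rcu_enter_nohz() and rcu_exit_nohz() above is easy to model in isolation. The following stand-alone user-space sketch is an illustration only (single-threaded, so the kernel's mb() calls degenerate to comments; the model_* names are invented for this example): the per-CPU counter starts odd while the CPU is active, goes even when the CPU enters dynticks idle, and goes odd again when it leaves, so a sampled even value tells a remote observer that the CPU cannot be inside an RCU read-side critical section.

#include <assert.h>
#include <stdio.h>

static long dynticks_progress_counter = 1;	/* odd: CPU starts out active */

static void model_rcu_enter_nohz(void)
{
	dynticks_progress_counter++;		/* odd -> even: ticks stopped */
	assert((dynticks_progress_counter & 0x1) == 0);
	/* the kernel issues mb() here; a single-threaded model needs none */
}

static void model_rcu_exit_nohz(void)
{
	/* the kernel issues mb() here; a single-threaded model needs none */
	dynticks_progress_counter++;		/* even -> odd: ticks restarted */
	assert(dynticks_progress_counter & 0x1);
}

int main(void)
{
	model_rcu_enter_nohz();			/* CPU goes idle, tick stopped */
	printf("idle:   counter=%ld (even => quiescent)\n",
	       dynticks_progress_counter);
	model_rcu_exit_nohz();			/* CPU resumes, tick restarted */
	printf("active: counter=%ld (odd  => must be waited on)\n",
	       dynticks_progress_counter);
	return 0;
}

Compiled with a plain cc, the model prints the counter parity after each transition; the real functions additionally use memory barriers to order the counter updates against the read-side critical sections around them.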
kernel/rcupreempt.c
... ... @@ -23,6 +23,10 @@
23 23 * to Suparna Bhattacharya for pushing me completely away
24 24 * from atomic instructions on the read side.
25 25 *
  26 + * - Added handling of Dynamic Ticks
  27 + * Copyright 2007 - Paul E. McKenney <paulmck@us.ibm.com>
  28 + * - Steven Rostedt <srostedt@redhat.com>
  29 + *
26 30 * Papers: http://www.rdrop.com/users/paulmck/RCU
27 31 *
28 32 * Design Document: http://lwn.net/Articles/253651/
... ... @@ -409,6 +413,212 @@
409 413 }
410 414 }
411 415  
  416 +#ifdef CONFIG_NO_HZ
  417 +
  418 +DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
  419 +static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
  420 +static DEFINE_PER_CPU(int, rcu_update_flag);
  421 +
  422 +/**
  423 + * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
  424 + *
  425 + * If the CPU was idle with dynamic ticks active, this updates the
  426 + * dynticks_progress_counter to let the RCU handling know that the
  427 + * CPU is active.
  428 + */
  429 +void rcu_irq_enter(void)
  430 +{
  431 + int cpu = smp_processor_id();
  432 +
  433 + if (per_cpu(rcu_update_flag, cpu))
  434 + per_cpu(rcu_update_flag, cpu)++;
  435 +
  436 + /*
  437 + * Only update if we are coming from a stopped ticks mode
  438 + * (dynticks_progress_counter is even).
  439 + */
  440 + if (!in_interrupt() &&
  441 + (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
  442 + /*
  443 + * The following might seem like we could have a race
  444 + * with NMI/SMIs. But this really isn't a problem.
  445 + * Here we do a read/modify/write, and the race happens
  446 + * when an NMI/SMI comes in after the read and before
  447 + * the write. But NMI/SMIs will increment this counter
  448 + * twice before returning, so the zero bit will not
  449 + * be corrupted by the NMI/SMI, which is the most important
  450 + * part.
  451 + *
  452 + * The only thing is that we would bring back the counter
  453 + * to a position that it was in during the NMI/SMI.
  454 + * But the zero bit would be set, so the rest of the
  455 + * counter would again be ignored.
  456 + *
  457 + * On return from the IRQ, the zero bit of the counter may
  458 + * be 0 and the counter value the same as at the return from
  459 + * the NMI/SMI. If the state machine was so unlucky as to
  460 + * see that, it still doesn't matter, since all
  461 + * RCU read-side critical sections on this CPU would
  462 + * have already completed.
  463 + */
  464 + per_cpu(dynticks_progress_counter, cpu)++;
  465 + /*
  466 + * The following memory barrier ensures that any
  467 + * rcu_read_lock() primitives in the irq handler
  468 + * are seen by other CPUs to follow the above
  469 + * increment to dynticks_progress_counter. This is
  470 + * required in order for other CPUs to correctly
  471 + * determine when it is safe to advance the RCU
  472 + * grace-period state machine.
  473 + */
  474 + smp_mb(); /* see above block comment. */
  475 + /*
  476 + * Since we can't determine the dynamic tick mode from
  477 + * the dynticks_progress_counter after this routine,
  478 + * we use a second flag to acknowledge that we came
  479 + * from an idle state with ticks stopped.
  480 + */
  481 + per_cpu(rcu_update_flag, cpu)++;
  482 + /*
  483 + * If we take an NMI/SMI now, they will also increment
  484 + * the rcu_update_flag, and will not update the
  485 + * dynticks_progress_counter on exit. That is for
  486 + * this IRQ to do.
  487 + */
  488 + }
  489 +}
  490 +
  491 +/**
  492 + * rcu_irq_exit - Called from exiting Hard irq context.
  493 + *
  494 + * If the CPU was idle with dynamic ticks active, update the
  495 + * dynticks_progress_counter to let the RCU handling be
  496 + * aware that the CPU is going back to idle with no ticks.
  497 + */
  498 +void rcu_irq_exit(void)
  499 +{
  500 + int cpu = smp_processor_id();
  501 +
  502 + /*
  503 + * rcu_update_flag is set if we interrupted the CPU
  504 + * when it was idle with ticks stopped.
  505 + * Once this occurs, we keep track of interrupt nesting
  506 + * because a NMI/SMI could also come in, and we still
  507 + * only want the IRQ that started the increment of the
  508 + * dynticks_progress_counter to be the one that modifies
  509 + * it on exit.
  510 + */
  511 + if (per_cpu(rcu_update_flag, cpu)) {
  512 + if (--per_cpu(rcu_update_flag, cpu))
  513 + return;
  514 +
  515 + /* This must match the interrupt nesting */
  516 + WARN_ON(in_interrupt());
  517 +
  518 + /*
  519 + * If an NMI/SMI happens now we are still
  520 + * protected by the dynticks_progress_counter being odd.
  521 + */
  522 +
  523 + /*
  524 + * The following memory barrier ensures that any
  525 + * rcu_read_unlock() primitives in the irq handler
  526 + * are seen by other CPUs to precede the following
  527 + * increment to dynticks_progress_counter. This
  528 + * is required in order for other CPUs to determine
  529 + * when it is safe to advance the RCU grace-period
  530 + * state machine.
  531 + */
  532 + smp_mb(); /* see above block comment. */
  533 + per_cpu(dynticks_progress_counter, cpu)++;
  534 + WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
  535 + }
  536 +}
  537 +
  538 +static void dyntick_save_progress_counter(int cpu)
  539 +{
  540 + per_cpu(rcu_dyntick_snapshot, cpu) =
  541 + per_cpu(dynticks_progress_counter, cpu);
  542 +}
  543 +
  544 +static inline int
  545 +rcu_try_flip_waitack_needed(int cpu)
  546 +{
  547 + long curr;
  548 + long snap;
  549 +
  550 + curr = per_cpu(dynticks_progress_counter, cpu);
  551 + snap = per_cpu(rcu_dyntick_snapshot, cpu);
  552 + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  553 +
  554 + /*
  555 + * If the CPU remained in dynticks mode for the entire time
  556 + * and didn't take any interrupts, NMIs, SMIs, or whatever,
  557 + * then it cannot be in the middle of an rcu_read_lock(), so
  558 + * the next rcu_read_lock() it executes must use the new value
  559 + * of the counter. So we can safely pretend that this CPU
  560 + * already acknowledged the counter.
  561 + */
  562 +
  563 + if ((curr == snap) && ((curr & 0x1) == 0))
  564 + return 0;
  565 +
  566 + /*
  567 + * If the CPU passed through or entered a dynticks idle phase with
  568 + * no active irq handlers, then, as above, we can safely pretend
  569 + * that this CPU already acknowledged the counter.
  570 + */
  571 +
  572 + if ((curr - snap) > 2 || (snap & 0x1) == 0)
  573 + return 0;
  574 +
  575 + /* We need this CPU to explicitly acknowledge the counter flip. */
  576 +
  577 + return 1;
  578 +}
  579 +
  580 +static inline int
  581 +rcu_try_flip_waitmb_needed(int cpu)
  582 +{
  583 + long curr;
  584 + long snap;
  585 +
  586 + curr = per_cpu(dynticks_progress_counter, cpu);
  587 + snap = per_cpu(rcu_dyntick_snapshot, cpu);
  588 + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  589 +
  590 + /*
  591 + * If the CPU remained in dynticks mode for the entire time
  592 + * and didn't take any interrupts, NMIs, SMIs, or whatever,
  593 + * then it cannot have executed an RCU read-side critical section
  594 + * during that time, so there is no need for it to execute a
  595 + * memory barrier.
  596 + */
  597 +
  598 + if ((curr == snap) && ((curr & 0x1) == 0))
  599 + return 0;
  600 +
  601 + /*
  602 + * If the CPU either entered or exited an outermost interrupt,
  603 + * SMI, NMI, or whatever handler, then we know that it executed
  604 + * a memory barrier when doing so. So we don't need another one.
  605 + */
  606 + if (curr != snap)
  607 + return 0;
  608 +
  609 + /* We need the CPU to execute a memory barrier. */
  610 +
  611 + return 1;
  612 +}
  613 +
  614 +#else /* !CONFIG_NO_HZ */
  615 +
  616 +# define dyntick_save_progress_counter(cpu) do { } while (0)
  617 +# define rcu_try_flip_waitack_needed(cpu) (1)
  618 +# define rcu_try_flip_waitmb_needed(cpu) (1)
  619 +
  620 +#endif /* CONFIG_NO_HZ */
  621 +
412 622 /*
413 623 * Get here when RCU is idle. Decide whether we need to
414 624 * move out of idle state, and return non-zero if so.
415 625  
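
The nesting rule that rcu_irq_enter()/rcu_irq_exit() enforce through rcu_update_flag can also be modelled in a few lines of ordinary C. The sketch below is an illustration under simplifying assumptions (one CPU, no real interrupts or concurrency, irq_nesting standing in for in_interrupt(), and all model_* names invented): only the outermost handler that finds the CPU in dynticks-idle mode flips the counter to odd, nested NMI/SMI handlers merely bump the flag, and the counter returns to even exactly once, when that outermost handler exits.

#include <assert.h>
#include <stdio.h>

static long counter = 2;	/* even: CPU is in dynticks idle    */
static int update_flag;		/* models per-CPU rcu_update_flag   */
static int irq_nesting;		/* models in_interrupt() depth      */

static void model_rcu_irq_enter(void)
{
	if (update_flag)
		update_flag++;		/* nested handler: just count it        */
	if (irq_nesting == 0 && (counter & 0x1) == 0) {
		counter++;		/* even -> odd: CPU is now "active"     */
		update_flag++;		/* remember that we did the increment   */
	}
	irq_nesting++;
}

static void model_rcu_irq_exit(void)
{
	irq_nesting--;
	if (update_flag && --update_flag == 0) {
		counter++;		/* odd -> even: back to dynticks idle   */
		assert((counter & 0x1) == 0);
	}
}

int main(void)
{
	model_rcu_irq_enter();		/* IRQ hits the idle CPU            */
	model_rcu_irq_enter();		/*   NMI arrives inside the IRQ     */
	model_rcu_irq_exit();		/*   NMI returns: counter untouched */
	printf("after nested NMI: counter=%ld (still odd)\n", counter);
	model_rcu_irq_exit();		/* outermost IRQ returns            */
	printf("after IRQ exit:   counter=%ld (even again)\n", counter);
	return 0;
}
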
... ... @@ -447,8 +657,10 @@
447 657  
448 658 /* Now ask each CPU for acknowledgement of the flip. */
449 659  
450   - for_each_cpu_mask(cpu, rcu_cpu_online_map)
  660 + for_each_cpu_mask(cpu, rcu_cpu_online_map) {
451 661 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
  662 + dyntick_save_progress_counter(cpu);
  663 + }
452 664  
453 665 return 1;
454 666 }
... ... @@ -464,7 +676,8 @@
464 676  
465 677 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
466 678 for_each_cpu_mask(cpu, rcu_cpu_online_map)
467   - if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
  679 + if (rcu_try_flip_waitack_needed(cpu) &&
  680 + per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
468 681 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
469 682 return 0;
470 683 }
471 684  
... ... @@ -509,8 +722,10 @@
509 722 smp_mb(); /* ^^^^^^^^^^^^ */
510 723  
511 724 /* Call for a memory barrier from each CPU. */
512   - for_each_cpu_mask(cpu, rcu_cpu_online_map)
  725 + for_each_cpu_mask(cpu, rcu_cpu_online_map) {
513 726 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
  727 + dyntick_save_progress_counter(cpu);
  728 + }
514 729  
515 730 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
516 731 return 1;
... ... @@ -528,7 +743,8 @@
528 743  
529 744 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
530 745 for_each_cpu_mask(cpu, rcu_cpu_online_map)
531   - if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
  746 + if (rcu_try_flip_waitmb_needed(cpu) &&
  747 + per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
532 748 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
533 749 return 0;
534 750 }
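
The two loops changed above follow the same pattern: snapshot each CPU's dynticks_progress_counter when requesting an acknowledgement or a memory barrier, then, on each later poll, skip CPUs whose counter shows they were (or have since passed through) dynticks idle. A minimal stand-alone model of the acknowledgement test is sketched below; ack_needed() mirrors the logic of rcu_try_flip_waitack_needed() but is an illustration only, not kernel code.

#include <stdio.h>

static int ack_needed(long curr, long snap)
{
	/* Idle the whole time (even and unchanged): no ack needed. */
	if (curr == snap && (curr & 0x1) == 0)
		return 0;
	/* Entered or passed through idle since the snapshot: no ack needed. */
	if ((curr - snap) > 2 || (snap & 0x1) == 0)
		return 0;
	/* Stayed busy: the CPU must acknowledge the flip itself. */
	return 1;
}

int main(void)
{
	printf("%d\n", ack_needed(8, 8));	/* 0: idle throughout       */
	printf("%d\n", ack_needed(9, 9));	/* 1: busy, never went idle */
	printf("%d\n", ack_needed(12, 9));	/* 0: went idle in between  */
	return 0;
}

The three calls in main() cover the three cases the comments above describe: idle throughout, busy throughout, and passed through idle since the snapshot was taken.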
kernel/softirq.c
... ... @@ -313,6 +313,7 @@
313 313 /* Make sure that timer wheel updates are propagated */
314 314 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
315 315 tick_nohz_stop_sched_tick();
  316 + rcu_irq_exit();
316 317 #endif
317 318 preempt_enable_no_resched();
318 319 }
kernel/time/tick-sched.c
... ... @@ -282,6 +282,7 @@
282 282 ts->idle_tick = ts->sched_timer.expires;
283 283 ts->tick_stopped = 1;
284 284 ts->idle_jiffies = last_jiffies;
  285 + rcu_enter_nohz();
285 286 }
286 287  
287 288 /*
... ... @@ -374,6 +375,8 @@
374 375 local_irq_enable();
375 376 return;
376 377 }
  378 +
  379 + rcu_exit_nohz();
377 380  
378 381 /* Update jiffies first */
379 382 select_nohz_load_balancer(0);
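
Putting the hooks together, one CPU's counter is expected to move through the following parity sequence as it stops the tick, services an interrupt while idle, and finally restarts the tick. The trace below is an illustration only; every hook is reduced to a bare counter increment and no kernel code is involved.

#include <stdio.h>

int main(void)
{
	long counter = 1;	/* odd: CPU active, tick running */

	counter++;	/* rcu_enter_nohz():  tick_nohz_stop_sched_tick()    */
	printf("%ld  dynticks idle (even)\n", counter);
	counter++;	/* rcu_irq_enter():   interrupt hits the idle CPU    */
	printf("%ld  in irq handler (odd)\n", counter);
	counter++;	/* rcu_irq_exit():    handler done, idle again       */
	printf("%ld  dynticks idle (even)\n", counter);
	counter++;	/* rcu_exit_nohz():   tick_nohz_restart_sched_tick() */
	printf("%ld  active again (odd)\n", counter);
	return 0;
}

At every even value the CPU is guaranteed not to be inside an RCU read-side critical section, which is what lets the grace-period state machine skip it without waking it up.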