Commit 2232c2d8e0a6a31061dec311f3d1cf7624bc14f1
Committed by: Ingo Molnar
1 parent: c0f4133b8f
Exists in: master and 7 other branches
rcu: add support for dynamic ticks and preempt rcu
The PREEMPT-RCU can get stuck if a CPU goes idle and NO_HZ is set. The idle CPU will not progress the RCU through its grace period and a synchronize_rcu may get stuck. Without this patch I have a box that will not boot when PREEMPT_RCU and NO_HZ are set. That same box boots fine with this patch.

This patch comes from the -rt kernel where it has been tested for several months.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Showing 6 changed files with 259 additions and 4 deletions
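In outline, the fix gives every CPU a dynticks_progress_counter whose parity records whether the CPU is running (odd) or sitting in nohz idle with its tick stopped (even); the grace-period machinery can then see, without waking the CPU, that an even counter means no RCU read-side critical section can be in flight there. Below is a minimal user-space sketch of that parity convention only — names are invented for illustration and the memory barriers are omitted; the real helpers are in the diff that follows.

/* Toy version of the parity convention behind dynticks_progress_counter. */
#include <assert.h>

static long dynticks_ctr = 1;            /* boots odd: the CPU is active      */

static void toy_enter_nohz(void)         /* tick stopped, CPU going idle      */
{
	dynticks_ctr++;
	assert(!(dynticks_ctr & 1));     /* even: "no RCU readers possible"   */
}

static void toy_exit_nohz(void)          /* tick restarted, CPU active again  */
{
	dynticks_ctr++;
	assert(dynticks_ctr & 1);        /* odd: RCU must watch this CPU      */
}

int main(void)
{
	toy_enter_nohz();
	/* While the counter is even, other CPUs may complete a grace period
	 * without waiting on this one -- which is the point of the patch. */
	toy_exit_nohz();
	return 0;
}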
include/linux/hardirq.h
@@ -109,6 +109,14 @@
 }
 #endif
 
+#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
+extern void rcu_irq_enter(void);
+extern void rcu_irq_exit(void);
+#else
+# define rcu_irq_enter() do { } while (0)
+# define rcu_irq_exit() do { } while (0)
+#endif /* CONFIG_PREEMPT_RCU */
+
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
  * because NMI handlers may not preempt and the ops are
@@ -117,6 +125,7 @@
  */
 #define __irq_enter()				\
 	do {					\
+		rcu_irq_enter();		\
 		account_system_vtime(current);	\
 		add_preempt_count(HARDIRQ_OFFSET);	\
 		trace_hardirq_enter();		\
@@ -135,6 +144,7 @@
 		trace_hardirq_exit();		\
 		account_system_vtime(current);	\
 		sub_preempt_count(HARDIRQ_OFFSET);	\
+		rcu_irq_exit();			\
 	} while (0)
 
 /*
include/linux/rcuclassic.h
@@ -160,6 +160,9 @@
 extern long rcu_batches_completed(void);
 extern long rcu_batches_completed_bh(void);
 
+#define rcu_enter_nohz()	do { } while (0)
+#define rcu_exit_nohz()	do { } while (0)
+
 #endif /* __KERNEL__ */
 #endif /* __LINUX_RCUCLASSIC_H */
include/linux/rcupreempt.h
@@ -82,6 +82,28 @@
 
 struct softirq_action;
 
+#ifdef CONFIG_NO_HZ
+DECLARE_PER_CPU(long, dynticks_progress_counter);
+
+static inline void rcu_enter_nohz(void)
+{
+	__get_cpu_var(dynticks_progress_counter)++;
+	WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1);
+	mb();
+}
+
+static inline void rcu_exit_nohz(void)
+{
+	mb();
+	__get_cpu_var(dynticks_progress_counter)++;
+	WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1));
+}
+
+#else /* CONFIG_NO_HZ */
+#define rcu_enter_nohz()	do { } while (0)
+#define rcu_exit_nohz()	do { } while (0)
+#endif /* CONFIG_NO_HZ */
+
 #endif /* __KERNEL__ */
 #endif /* __LINUX_RCUPREEMPT_H */
kernel/rcupreempt.c
@@ -23,6 +23,10 @@
  * to Suparna Bhattacharya for pushing me completely away
  * from atomic instructions on the read side.
  *
+ * - Added handling of Dynamic Ticks
+ *     Copyright 2007 - Paul E. McKenney <paulmck@us.ibm.com>
+ *                    - Steven Rostedt <srostedt@redhat.com>
+ *
  * Papers: http://www.rdrop.com/users/paulmck/RCU
  *
  * Design Document: http://lwn.net/Articles/253651/
@@ -409,6 +413,212 @@
 	}
 }
 
+#ifdef CONFIG_NO_HZ
+
+DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
+static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
+static DEFINE_PER_CPU(int, rcu_update_flag);
+
+/**
+ * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
+ *
+ * If the CPU was idle with dynamic ticks active, this updates the
+ * dynticks_progress_counter to let the RCU handling know that the
+ * CPU is active.
+ */
+void rcu_irq_enter(void)
+{
+	int cpu = smp_processor_id();
+
+	if (per_cpu(rcu_update_flag, cpu))
+		per_cpu(rcu_update_flag, cpu)++;
+
+	/*
+	 * Only update if we are coming from a stopped ticks mode
+	 * (dynticks_progress_counter is even).
+	 */
+	if (!in_interrupt() &&
+	    (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
+		/*
+		 * The following might seem like we could have a race
+		 * with NMI/SMIs. But this really isn't a problem.
+		 * Here we do a read/modify/write, and the race happens
+		 * when an NMI/SMI comes in after the read and before
+		 * the write. But NMI/SMIs will increment this counter
+		 * twice before returning, so the zero bit will not
+		 * be corrupted by the NMI/SMI which is the most important
+		 * part.
+		 *
+		 * The only thing is that we would bring back the counter
+		 * to a position that it was in during the NMI/SMI.
+		 * But the zero bit would be set, so the rest of the
+		 * counter would again be ignored.
+		 *
+		 * On return from the IRQ, the counter may have the zero
+		 * bit be 0 and the counter the same as the return from
+		 * the NMI/SMI. If the state machine was so unlucky to
+		 * see that, it still doesn't matter, since all
+		 * RCU read-side critical sections on this CPU would
+		 * have already completed.
+		 */
+		per_cpu(dynticks_progress_counter, cpu)++;
+		/*
+		 * The following memory barrier ensures that any
+		 * rcu_read_lock() primitives in the irq handler
+		 * are seen by other CPUs to follow the above
+		 * increment to dynticks_progress_counter. This is
+		 * required in order for other CPUs to correctly
+		 * determine when it is safe to advance the RCU
+		 * grace-period state machine.
+		 */
+		smp_mb(); /* see above block comment. */
+		/*
+		 * Since we can't determine the dynamic tick mode from
+		 * the dynticks_progress_counter after this routine,
+		 * we use a second flag to acknowledge that we came
+		 * from an idle state with ticks stopped.
+		 */
+		per_cpu(rcu_update_flag, cpu)++;
+		/*
+		 * If we take an NMI/SMI now, they will also increment
+		 * the rcu_update_flag, and will not update the
+		 * dynticks_progress_counter on exit. That is for
+		 * this IRQ to do.
+		 */
+	}
+}
+
+/**
+ * rcu_irq_exit - Called from exiting Hard irq context.
+ *
+ * If the CPU was idle with dynamic ticks active, update the
+ * dynticks_progress_counter to let the RCU handling be
+ * aware that the CPU is going back to idle with no ticks.
+ */
+void rcu_irq_exit(void)
+{
+	int cpu = smp_processor_id();
+
+	/*
+	 * rcu_update_flag is set if we interrupted the CPU
+	 * when it was idle with ticks stopped.
+	 * Once this occurs, we keep track of interrupt nesting
+	 * because a NMI/SMI could also come in, and we still
+	 * only want the IRQ that started the increment of the
+	 * dynticks_progress_counter to be the one that modifies
+	 * it on exit.
+	 */
+	if (per_cpu(rcu_update_flag, cpu)) {
+		if (--per_cpu(rcu_update_flag, cpu))
+			return;
+
+		/* This must match the interrupt nesting */
+		WARN_ON(in_interrupt());
+
+		/*
+		 * If an NMI/SMI happens now we are still
+		 * protected by the dynticks_progress_counter being odd.
+		 */
+
+		/*
+		 * The following memory barrier ensures that any
+		 * rcu_read_unlock() primitives in the irq handler
+		 * are seen by other CPUs to precede the following
+		 * increment to dynticks_progress_counter. This
+		 * is required in order for other CPUs to determine
+		 * when it is safe to advance the RCU grace-period
+		 * state machine.
+		 */
+		smp_mb(); /* see above block comment. */
+		per_cpu(dynticks_progress_counter, cpu)++;
+		WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
+	}
+}
+
+static void dyntick_save_progress_counter(int cpu)
+{
+	per_cpu(rcu_dyntick_snapshot, cpu) =
+		per_cpu(dynticks_progress_counter, cpu);
+}
+
+static inline int
+rcu_try_flip_waitack_needed(int cpu)
+{
+	long curr;
+	long snap;
+
+	curr = per_cpu(dynticks_progress_counter, cpu);
+	snap = per_cpu(rcu_dyntick_snapshot, cpu);
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU remained in dynticks mode for the entire time
+	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
+	 * then it cannot be in the middle of an rcu_read_lock(), so
+	 * the next rcu_read_lock() it executes must use the new value
+	 * of the counter. So we can safely pretend that this CPU
+	 * already acknowledged the counter.
+	 */
+
+	if ((curr == snap) && ((curr & 0x1) == 0))
+		return 0;
+
+	/*
+	 * If the CPU passed through or entered a dynticks idle phase with
+	 * no active irq handlers, then, as above, we can safely pretend
+	 * that this CPU already acknowledged the counter.
+	 */
+
+	if ((curr - snap) > 2 || (snap & 0x1) == 0)
+		return 0;
+
+	/* We need this CPU to explicitly acknowledge the counter flip. */
+
+	return 1;
+}
+
+static inline int
+rcu_try_flip_waitmb_needed(int cpu)
+{
+	long curr;
+	long snap;
+
+	curr = per_cpu(dynticks_progress_counter, cpu);
+	snap = per_cpu(rcu_dyntick_snapshot, cpu);
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU remained in dynticks mode for the entire time
+	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
+	 * then it cannot have executed an RCU read-side critical section
+	 * during that time, so there is no need for it to execute a
+	 * memory barrier.
+	 */
+
+	if ((curr == snap) && ((curr & 0x1) == 0))
+		return 0;
+
+	/*
+	 * If the CPU either entered or exited an outermost interrupt,
+	 * SMI, NMI, or whatever handler, then we know that it executed
+	 * a memory barrier when doing so. So we don't need another one.
+	 */
+	if (curr != snap)
+		return 0;
+
+	/* We need the CPU to execute a memory barrier. */
+
+	return 1;
+}
+
+#else /* !CONFIG_NO_HZ */
+
+# define dyntick_save_progress_counter(cpu)	do { } while (0)
+# define rcu_try_flip_waitack_needed(cpu)	(1)
+# define rcu_try_flip_waitmb_needed(cpu)	(1)
+
+#endif /* CONFIG_NO_HZ */
+
 /*
  * Get here when RCU is idle. Decide whether we need to
  * move out of idle state, and return non-zero if so.
@@ -447,8 +657,10 @@
 
 	/* Now ask each CPU for acknowledgement of the flip. */
 
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask(cpu, rcu_cpu_online_map) {
 		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
+		dyntick_save_progress_counter(cpu);
+	}
 
 	return 1;
 }
@@ -464,7 +676,8 @@
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
 	for_each_cpu_mask(cpu, rcu_cpu_online_map)
-		if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
+		if (rcu_try_flip_waitack_needed(cpu) &&
+		    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
 			return 0;
 		}
 
@@ -509,8 +722,10 @@
 	smp_mb();  /*  ^^^^^^^^^^^^ */
 
 	/* Call for a memory barrier from each CPU. */
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask(cpu, rcu_cpu_online_map) {
 		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
+		dyntick_save_progress_counter(cpu);
+	}
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
 	return 1;
@@ -528,7 +743,8 @@
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
 	for_each_cpu_mask(cpu, rcu_cpu_online_map)
-		if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
+		if (rcu_try_flip_waitmb_needed(cpu) &&
+		    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
 			return 0;
 		}
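The nesting rules that the long comments in rcu_irq_enter()/rcu_irq_exit() above describe can be condensed into a few lines. The sketch below is a stand-alone, single-CPU approximation of that bookkeeping — invented names, no barriers, no in_interrupt() check — showing that only the outermost interrupt taken from dynticks idle flips the counter, while nested NMI/SMIs merely bump the nesting flag.

/* Toy model of the rcu_update_flag nesting; not the kernel code. */
#include <assert.h>

static long counter = 2;    /* even: the CPU is in dynticks idle            */
static int  update_flag;    /* interrupt-nesting depth since leaving idle   */

static void toy_irq_enter(void)
{
	if (update_flag) {
		update_flag++;            /* nested NMI/SMI: just count it   */
	} else if (!(counter & 1)) {
		counter++;                /* outermost irq from idle: go odd */
		update_flag = 1;
	}
}

static void toy_irq_exit(void)
{
	if (update_flag && !--update_flag) {
		counter++;                /* outermost exit: back to even    */
		assert(!(counter & 1));
	}
}

int main(void)
{
	toy_irq_enter();    /* device interrupt wakes the idle CPU            */
	toy_irq_enter();    /* NMI nests inside it: flag 1 -> 2               */
	toy_irq_exit();     /* NMI returns: flag 2 -> 1, counter stays odd    */
	toy_irq_exit();     /* irq returns: flag 1 -> 0, counter goes even    */
	return 0;
}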
kernel/softirq.c
kernel/time/tick-sched.c
@@ -282,6 +282,7 @@
 		ts->idle_tick = ts->sched_timer.expires;
 		ts->tick_stopped = 1;
 		ts->idle_jiffies = last_jiffies;
+		rcu_enter_nohz();
 	}
 
 	/*
@@ -374,6 +375,8 @@
 		local_irq_enable();
 		return;
 	}
+
+	rcu_exit_nohz();
 
 	/* Update jiffies first */
 	select_nohz_load_balancer(0);
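Putting the pieces together: the nohz idle path above calls rcu_enter_nohz()/rcu_exit_nohz(), and the flip code in kernel/rcupreempt.c snapshots every CPU's counter and skips the CPUs that are provably idle. The single-threaded toy below walks through one such round trip; arrays stand in for per-CPU variables and every name is invented, so treat it as a sketch of the idea rather than the kernel implementation.

/* End-to-end toy: CPU 1 idles in nohz, CPU 0 runs the counter flip and
 * skips it.  Illustrative only. */
#include <assert.h>

#define NCPUS 2
static long ctr[NCPUS] = { 1, 1 };   /* dynticks counters, odd = active */
static long snap[NCPUS];

static void save_snapshots(void)
{
	for (int cpu = 0; cpu < NCPUS; cpu++)
		snap[cpu] = ctr[cpu];          /* like dyntick_save_progress_counter() */
}

static int must_wait_for(int cpu)              /* like rcu_try_flip_waitack_needed()   */
{
	long c = ctr[cpu], s = snap[cpu];

	if (c == s && !(c & 1))
		return 0;                      /* idle the whole time                  */
	if (c - s > 2 || !(s & 1))
		return 0;                      /* passed through idle since snapshot   */
	return 1;
}

int main(void)
{
	ctr[1]++;                        /* CPU 1: tick stopped -> rcu_enter_nohz()    */
	save_snapshots();                /* CPU 0: requests a counter flip             */
	assert(must_wait_for(0) == 1);   /* CPU 0 itself still has to acknowledge      */
	assert(must_wait_for(1) == 0);   /* idle CPU 1 is skipped: no more boot hang   */
	ctr[1]++;                        /* CPU 1: tick restarts -> rcu_exit_nohz()    */
	return 0;
}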