Commit 818e3dd30a4ff34fff6d90e87ae59c73f6a53691
Committed by
Ingo Molnar
1 parent
b3aa557722
Exists in
master
and in
20 other branches
tracing, ring-buffer: add paranoid checks for loops
While writing a new tracer, I had a bug where I caused the ring-buffer to recurse in a bad way. The bug was with the tracer I was writing and not the ring-buffer itself. But it took a long time to find the problem. This patch adds paranoid checks into the ring-buffer infrastructure that will catch bugs of this nature. Note: I put the bug back in the tracer and this patch showed the error nicely and prevented the lockup. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Showing 1 changed file with 56 additions and 0 deletions Side-by-side Diff
kernel/trace/ring_buffer.c
... | ... | @@ -1022,8 +1022,23 @@ |
1022 | 1022 | struct ring_buffer_event *event; |
1023 | 1023 | u64 ts, delta; |
1024 | 1024 | int commit = 0; |
1025 | + int nr_loops = 0; | |
1025 | 1026 | |
1026 | 1027 | again: |
1028 | + /* | |
1029 | + * We allow for interrupts to reenter here and do a trace. | |
1030 | + * If one does, it will cause this original code to loop | |
1031 | + * back here. Even with heavy interrupts happening, this | |
1032 | + * should only happen a few times in a row. If this happens | |
1033 | + * 1000 times in a row, there must be either an interrupt | |
1034 | + * storm or we have something buggy. | |
1035 | + * Bail! | |
1036 | + */ | |
1037 | + if (unlikely(++nr_loops > 1000)) { | |
1038 | + RB_WARN_ON(cpu_buffer, 1); | |
1039 | + return NULL; | |
1040 | + } | |
1041 | + | |
1027 | 1042 | ts = ring_buffer_time_stamp(cpu_buffer->cpu); |
1028 | 1043 | |
1029 | 1044 | /* |
1030 | 1045 | |
... | ... | @@ -1532,10 +1547,23 @@ |
1532 | 1547 | { |
1533 | 1548 | struct buffer_page *reader = NULL; |
1534 | 1549 | unsigned long flags; |
1550 | + int nr_loops = 0; | |
1535 | 1551 | |
1536 | 1552 | spin_lock_irqsave(&cpu_buffer->lock, flags); |
1537 | 1553 | |
1538 | 1554 | again: |
1555 | + /* | |
1556 | + * This should normally only loop twice. But because the | |
1557 | + * start of the reader inserts an empty page, it causes | |
1558 | + * a case where we will loop three times. There should be no | |
1559 | + * reason to loop four times (that I know of). | |
1560 | + */ | |
1561 | + if (unlikely(++nr_loops > 3)) { | |
1562 | + RB_WARN_ON(cpu_buffer, 1); | |
1563 | + reader = NULL; | |
1564 | + goto out; | |
1565 | + } | |
1566 | + | |
1539 | 1567 | reader = cpu_buffer->reader_page; |
1540 | 1568 | |
1541 | 1569 | /* If there's more to read, return this page */ |
... | ... | @@ -1665,6 +1693,7 @@ |
1665 | 1693 | struct ring_buffer_per_cpu *cpu_buffer; |
1666 | 1694 | struct ring_buffer_event *event; |
1667 | 1695 | struct buffer_page *reader; |
1696 | + int nr_loops = 0; | |
1668 | 1697 | |
1669 | 1698 | if (!cpu_isset(cpu, buffer->cpumask)) |
1670 | 1699 | return NULL; |
... | ... | @@ -1672,6 +1701,19 @@ |
1672 | 1701 | cpu_buffer = buffer->buffers[cpu]; |
1673 | 1702 | |
1674 | 1703 | again: |
1704 | + /* | |
1705 | + * We repeat when a timestamp is encountered. It is possible | |
1706 | + * to get multiple timestamps from an interrupt entering just | |
1707 | + * as one timestamp is about to be written. The max times | |
1708 | + * that this can happen is the number of nested interrupts we | |
1709 | + * can have. Nesting 10 deep of interrupts is clearly | |
1710 | + * an anomaly. | |
1711 | + */ | |
1712 | + if (unlikely(++nr_loops > 10)) { | |
1713 | + RB_WARN_ON(cpu_buffer, 1); | |
1714 | + return NULL; | |
1715 | + } | |
1716 | + | |
1675 | 1717 | reader = rb_get_reader_page(cpu_buffer); |
1676 | 1718 | if (!reader) |
1677 | 1719 | return NULL; |
... | ... | @@ -1722,6 +1764,7 @@ |
1722 | 1764 | struct ring_buffer *buffer; |
1723 | 1765 | struct ring_buffer_per_cpu *cpu_buffer; |
1724 | 1766 | struct ring_buffer_event *event; |
1767 | + int nr_loops = 0; | |
1725 | 1768 | |
1726 | 1769 | if (ring_buffer_iter_empty(iter)) |
1727 | 1770 | return NULL; |
... | ... | @@ -1730,6 +1773,19 @@ |
1730 | 1773 | buffer = cpu_buffer->buffer; |
1731 | 1774 | |
1732 | 1775 | again: |
1776 | + /* | |
1777 | + * We repeat when a timestamp is encountered. It is possible | |
1778 | + * to get multiple timestamps from an interrupt entering just | |
1779 | + * as one timestamp is about to be written. The max times | |
1780 | + * that this can happen is the number of nested interrupts we | |
1781 | + * can have. Nesting 10 deep of interrupts is clearly | |
1782 | + * an anomaly. | |
1783 | + */ | |
1784 | + if (unlikely(++nr_loops > 10)) { | |
1785 | + RB_WARN_ON(cpu_buffer, 1); | |
1786 | + return NULL; | |
1787 | + } | |
1788 | + | |
1733 | 1789 | if (rb_per_cpu_empty(cpu_buffer)) |
1734 | 1790 | return NULL; |
1735 | 1791 |