Commit a2bbe75089d5eb9a3a46d50dd5c215e213790288

Authored by Frederic Weisbecker
1 parent 48ffee7d9e

x86: Don't use frame pointer to save old stack on irq entry

rbp is used in SAVE_ARGS_IRQ to save the old stack pointer
in order to restore it later in ret_from_intr.

It is convenient because we save its value in the irq regs
and it's easily restored using the leave instruction.

However, this is a kind of abuse of the frame pointer, whose
role is to help unwind the kernel by chaining frames
together, each node following the return address to the
previous frame.

But although we are breaking the frame by changing the stack
pointer, there is no preceding return address before the new
frame. Hence using the frame pointer to link the two stacks
breaks the stack unwinders that find a random value instead of
a return address here.

There is no workaround that can work in every case. We are using
the fixup_bp_irq_link() function to dereference that abused frame
pointer in the case of a non-nesting interrupt (which means the
stack changed).
But that doesn't fix the case of interrupts that don't change the
stack (but we still have the unconditional frame link), which is
the case of a hardirq interrupting a softirq. We have no way to detect
this transition, so the frame irq link is treated as a real frame
pointer and the return address is dereferenced, but it is still a
spurious one.

There are two possible results of this: either the spurious return
address, a random stack value, luckily belongs to the kernel text
and then the unwinding can continue and we just have a weird entry
in the stack trace. Or it doesn't belong to the kernel text and
unwinding stops there.

This is the reason why stacktraces (including perf callchains) on
irqs that interrupted softirqs don't work very well.

To solve this, we don't save the old stack pointer in rbp anymore
but instead save it to a scratch register that we push on the new
stack and that we pop back later on irq return.

This preserves the whole frame chain without spurious return addresses
in the middle and drops the need for the horrid fixup_bp_irq_link()
workaround.

And finally, irqs that interrupt softirqs are sanely unwound.

Before:

    99.81%         perf  [kernel.kallsyms]  [k] perf_pending_event
                   |
                   --- perf_pending_event
                       irq_work_run
                       smp_irq_work_interrupt
                       irq_work_interrupt
                      |
                      |--41.60%-- __read
                      |          |
                      |          |--99.90%-- create_worker
                      |          |          bench_sched_messaging
                      |          |          cmd_bench
                      |          |          run_builtin
                      |          |          main
                      |          |          __libc_start_main
                      |           --0.10%-- [...]

After:

     1.64%  swapper  [kernel.kallsyms]  [k] perf_pending_event
            |
            --- perf_pending_event
                irq_work_run
                smp_irq_work_interrupt
                irq_work_interrupt
               |
               |--95.00%-- arch_irq_work_raise
               |          irq_work_queue
               |          __perf_event_overflow
               |          perf_swevent_overflow
               |          perf_swevent_event
               |          perf_tp_event
               |          perf_trace_softirq
               |          __do_softirq
               |          call_softirq
               |          do_softirq
               |          irq_exit
               |          |
               |          |--73.68%-- smp_apic_timer_interrupt
               |          |          apic_timer_interrupt
               |          |          |
               |          |          |--96.43%-- amd_e400_idle
               |          |          |          cpu_idle
               |          |          |          start_secondary

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jan Beulich <JBeulich@novell.com>

Showing 2 changed files with 15 additions and 42 deletions Side-by-side Diff

arch/x86/kernel/dumpstack_64.c
... ... @@ -105,34 +105,6 @@
105 105 }
106 106  
107 107 /*
108   - * We are returning from the irq stack and go to the previous one.
109   - * If the previous stack is also in the irq stack, then bp in the first
110   - * frame of the irq stack points to the previous, interrupted one.
111   - * Otherwise we have another level of indirection: We first save
112   - * the bp of the previous stack, then we switch the stack to the irq one
113   - * and save a new bp that links to the previous one.
114   - * (See save_args())
115   - */
116   -static inline unsigned long
117   -fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
118   - unsigned long *irq_stack, unsigned long *irq_stack_end)
119   -{
120   -#ifdef CONFIG_FRAME_POINTER
121   - struct stack_frame *frame = (struct stack_frame *)bp;
122   - unsigned long next;
123   -
124   - if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
125   - if (!probe_kernel_address(&frame->next_frame, next))
126   - return next;
127   - else
128   - WARN_ONCE(1, "Perf: bad frame pointer = %p in "
129   - "callchain\n", &frame->next_frame);
130   - }
131   -#endif
132   - return bp;
133   -}
134   -
135   -/*
136 108 * x86-64 can have up to three kernel stacks:
137 109 * process stack
138 110 * interrupt stack
... ... @@ -208,8 +180,6 @@
208 180 * pointer (index -1 to end) in the IRQ stack:
209 181 */
210 182 stack = (unsigned long *) (irq_stack_end[-1]);
211   - bp = fixup_bp_irq_link(bp, stack, irq_stack,
212   - irq_stack_end);
213 183 irq_stack_end = NULL;
214 184 ops->stack(data, "EOI");
215 185 continue;
arch/x86/kernel/entry_64.S
... ... @@ -310,9 +310,12 @@
310 310 movq_cfi r10, R10-RBP
311 311 movq_cfi r11, R11-RBP
312 312  
313   - movq_cfi rbp, 0 /* push %rbp */
314   - movq %rsp, %rbp
  313 + /* Save rbp so that we can unwind from get_irq_regs() */
  314 + movq_cfi rbp, 0
315 315  
  316 + /* Save previous stack value */
  317 + movq %rsp, %rsi
  318 +
316 319 leaq -RBP(%rsp),%rdi /* arg1 for handler */
317 320 testl $3, CS(%rdi)
318 321 je 1f
... ... @@ -327,10 +330,11 @@
327 330 jne 2f
328 331 mov PER_CPU_VAR(irq_stack_ptr),%rsp
329 332 EMPTY_FRAME 0
330   - /*
331   - * We entered an interrupt context - irqs are off:
332   - */
333   -2: TRACE_IRQS_OFF
  333 +
  334 +2: /* Store previous stack value */
  335 + pushq %rsi
  336 + /* We entered an interrupt context - irqs are off: */
  337 + TRACE_IRQS_OFF
334 338 .endm
335 339  
336 340 ENTRY(save_rest)
337 341  
338 342  
339 343  
... ... @@ -804,15 +808,14 @@
804 808 DISABLE_INTERRUPTS(CLBR_NONE)
805 809 TRACE_IRQS_OFF
806 810 decl PER_CPU_VAR(irq_count)
807   - leaveq
808 811  
809   - CFI_RESTORE rbp
  812 + /* Restore saved previous stack */
  813 + popq %rsi
  814 + leaq 16(%rsi), %rsp
  815 +
810 816 CFI_DEF_CFA_REGISTER rsp
811   - CFI_ADJUST_CFA_OFFSET -8
  817 + CFI_ADJUST_CFA_OFFSET -16
812 818  
813   - /* we did not save rbx, restore only from ARGOFFSET */
814   - addq $8, %rsp
815   - CFI_ADJUST_CFA_OFFSET -8
816 819 exit_intr:
817 820 GET_THREAD_INFO(%rcx)
818 821 testl $3,CS-ARGOFFSET(%rsp)