Commit 83c2f912b43c3a7babbb6cb7ae2a5276c1ed2a3e

Authored by Linus Torvalds

Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (39 commits)
  perf tools: Fix compile error on x86_64 Ubuntu
  perf report: Fix --stdio output alignment when --showcpuutilization used
  perf annotate: Get rid of field_sep check
  perf annotate: Fix usage string
  perf kmem: Fix a memory leak
  perf kmem: Add missing closedir() calls
  perf top: Add error message for EMFILE
  perf test: Change type of '-v' option to INCR
  perf script: Add missing closedir() calls
  tracing: Fix compile error when static ftrace is enabled
  recordmcount: Fix handling of elf64 big-endian objects.
  perf tools: Add const.h to MANIFEST to make perf-tar-src-pkg work again
  perf tools: Add support for guest/host-only profiling
  perf kvm: Do guest-only counting by default
  perf top: Don't update total_period on process_sample
  perf hists: Stop using 'self' for struct hist_entry
  perf hists: Rename total_session to total_period
  x86: Add counter when debug stack is used with interrupts enabled
  x86: Allow NMIs to hit breakpoints in i386
  x86: Keep current stack in NMI breakpoints
  ...

Showing 29 changed files

Documentation/kernel-parameters.txt
... ... @@ -2475,6 +2475,14 @@
2475 2475 stacktrace [FTRACE]
2476 2476 Enabled the stack tracer on boot up.
2477 2477  
  2478 + stacktrace_filter=[function-list]
  2479 + [FTRACE] Limit the functions that the stack tracer
  2480 + will trace at boot up. function-list is a comma separated
  2481 + list of functions. This list can be changed at run
  2482 + time by the stack_trace_filter file in the debugfs
  2483 + tracing directory. Note that this enables the stack tracer,
  2484 + so the stacktrace parameter above is not needed.
  2485 +
2478 2486 sti= [PARISC,HW]
2479 2487 Format: <num>
2480 2488 Set the STI (builtin display/keyboard on the HP-PARISC
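
Regarding the new stacktrace_filter= entry above, a usage illustration (the function names here are only examples): booting with

    stacktrace_filter=kmem_cache_alloc,kmem_cache_free

limits the stack tracer to those functions, and the list can later be changed through the stack_trace_filter file, assuming debugfs is mounted at its usual /sys/kernel/debug location:

    echo kmalloc > /sys/kernel/debug/tracing/stack_trace_filter
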
arch/x86/include/asm/debugreg.h
... ... @@ -101,6 +101,28 @@
101 101  
102 102 extern void hw_breakpoint_restore(void);
103 103  
  104 +#ifdef CONFIG_X86_64
  105 +DECLARE_PER_CPU(int, debug_stack_usage);
  106 +static inline void debug_stack_usage_inc(void)
  107 +{
  108 + __get_cpu_var(debug_stack_usage)++;
  109 +}
  110 +static inline void debug_stack_usage_dec(void)
  111 +{
  112 + __get_cpu_var(debug_stack_usage)--;
  113 +}
  114 +int is_debug_stack(unsigned long addr);
  115 +void debug_stack_set_zero(void);
  116 +void debug_stack_reset(void);
  117 +#else /* !X86_64 */
  118 +static inline int is_debug_stack(unsigned long addr) { return 0; }
  119 +static inline void debug_stack_set_zero(void) { }
  120 +static inline void debug_stack_reset(void) { }
  121 +static inline void debug_stack_usage_inc(void) { }
  122 +static inline void debug_stack_usage_dec(void) { }
  123 +#endif /* X86_64 */
  124 +
  125 +
104 126 #endif /* __KERNEL__ */
105 127  
106 128 #endif /* _ASM_X86_DEBUGREG_H */
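
As orientation for the debugreg.h additions above, here is a minimal C sketch of how the new helpers are meant to fit together, based on the traps.c and nmi.c hunks later in this diff (the two *_path_sketch names are made up; the real nmi.c code keeps the "swapped" state in a per-cpu flag):

    /* Breakpoint/debug trap path: mark the IST debug stack as live. */
    static void breakpoint_path_sketch(struct pt_regs *regs)
    {
            debug_stack_usage_inc();
            /* ... handle the #DB/#BP, possibly with interrupts enabled ... */
            debug_stack_usage_dec();
    }

    /* NMI path: if we landed on top of the debug stack, swap IDTs. */
    static void nmi_path_sketch(struct pt_regs *regs)
    {
            bool swapped = false;

            if (is_debug_stack(regs->sp)) {
                    debug_stack_set_zero();         /* load nmi_idt_table */
                    swapped = true;
            }
            /* ... run the NMI handlers ... */
            if (swapped)
                    debug_stack_reset();            /* back to idt_table */
    }
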
arch/x86/include/asm/desc.h
... ... @@ -35,6 +35,8 @@
35 35  
36 36 extern struct desc_ptr idt_descr;
37 37 extern gate_desc idt_table[];
  38 +extern struct desc_ptr nmi_idt_descr;
  39 +extern gate_desc nmi_idt_table[];
38 40  
39 41 struct gdt_page {
40 42 struct desc_struct gdt[GDT_ENTRIES];
... ... @@ -306,6 +308,16 @@
306 308 desc->limit0 = limit & 0xffff;
307 309 desc->limit = (limit >> 16) & 0xf;
308 310 }
  311 +
  312 +#ifdef CONFIG_X86_64
  313 +static inline void set_nmi_gate(int gate, void *addr)
  314 +{
  315 + gate_desc s;
  316 +
  317 + pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
  318 + write_idt_entry(nmi_idt_table, gate, &s);
  319 +}
  320 +#endif
309 321  
310 322 static inline void _set_gate(int gate, unsigned type, void *addr,
311 323 unsigned dpl, unsigned ist, unsigned seg)
arch/x86/kernel/cpu/common.c
... ... @@ -1021,6 +1021,8 @@
1021 1021  
1022 1022 #ifdef CONFIG_X86_64
1023 1023 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
  1024 +struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
  1025 + (unsigned long) nmi_idt_table };
1024 1026  
1025 1027 DEFINE_PER_CPU_FIRST(union irq_stack_union,
1026 1028 irq_stack_union) __aligned(PAGE_SIZE);
... ... @@ -1085,6 +1087,26 @@
1085 1087 */
1086 1088 DEFINE_PER_CPU(struct orig_ist, orig_ist);
1087 1089  
  1090 +static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
  1091 +DEFINE_PER_CPU(int, debug_stack_usage);
  1092 +
  1093 +int is_debug_stack(unsigned long addr)
  1094 +{
  1095 + return __get_cpu_var(debug_stack_usage) ||
  1096 + (addr <= __get_cpu_var(debug_stack_addr) &&
  1097 + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
  1098 +}
  1099 +
  1100 +void debug_stack_set_zero(void)
  1101 +{
  1102 + load_idt((const struct desc_ptr *)&nmi_idt_descr);
  1103 +}
  1104 +
  1105 +void debug_stack_reset(void)
  1106 +{
  1107 + load_idt((const struct desc_ptr *)&idt_descr);
  1108 +}
  1109 +
1088 1110 #else /* CONFIG_X86_64 */
1089 1111  
1090 1112 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
... ... @@ -1212,6 +1234,8 @@
1212 1234 estacks += exception_stack_sizes[v];
1213 1235 oist->ist[v] = t->x86_tss.ist[v] =
1214 1236 (unsigned long)estacks;
  1237 + if (v == DEBUG_STACK-1)
  1238 + per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
1215 1239 }
1216 1240 }
1217 1241  
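
Two details of the common.c hunk worth spelling out: debug_stack_addr records the top of the per-cpu DEBUG_STACK exception stack (estacks after the stack size has been added), so the address test accepts the half-open range (top - DEBUG_STKSZ, top]; and the debug_stack_usage counter forces is_debug_stack() to return true even after the breakpoint handler has moved off that stack (see the traps.c comments below about switching to the interrupt stack). An equivalent spelling of the range check, with a hypothetical helper name:

    /* Sketch only: same predicate as the address part of is_debug_stack() */
    static int on_debug_stack(unsigned long addr, unsigned long top)
    {
            /* true iff  top - DEBUG_STKSZ < addr <= top */
            return addr > top - DEBUG_STKSZ && addr <= top;
    }
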
arch/x86/kernel/entry_64.S
... ... @@ -1480,61 +1480,213 @@
1480 1480 CFI_ENDPROC
1481 1481 END(error_exit)
1482 1482  
  1483 +/*
  1484 + * Test if a given stack is an NMI stack or not.
  1485 + */
  1486 + .macro test_in_nmi reg stack nmi_ret normal_ret
  1487 + cmpq %\reg, \stack
  1488 + ja \normal_ret
  1489 + subq $EXCEPTION_STKSZ, %\reg
  1490 + cmpq %\reg, \stack
  1491 + jb \normal_ret
  1492 + jmp \nmi_ret
  1493 + .endm
1483 1494  
1484 1495 /* runs on exception stack */
1485 1496 ENTRY(nmi)
1486 1497 INTR_FRAME
1487 1498 PARAVIRT_ADJUST_EXCEPTION_FRAME
1488   - pushq_cfi $-1
  1499 + /*
  1500 + * We allow breakpoints in NMIs. If a breakpoint occurs, then
  1501 + * the iretq it performs will take us out of NMI context.
  1502 + * This means that we can have nested NMIs where the next
  1503 + * NMI is using the top of the stack of the previous NMI. We
  1504 + * can't let it execute because the nested NMI will corrupt the
  1505 + * stack of the previous NMI. NMI handlers are not re-entrant
  1506 + * anyway.
  1507 + *
  1508 + * To handle this case we do the following:
  1509 + * Check a special location on the stack that contains
  1510 + * a variable that is set when NMIs are executing.
  1511 + * The interrupted task's stack is also checked to see if it
  1512 + * is an NMI stack.
  1513 + * If the variable is not set and the stack is not the NMI
  1514 + * stack then:
  1515 + * o Set the special variable on the stack
  1516 + * o Copy the interrupt frame into a "saved" location on the stack
  1517 + * o Copy the interrupt frame into a "copy" location on the stack
  1518 + * o Continue processing the NMI
  1519 + * If the variable is set or the previous stack is the NMI stack:
  1520 + * o Modify the "copy" location to jump to repeat_nmi
  1521 + * o return back to the first NMI
  1522 + *
  1523 + * Now on exit of the first NMI, we first clear the stack variable.
  1524 + * The NMI stack will tell any nested NMIs at that point that it is
  1525 + * nested. Then we pop the stack normally with iret, and if there was
  1526 + * a nested NMI that updated the copy interrupt stack frame, a
  1527 + * jump will be made to the repeat_nmi code that will handle the second
  1528 + * NMI.
  1529 + */
  1530 +
  1531 + /* Use %rdx as our temp variable throughout */
  1532 + pushq_cfi %rdx
  1533 +
  1534 + /*
  1535 + * Check the special variable on the stack to see if NMIs are
  1536 + * executing.
  1537 + */
  1538 + cmp $1, -8(%rsp)
  1539 + je nested_nmi
  1540 +
  1541 + /*
  1542 + * Now test if the previous stack was an NMI stack.
  1543 + * We need the double check: we check the NMI stack to cover the
  1544 + * race where the first NMI clears the variable before returning.
  1545 + * We check the variable because the first NMI could be in a
  1546 + * breakpoint routine using a breakpoint stack.
  1547 + */
  1548 + lea 6*8(%rsp), %rdx
  1549 + test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
  1550 +
  1551 +nested_nmi:
  1552 + /*
  1553 + * Do nothing if we interrupted the fixup in repeat_nmi.
  1554 + * It's about to repeat the NMI handler, so we are fine
  1555 + * with ignoring this one.
  1556 + */
  1557 + movq $repeat_nmi, %rdx
  1558 + cmpq 8(%rsp), %rdx
  1559 + ja 1f
  1560 + movq $end_repeat_nmi, %rdx
  1561 + cmpq 8(%rsp), %rdx
  1562 + ja nested_nmi_out
  1563 +
  1564 +1:
  1565 + /* Set up the interrupted NMI's stack to jump to repeat_nmi */
  1566 + leaq -6*8(%rsp), %rdx
  1567 + movq %rdx, %rsp
  1568 + CFI_ADJUST_CFA_OFFSET 6*8
  1569 + pushq_cfi $__KERNEL_DS
  1570 + pushq_cfi %rdx
  1571 + pushfq_cfi
  1572 + pushq_cfi $__KERNEL_CS
  1573 + pushq_cfi $repeat_nmi
  1574 +
  1575 + /* Put stack back */
  1576 + addq $(11*8), %rsp
  1577 + CFI_ADJUST_CFA_OFFSET -11*8
  1578 +
  1579 +nested_nmi_out:
  1580 + popq_cfi %rdx
  1581 +
  1582 + /* No need to check faults here */
  1583 + INTERRUPT_RETURN
  1584 +
  1585 +first_nmi:
  1586 + /*
  1587 + * Because nested NMIs will use the pushed location that we
  1588 + * stored in rdx, we must keep that space available.
  1589 + * Here's what our stack frame will look like:
  1590 + * +-------------------------+
  1591 + * | original SS |
  1592 + * | original Return RSP |
  1593 + * | original RFLAGS |
  1594 + * | original CS |
  1595 + * | original RIP |
  1596 + * +-------------------------+
  1597 + * | temp storage for rdx |
  1598 + * +-------------------------+
  1599 + * | NMI executing variable |
  1600 + * +-------------------------+
  1601 + * | Saved SS |
  1602 + * | Saved Return RSP |
  1603 + * | Saved RFLAGS |
  1604 + * | Saved CS |
  1605 + * | Saved RIP |
  1606 + * +-------------------------+
  1607 + * | copied SS |
  1608 + * | copied Return RSP |
  1609 + * | copied RFLAGS |
  1610 + * | copied CS |
  1611 + * | copied RIP |
  1612 + * +-------------------------+
  1613 + * | pt_regs |
  1614 + * +-------------------------+
  1615 + *
  1616 + * The saved RIP is used to fix up the copied RIP that a nested
  1617 + * NMI may zero out. The original stack frame and the temp storage
  1618 + * are also used by nested NMIs and cannot be trusted on exit.
  1619 + */
  1620 + /* Set the NMI executing variable on the stack. */
  1621 + pushq_cfi $1
  1622 +
  1623 + /* Copy the stack frame to the Saved frame */
  1624 + .rept 5
  1625 + pushq_cfi 6*8(%rsp)
  1626 + .endr
  1627 +
  1628 + /* Make another copy, this one may be modified by nested NMIs */
  1629 + .rept 5
  1630 + pushq_cfi 4*8(%rsp)
  1631 + .endr
  1632 +
  1633 + /* Do not pop rdx, nested NMIs will corrupt it */
  1634 + movq 11*8(%rsp), %rdx
  1635 +
  1636 + /*
  1637 + * Everything below this point can be preempted by a nested
  1638 + * NMI if the first NMI took an exception. Repeated NMIs
  1639 + * caused by an exception and nested NMI will start here, and
  1640 + * can still be preempted by another NMI.
  1641 + */
  1642 +restart_nmi:
  1643 + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1489 1644 subq $ORIG_RAX-R15, %rsp
1490 1645 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
  1646 + /*
  1647 + * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
  1648 + * as we should not be calling schedule in NMI context,
  1649 + * even with normal interrupts enabled. An NMI should not be
  1650 + * setting NEED_RESCHED or anything that normal interrupts and
  1651 + * exceptions might do.
  1652 + */
1491 1653 call save_paranoid
1492 1654 DEFAULT_FRAME 0
1493 1655 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1494 1656 movq %rsp,%rdi
1495 1657 movq $-1,%rsi
1496 1658 call do_nmi
1497   -#ifdef CONFIG_TRACE_IRQFLAGS
1498   - /* paranoidexit; without TRACE_IRQS_OFF */
1499   - /* ebx: no swapgs flag */
1500   - DISABLE_INTERRUPTS(CLBR_NONE)
1501 1659 testl %ebx,%ebx /* swapgs needed? */
1502 1660 jnz nmi_restore
1503   - testl $3,CS(%rsp)
1504   - jnz nmi_userspace
1505 1661 nmi_swapgs:
1506 1662 SWAPGS_UNSAFE_STACK
1507 1663 nmi_restore:
1508 1664 RESTORE_ALL 8
  1665 + /* Clear the NMI executing stack variable */
  1666 + movq $0, 10*8(%rsp)
1509 1667 jmp irq_return
1510   -nmi_userspace:
1511   - GET_THREAD_INFO(%rcx)
1512   - movl TI_flags(%rcx),%ebx
1513   - andl $_TIF_WORK_MASK,%ebx
1514   - jz nmi_swapgs
1515   - movq %rsp,%rdi /* &pt_regs */
1516   - call sync_regs
1517   - movq %rax,%rsp /* switch stack for scheduling */
1518   - testl $_TIF_NEED_RESCHED,%ebx
1519   - jnz nmi_schedule
1520   - movl %ebx,%edx /* arg3: thread flags */
1521   - ENABLE_INTERRUPTS(CLBR_NONE)
1522   - xorl %esi,%esi /* arg2: oldset */
1523   - movq %rsp,%rdi /* arg1: &pt_regs */
1524   - call do_notify_resume
1525   - DISABLE_INTERRUPTS(CLBR_NONE)
1526   - jmp nmi_userspace
1527   -nmi_schedule:
1528   - ENABLE_INTERRUPTS(CLBR_ANY)
1529   - call schedule
1530   - DISABLE_INTERRUPTS(CLBR_ANY)
1531   - jmp nmi_userspace
1532 1668 CFI_ENDPROC
1533   -#else
1534   - jmp paranoid_exit
1535   - CFI_ENDPROC
1536   -#endif
1537 1669 END(nmi)
  1670 +
  1671 + /*
  1672 + * If an NMI handler hits a breakpoint or exception that does an iret,
  1673 + * it can lose its NMI context, and a nested NMI may come in.
  1674 + * In that case, the nested NMI will change the preempted NMI's
  1675 + * stack to jump to here when it does the final iret.
  1676 + */
  1677 +repeat_nmi:
  1678 + INTR_FRAME
  1679 + /* Update the stack variable to say we are still in NMI */
  1680 + movq $1, 5*8(%rsp)
  1681 +
  1682 + /* copy the saved stack back to copy stack */
  1683 + .rept 5
  1684 + pushq_cfi 4*8(%rsp)
  1685 + .endr
  1686 +
  1687 + jmp restart_nmi
  1688 + CFI_ENDPROC
  1689 +end_repeat_nmi:
1538 1690  
1539 1691 ENTRY(ignore_sysret)
1540 1692 CFI_STARTPROC
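
For readers who prefer C, the control flow that the nmi/repeat_nmi assembly above implements looks roughly like the pseudo-code below. This is only a sketch: every helper name is invented, and the real code has to stay in assembly because it rewrites the interrupt frame on the NMI stack in place.

    /* Pseudo-C sketch of the nested-NMI handling in entry_64.S above */
    nmi_entry()
    {
            if (nmi_executing_variable_set() || interrupted_stack_is_nmi_stack()) {
                    /* Nested NMI: do not run now, make the first NMI repeat. */
                    if (!interrupted_the_repeat_nmi_fixup())
                            point_copied_frame_at(repeat_nmi);
                    return;                 /* iret back into the first NMI */
            }

            set_nmi_executing_variable();
            copy_frame_to_saved_slot();     /* pristine copy, never modified */
            copy_frame_to_copy_slot();      /* working copy, may be rewritten */

    restart_nmi:
            do_nmi(regs);
            clear_nmi_executing_variable();
            iret_from_copy_slot();          /* lands in repeat_nmi if rewritten */
    }

    repeat_nmi()
    {
            set_nmi_executing_variable();
            restore_copy_slot_from_saved_slot();
            goto restart_nmi;
    }
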
arch/x86/kernel/head_64.S
... ... @@ -417,6 +417,10 @@
417 417 ENTRY(idt_table)
418 418 .skip IDT_ENTRIES * 16
419 419  
  420 + .align L1_CACHE_BYTES
  421 +ENTRY(nmi_idt_table)
  422 + .skip IDT_ENTRIES * 16
  423 +
420 424 __PAGE_ALIGNED_BSS
421 425 .align PAGE_SIZE
422 426 ENTRY(empty_zero_page)
arch/x86/kernel/nmi.c
... ... @@ -405,9 +405,108 @@
405 405 unknown_nmi_error(reason, regs);
406 406 }
407 407  
  408 +/*
  409 + * NMIs can hit breakpoints, which will cause them to lose their
  410 + * NMI context with the CPU when the breakpoint does an iret.
  411 + */
  412 +#ifdef CONFIG_X86_32
  413 +/*
  414 + * For i386, NMIs use the same stack as the kernel, and we can
  415 + * add a workaround to the iret problem in C. Simply have 3 states
  416 + * the NMI can be in.
  417 + *
  418 + * 1) not running
  419 + * 2) executing
  420 + * 3) latched
  421 + *
  422 + * When no NMI is in progress, it is in the "not running" state.
  423 + * When an NMI comes in, it goes into the "executing" state.
  424 + * Normally, if another NMI is triggered, it does not interrupt
  425 + * the running NMI and the HW will simply latch it so that when
  426 + * the first NMI finishes, it will restart the second NMI.
  427 + * (Note, the latch is binary, thus multiple NMIs triggering,
  428 + * when one is running, are ignored. Only one NMI is restarted.)
  429 + *
  430 + * If an NMI hits a breakpoint that executes an iret, another
  431 + * NMI can preempt it. We do not want to allow this new NMI
  432 + * to run, but we want to execute it when the first one finishes.
  433 + * We set the state to "latched", and the first NMI will perform
  434 + * a cmpxchg on the state, and if it doesn't successfully
  435 + * reset the state to "not running" it will restart the next
  436 + * NMI.
  437 + */
  438 +enum nmi_states {
  439 + NMI_NOT_RUNNING,
  440 + NMI_EXECUTING,
  441 + NMI_LATCHED,
  442 +};
  443 +static DEFINE_PER_CPU(enum nmi_states, nmi_state);
  444 +
  445 +#define nmi_nesting_preprocess(regs) \
  446 + do { \
  447 + if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \
  448 + __get_cpu_var(nmi_state) = NMI_LATCHED; \
  449 + return; \
  450 + } \
  451 + nmi_restart: \
  452 + __get_cpu_var(nmi_state) = NMI_EXECUTING; \
  453 + } while (0)
  454 +
  455 +#define nmi_nesting_postprocess() \
  456 + do { \
  457 + if (cmpxchg(&__get_cpu_var(nmi_state), \
  458 + NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \
  459 + goto nmi_restart; \
  460 + } while (0)
  461 +#else /* x86_64 */
  462 +/*
  463 + * In x86_64 things are a bit more difficult. This has the same problem
  464 + * where an NMI hitting a breakpoint that calls iret will remove the
  465 + * NMI context, allowing a nested NMI to enter. What makes this more
  466 + * difficult is that both NMIs and breakpoints have their own stack.
  467 + * When a new NMI or breakpoint is executed, the stack is set to a fixed
  468 + * point. If an NMI is nested, it will have its stack set at that same
  469 + * fixed address that the first NMI had, and will start corrupting the
  470 + * stack. This is handled in entry_64.S, but the same problem exists with
  471 + * the breakpoint stack.
  472 + *
  473 + * If a breakpoint is being processed and the debug stack is in use,
  474 + * and an NMI comes in and also hits a breakpoint, the stack pointer
  475 + * will be set to the same fixed address as that of the breakpoint that
  476 + * was interrupted, corrupting that stack.
  477 + * check if the stack that was interrupted is the debug stack, and if
  478 + * so, change the IDT so that new breakpoints will use the current stack
  479 + * and not switch to the fixed address. On return of the NMI, switch back
  480 + * to the original IDT.
  481 + */
  482 +static DEFINE_PER_CPU(int, update_debug_stack);
  483 +
  484 +static inline void nmi_nesting_preprocess(struct pt_regs *regs)
  485 +{
  486 + /*
  487 + * If we interrupted a breakpoint, it is possible that
  488 + * the nmi handler will have breakpoints too. We need to
  489 + * change the IDT such that breakpoints that happen here
  490 + * continue to use the NMI stack.
  491 + */
  492 + if (unlikely(is_debug_stack(regs->sp))) {
  493 + debug_stack_set_zero();
  494 + __get_cpu_var(update_debug_stack) = 1;
  495 + }
  496 +}
  497 +
  498 +static inline void nmi_nesting_postprocess(void)
  499 +{
  500 + if (unlikely(__get_cpu_var(update_debug_stack)))
  501 + debug_stack_reset();
  502 +}
  503 +#endif
  504 +
408 505 dotraplinkage notrace __kprobes void
409 506 do_nmi(struct pt_regs *regs, long error_code)
410 507 {
  508 + nmi_nesting_preprocess(regs);
  509 +
411 510 nmi_enter();
412 511  
413 512 inc_irq_stat(__nmi_count);
... ... @@ -416,6 +515,9 @@
416 515 default_do_nmi(regs);
417 516  
418 517 nmi_exit();
  518 +
  519 + /* On i386, may loop back to preprocess */
  520 + nmi_nesting_postprocess();
419 521 }
420 522  
421 523 void stop_nmi(void)
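
Since the i386 scheme above is plain C, its NOT_RUNNING/EXECUTING/LATCHED latch can be modeled outside the kernel. A small self-contained sketch (user space only; __sync_val_compare_and_swap stands in for the kernel's cmpxchg(), and the "nested" NMI is injected by hand):

    #include <stdio.h>

    enum { NMI_NOT_RUNNING, NMI_EXECUTING, NMI_LATCHED };
    static int nmi_state;

    /* An NMI arriving while another is executing only latches itself. */
    static int nmi_arrives(void)
    {
            if (nmi_state != NMI_NOT_RUNNING) {
                    nmi_state = NMI_LATCHED;
                    return 0;               /* do not run now */
            }
            return 1;
    }

    static void nmi_handler(void)
    {
            static int injected;
    restart:
            nmi_state = NMI_EXECUTING;
            printf("handling NMI\n");
            if (!injected) {
                    /* Pretend a breakpoint's iret let a second NMI sneak in. */
                    injected = 1;
                    nmi_arrives();
            }
            /* Leave only if nobody latched a new NMI in the meantime. */
            if (__sync_val_compare_and_swap(&nmi_state, NMI_EXECUTING,
                                            NMI_NOT_RUNNING) != NMI_EXECUTING)
                    goto restart;
    }

    int main(void)
    {
            if (nmi_arrives())
                    nmi_handler();          /* prints "handling NMI" twice */
            return 0;
    }
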
arch/x86/kernel/traps.c
... ... @@ -311,9 +311,15 @@
311 311 == NOTIFY_STOP)
312 312 return;
313 313  
  314 + /*
  315 + * Let others (NMI) know that the debug stack is in use
  316 + * as we may switch to the interrupt stack.
  317 + */
  318 + debug_stack_usage_inc();
314 319 preempt_conditional_sti(regs);
315 320 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
316 321 preempt_conditional_cli(regs);
  322 + debug_stack_usage_dec();
317 323 }
318 324  
319 325 #ifdef CONFIG_X86_64
... ... @@ -406,6 +412,12 @@
406 412 SIGTRAP) == NOTIFY_STOP)
407 413 return;
408 414  
  415 + /*
  416 + * Let others (NMI) know that the debug stack is in use
  417 + * as we may switch to the interrupt stack.
  418 + */
  419 + debug_stack_usage_inc();
  420 +
409 421 /* It's safe to allow irq's after DR6 has been saved */
410 422 preempt_conditional_sti(regs);
411 423  
... ... @@ -413,6 +425,7 @@
413 425 handle_vm86_trap((struct kernel_vm86_regs *) regs,
414 426 error_code, 1);
415 427 preempt_conditional_cli(regs);
  428 + debug_stack_usage_dec();
416 429 return;
417 430 }
418 431  
... ... @@ -432,6 +445,7 @@
432 445 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
433 446 send_sigtrap(tsk, regs, error_code, si_code);
434 447 preempt_conditional_cli(regs);
  448 + debug_stack_usage_dec();
435 449  
436 450 return;
437 451 }
... ... @@ -718,5 +732,11 @@
718 732 cpu_init();
719 733  
720 734 x86_init.irqs.trap_init();
  735 +
  736 +#ifdef CONFIG_X86_64
  737 + memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
  738 + set_nmi_gate(1, &debug);
  739 + set_nmi_gate(3, &int3);
  740 +#endif
721 741 }
include/linux/compiler-gcc.h
... ... @@ -50,6 +50,11 @@
50 50 # define inline inline __attribute__((always_inline))
51 51 # define __inline__ __inline__ __attribute__((always_inline))
52 52 # define __inline __inline __attribute__((always_inline))
  53 +#else
  54 +/* A lot of inline functions can cause havoc with function tracing */
  55 +# define inline inline notrace
  56 +# define __inline__ __inline__ notrace
  57 +# define __inline __inline notrace
53 58 #endif
54 59  
55 60 #define __deprecated __attribute__((deprecated))
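
What the new #else branch of compiler-gcc.h buys: when gcc is free to ignore the inline hint, any out-of-line copy it emits of an inline function would otherwise get mcount instrumentation and show up in the function tracer; tagging the keyword itself keeps those copies out. Assuming notrace expands to __attribute__((no_instrument_function)), as defined in linux/compiler.h, a declaration such as:

    static inline int add_one(int x)
    {
            return x + 1;
    }

now preprocesses to, roughly:

    static inline notrace int add_one(int x)
    {
            return x + 1;
    }
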
include/linux/ftrace.h
... ... @@ -133,6 +133,8 @@
133 133 int ftrace_arch_code_modify_prepare(void);
134 134 int ftrace_arch_code_modify_post_process(void);
135 135  
  136 +void ftrace_bug(int err, unsigned long ip);
  137 +
136 138 struct seq_file;
137 139  
138 140 struct ftrace_probe_ops {
... ... @@ -161,7 +163,6 @@
161 163  
162 164 enum {
163 165 FTRACE_FL_ENABLED = (1 << 30),
164   - FTRACE_FL_FREE = (1 << 31),
165 166 };
166 167  
167 168 #define FTRACE_FL_MASK (0x3UL << 30)
... ... @@ -172,10 +173,7 @@
172 173 unsigned long ip; /* address of mcount call-site */
173 174 struct dyn_ftrace *freelist;
174 175 };
175   - union {
176   - unsigned long flags;
177   - struct dyn_ftrace *newlist;
178   - };
  176 + unsigned long flags;
179 177 struct dyn_arch_ftrace arch;
180 178 };
181 179  
... ... @@ -190,6 +188,56 @@
190 188 int register_ftrace_command(struct ftrace_func_command *cmd);
191 189 int unregister_ftrace_command(struct ftrace_func_command *cmd);
192 190  
  191 +enum {
  192 + FTRACE_UPDATE_CALLS = (1 << 0),
  193 + FTRACE_DISABLE_CALLS = (1 << 1),
  194 + FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
  195 + FTRACE_START_FUNC_RET = (1 << 3),
  196 + FTRACE_STOP_FUNC_RET = (1 << 4),
  197 +};
  198 +
  199 +enum {
  200 + FTRACE_UPDATE_IGNORE,
  201 + FTRACE_UPDATE_MAKE_CALL,
  202 + FTRACE_UPDATE_MAKE_NOP,
  203 +};
  204 +
  205 +enum {
  206 + FTRACE_ITER_FILTER = (1 << 0),
  207 + FTRACE_ITER_NOTRACE = (1 << 1),
  208 + FTRACE_ITER_PRINTALL = (1 << 2),
  209 + FTRACE_ITER_DO_HASH = (1 << 3),
  210 + FTRACE_ITER_HASH = (1 << 4),
  211 + FTRACE_ITER_ENABLED = (1 << 5),
  212 +};
  213 +
  214 +void arch_ftrace_update_code(int command);
  215 +
  216 +struct ftrace_rec_iter;
  217 +
  218 +struct ftrace_rec_iter *ftrace_rec_iter_start(void);
  219 +struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter);
  220 +struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter);
  221 +
  222 +int ftrace_update_record(struct dyn_ftrace *rec, int enable);
  223 +int ftrace_test_record(struct dyn_ftrace *rec, int enable);
  224 +void ftrace_run_stop_machine(int command);
  225 +int ftrace_location(unsigned long ip);
  226 +
  227 +extern ftrace_func_t ftrace_trace_function;
  228 +
  229 +int ftrace_regex_open(struct ftrace_ops *ops, int flag,
  230 + struct inode *inode, struct file *file);
  231 +ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf,
  232 + size_t cnt, loff_t *ppos);
  233 +ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf,
  234 + size_t cnt, loff_t *ppos);
  235 +loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin);
  236 +int ftrace_regex_release(struct inode *inode, struct file *file);
  237 +
  238 +void __init
  239 +ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable);
  240 +
193 241 /* defined in arch */
194 242 extern int ftrace_ip_converted(unsigned long ip);
195 243 extern int ftrace_dyn_arch_init(void *data);
... ... @@ -284,6 +332,25 @@
284 332 {
285 333 return 0;
286 334 }
  335 +
  336 +/*
  337 + * Again, users of functions that have ftrace_ops may not
  338 + * have them defined when ftrace is not enabled, but these
  339 + * functions may still be called. Use a macro instead of inline.
  340 + */
  341 +#define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; })
  342 +#define ftrace_set_early_filter(ops, buf, enable) do { } while (0)
  343 +
  344 +static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf,
  345 + size_t cnt, loff_t *ppos) { return -ENODEV; }
  346 +static inline ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf,
  347 + size_t cnt, loff_t *ppos) { return -ENODEV; }
  348 +static inline loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
  349 +{
  350 + return -ENODEV;
  351 +}
  352 +static inline int
  353 +ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; }
287 354 #endif /* CONFIG_DYNAMIC_FTRACE */
288 355  
289 356 /* totally disable ftrace - can not re-enable after this */
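
The iterator and record-update primitives declared above let architecture code walk every mcount site itself rather than relying on the generic loop. A sketch of how an architecture's arch_ftrace_update_code() might use them; the arch_make_call()/arch_make_nop() patch helpers and the mapping of command bits to the enable flag are hypothetical:

    void arch_ftrace_update_code(int command)
    {
            struct ftrace_rec_iter *iter;
            struct dyn_ftrace *rec;
            int enable = !!(command & FTRACE_UPDATE_CALLS);   /* assumed mapping */
            int ret;

            for (iter = ftrace_rec_iter_start(); iter;
                 iter = ftrace_rec_iter_next(iter)) {
                    rec = ftrace_rec_iter_record(iter);

                    /* Decide (and mark) what this call site should become. */
                    ret = ftrace_update_record(rec, enable);
                    switch (ret) {
                    case FTRACE_UPDATE_IGNORE:
                            continue;
                    case FTRACE_UPDATE_MAKE_CALL:
                            ret = arch_make_call(rec);      /* hypothetical */
                            break;
                    case FTRACE_UPDATE_MAKE_NOP:
                            ret = arch_make_nop(rec);       /* hypothetical */
                            break;
                    }
                    if (ret)
                            ftrace_bug(ret, rec->ip);       /* now exported above */
            }
    }
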
kernel/trace/ftrace.c
... ... @@ -22,11 +22,13 @@
22 22 #include <linux/hardirq.h>
23 23 #include <linux/kthread.h>
24 24 #include <linux/uaccess.h>
  25 +#include <linux/bsearch.h>
25 26 #include <linux/module.h>
26 27 #include <linux/ftrace.h>
27 28 #include <linux/sysctl.h>
28 29 #include <linux/slab.h>
29 30 #include <linux/ctype.h>
  31 +#include <linux/sort.h>
30 32 #include <linux/list.h>
31 33 #include <linux/hash.h>
32 34 #include <linux/rcupdate.h>
... ... @@ -947,13 +949,6 @@
947 949 struct rcu_head rcu;
948 950 };
949 951  
950   -enum {
951   - FTRACE_ENABLE_CALLS = (1 << 0),
952   - FTRACE_DISABLE_CALLS = (1 << 1),
953   - FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
954   - FTRACE_START_FUNC_RET = (1 << 3),
955   - FTRACE_STOP_FUNC_RET = (1 << 4),
956   -};
957 952 struct ftrace_func_entry {
958 953 struct hlist_node hlist;
959 954 unsigned long ip;
960 955  
961 956  
962 957  
963 958  
964 959  
... ... @@ -984,26 +979,30 @@
984 979 .filter_hash = EMPTY_HASH,
985 980 };
986 981  
987   -static struct dyn_ftrace *ftrace_new_addrs;
988   -
989 982 static DEFINE_MUTEX(ftrace_regex_lock);
990 983  
991 984 struct ftrace_page {
992 985 struct ftrace_page *next;
  986 + struct dyn_ftrace *records;
993 987 int index;
994   - struct dyn_ftrace records[];
  988 + int size;
995 989 };
996 990  
997   -#define ENTRIES_PER_PAGE \
998   - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
  991 +static struct ftrace_page *ftrace_new_pgs;
999 992  
  993 +#define ENTRY_SIZE sizeof(struct dyn_ftrace)
  994 +#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
  995 +
1000 996 /* estimate from running different kernels */
1001 997 #define NR_TO_INIT 10000
1002 998  
1003 999 static struct ftrace_page *ftrace_pages_start;
1004 1000 static struct ftrace_page *ftrace_pages;
1005 1001  
1006   -static struct dyn_ftrace *ftrace_free_records;
  1002 +static bool ftrace_hash_empty(struct ftrace_hash *hash)
  1003 +{
  1004 + return !hash || !hash->count;
  1005 +}
1007 1006  
1008 1007 static struct ftrace_func_entry *
1009 1008 ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
... ... @@ -1013,7 +1012,7 @@
1013 1012 struct hlist_head *hhd;
1014 1013 struct hlist_node *n;
1015 1014  
1016   - if (!hash->count)
  1015 + if (ftrace_hash_empty(hash))
1017 1016 return NULL;
1018 1017  
1019 1018 if (hash->size_bits > 0)
... ... @@ -1157,7 +1156,7 @@
1157 1156 return NULL;
1158 1157  
1159 1158 /* Empty hash? */
1160   - if (!hash || !hash->count)
  1159 + if (ftrace_hash_empty(hash))
1161 1160 return new_hash;
1162 1161  
1163 1162 size = 1 << hash->size_bits;
1164 1163  
... ... @@ -1282,9 +1281,9 @@
1282 1281 filter_hash = rcu_dereference_raw(ops->filter_hash);
1283 1282 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1284 1283  
1285   - if ((!filter_hash || !filter_hash->count ||
  1284 + if ((ftrace_hash_empty(filter_hash) ||
1286 1285 ftrace_lookup_ip(filter_hash, ip)) &&
1287   - (!notrace_hash || !notrace_hash->count ||
  1286 + (ftrace_hash_empty(notrace_hash) ||
1288 1287 !ftrace_lookup_ip(notrace_hash, ip)))
1289 1288 ret = 1;
1290 1289 else
... ... @@ -1307,6 +1306,47 @@
1307 1306 } \
1308 1307 }
1309 1308  
  1309 +
  1310 +static int ftrace_cmp_recs(const void *a, const void *b)
  1311 +{
  1312 + const struct dyn_ftrace *reca = a;
  1313 + const struct dyn_ftrace *recb = b;
  1314 +
  1315 + if (reca->ip > recb->ip)
  1316 + return 1;
  1317 + if (reca->ip < recb->ip)
  1318 + return -1;
  1319 + return 0;
  1320 +}
  1321 +
  1322 +/**
  1323 + * ftrace_location - return true if the given ip is a traced location
  1324 + * @ip: the instruction pointer to check
  1325 + *
  1326 + * Returns 1 if the given @ip is a pointer to a ftrace location.
  1327 + * That is, the instruction that is either a NOP or call to
  1328 + * the function tracer. It checks the ftrace internal tables to
  1329 + * determine if the address belongs or not.
  1330 + */
  1331 +int ftrace_location(unsigned long ip)
  1332 +{
  1333 + struct ftrace_page *pg;
  1334 + struct dyn_ftrace *rec;
  1335 + struct dyn_ftrace key;
  1336 +
  1337 + key.ip = ip;
  1338 +
  1339 + for (pg = ftrace_pages_start; pg; pg = pg->next) {
  1340 + rec = bsearch(&key, pg->records, pg->index,
  1341 + sizeof(struct dyn_ftrace),
  1342 + ftrace_cmp_recs);
  1343 + if (rec)
  1344 + return 1;
  1345 + }
  1346 +
  1347 + return 0;
  1348 +}
  1349 +
1310 1350 static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1311 1351 int filter_hash,
1312 1352 bool inc)
... ... @@ -1336,7 +1376,7 @@
1336 1376 if (filter_hash) {
1337 1377 hash = ops->filter_hash;
1338 1378 other_hash = ops->notrace_hash;
1339   - if (!hash || !hash->count)
  1379 + if (ftrace_hash_empty(hash))
1340 1380 all = 1;
1341 1381 } else {
1342 1382 inc = !inc;
... ... @@ -1346,7 +1386,7 @@
1346 1386 * If the notrace hash has no items,
1347 1387 * then there's nothing to do.
1348 1388 */
1349   - if (hash && !hash->count)
  1389 + if (ftrace_hash_empty(hash))
1350 1390 return;
1351 1391 }
1352 1392  
... ... @@ -1363,8 +1403,8 @@
1363 1403 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1364 1404 match = 1;
1365 1405 } else {
1366   - in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
1367   - in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
  1406 + in_hash = !!ftrace_lookup_ip(hash, rec->ip);
  1407 + in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
1368 1408  
1369 1409 /*
1370 1410 *
... ... @@ -1372,7 +1412,7 @@
1372 1412 if (filter_hash && in_hash && !in_other_hash)
1373 1413 match = 1;
1374 1414 else if (!filter_hash && in_hash &&
1375   - (in_other_hash || !other_hash->count))
  1415 + (in_other_hash || ftrace_hash_empty(other_hash)))
1376 1416 match = 1;
1377 1417 }
1378 1418 if (!match)
1379 1419  
1380 1420  
... ... @@ -1406,40 +1446,12 @@
1406 1446 __ftrace_hash_rec_update(ops, filter_hash, 1);
1407 1447 }
1408 1448  
1409   -static void ftrace_free_rec(struct dyn_ftrace *rec)
1410   -{
1411   - rec->freelist = ftrace_free_records;
1412   - ftrace_free_records = rec;
1413   - rec->flags |= FTRACE_FL_FREE;
1414   -}
1415   -
1416 1449 static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
1417 1450 {
1418   - struct dyn_ftrace *rec;
1419   -
1420   - /* First check for freed records */
1421   - if (ftrace_free_records) {
1422   - rec = ftrace_free_records;
1423   -
1424   - if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
1425   - FTRACE_WARN_ON_ONCE(1);
1426   - ftrace_free_records = NULL;
  1451 + if (ftrace_pages->index == ftrace_pages->size) {
  1452 + /* We should have allocated enough */
  1453 + if (WARN_ON(!ftrace_pages->next))
1427 1454 return NULL;
1428   - }
1429   -
1430   - ftrace_free_records = rec->freelist;
1431   - memset(rec, 0, sizeof(*rec));
1432   - return rec;
1433   - }
1434   -
1435   - if (ftrace_pages->index == ENTRIES_PER_PAGE) {
1436   - if (!ftrace_pages->next) {
1437   - /* allocate another page */
1438   - ftrace_pages->next =
1439   - (void *)get_zeroed_page(GFP_KERNEL);
1440   - if (!ftrace_pages->next)
1441   - return NULL;
1442   - }
1443 1455 ftrace_pages = ftrace_pages->next;
1444 1456 }
1445 1457  
... ... @@ -1459,8 +1471,6 @@
1459 1471 return NULL;
1460 1472  
1461 1473 rec->ip = ip;
1462   - rec->newlist = ftrace_new_addrs;
1463   - ftrace_new_addrs = rec;
1464 1474  
1465 1475 return rec;
1466 1476 }
... ... @@ -1475,7 +1485,19 @@
1475 1485 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1476 1486 }
1477 1487  
1478   -static void ftrace_bug(int failed, unsigned long ip)
  1488 +/**
  1489 + * ftrace_bug - report and shutdown function tracer
  1490 + * @failed: The failed type (EFAULT, EINVAL, EPERM)
  1491 + * @ip: The address that failed
  1492 + *
  1493 + * The arch code that enables or disables the function tracing
  1494 + * can call ftrace_bug() when it has detected a problem in
  1495 + * modifying the code. @failed should be one of either:
  1496 + * EFAULT - if the problem happens on reading the @ip address
  1497 + * EINVAL - if what is read at @ip is not what was expected
  1498 + * EPERM - if the problem happens on writting to the @ip address
  1499 + */
  1500 +void ftrace_bug(int failed, unsigned long ip)
1479 1501 {
1480 1502 switch (failed) {
1481 1503 case -EFAULT:
1482 1504  
1483 1505  
1484 1506  
1485 1507  
... ... @@ -1517,24 +1539,19 @@
1517 1539 return 0;
1518 1540 }
1519 1541  
1520   -
1521   -static int
1522   -__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
  1542 +static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1523 1543 {
1524   - unsigned long ftrace_addr;
1525 1544 unsigned long flag = 0UL;
1526 1545  
1527   - ftrace_addr = (unsigned long)FTRACE_ADDR;
1528   -
1529 1546 /*
1530   - * If we are enabling tracing:
  1547 + * If we are updating calls:
1531 1548 *
1532 1549 * If the record has a ref count, then we need to enable it
1533 1550 * because someone is using it.
1534 1551 *
1535 1552 * Otherwise we make sure its disabled.
1536 1553 *
1537   - * If we are disabling tracing, then disable all records that
  1554 + * If we are disabling calls, then disable all records that
1538 1555 * are enabled.
1539 1556 */
1540 1557 if (enable && (rec->flags & ~FTRACE_FL_MASK))
1541 1558  
1542 1559  
1543 1560  
1544 1561  
... ... @@ -1542,18 +1559,72 @@
1542 1559  
1543 1560 /* If the state of this record hasn't changed, then do nothing */
1544 1561 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1545   - return 0;
  1562 + return FTRACE_UPDATE_IGNORE;
1546 1563  
1547 1564 if (flag) {
1548   - rec->flags |= FTRACE_FL_ENABLED;
  1565 + if (update)
  1566 + rec->flags |= FTRACE_FL_ENABLED;
  1567 + return FTRACE_UPDATE_MAKE_CALL;
  1568 + }
  1569 +
  1570 + if (update)
  1571 + rec->flags &= ~FTRACE_FL_ENABLED;
  1572 +
  1573 + return FTRACE_UPDATE_MAKE_NOP;
  1574 +}
  1575 +
  1576 +/**
  1577 + * ftrace_update_record - set a record that is now tracing or not
  1578 + * @rec: the record to update
  1579 + * @enable: set to 1 if the record is tracing, zero to force disable
  1580 + *
  1581 + * The records that represent all functions that can be traced need
  1582 + * to be updated when tracing has been enabled.
  1583 + */
  1584 +int ftrace_update_record(struct dyn_ftrace *rec, int enable)
  1585 +{
  1586 + return ftrace_check_record(rec, enable, 1);
  1587 +}
  1588 +
  1589 +/**
  1590 + * ftrace_test_record - check if the record has been enabled or not
  1591 + * @rec: the record to test
  1592 + * @enable: set to 1 to check if enabled, 0 if it is disabled
  1593 + *
  1594 + * The arch code may need to test if a record is already set to
  1595 + * tracing to determine how to modify the function code that it
  1596 + * represents.
  1597 + */
  1598 +int ftrace_test_record(struct dyn_ftrace *rec, int enable)
  1599 +{
  1600 + return ftrace_check_record(rec, enable, 0);
  1601 +}
  1602 +
  1603 +static int
  1604 +__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
  1605 +{
  1606 + unsigned long ftrace_addr;
  1607 + int ret;
  1608 +
  1609 + ftrace_addr = (unsigned long)FTRACE_ADDR;
  1610 +
  1611 + ret = ftrace_update_record(rec, enable);
  1612 +
  1613 + switch (ret) {
  1614 + case FTRACE_UPDATE_IGNORE:
  1615 + return 0;
  1616 +
  1617 + case FTRACE_UPDATE_MAKE_CALL:
1549 1618 return ftrace_make_call(rec, ftrace_addr);
  1619 +
  1620 + case FTRACE_UPDATE_MAKE_NOP:
  1621 + return ftrace_make_nop(NULL, rec, ftrace_addr);
1550 1622 }
1551 1623  
1552   - rec->flags &= ~FTRACE_FL_ENABLED;
1553   - return ftrace_make_nop(NULL, rec, ftrace_addr);
  1624 + return -1; /* unknown ftrace bug */
1554 1625 }
1555 1626  
1556   -static void ftrace_replace_code(int enable)
  1627 +static void ftrace_replace_code(int update)
1557 1628 {
1558 1629 struct dyn_ftrace *rec;
1559 1630 struct ftrace_page *pg;
... ... @@ -1563,11 +1634,7 @@
1563 1634 return;
1564 1635  
1565 1636 do_for_each_ftrace_rec(pg, rec) {
1566   - /* Skip over free records */
1567   - if (rec->flags & FTRACE_FL_FREE)
1568   - continue;
1569   -
1570   - failed = __ftrace_replace_code(rec, enable);
  1637 + failed = __ftrace_replace_code(rec, update);
1571 1638 if (failed) {
1572