Commit bc2b0331e077f576369a2b6c75d15ed4de4ef91f

Authored by K. Y. Srinivasan
Committed by H. Peter Anvin
1 parent db34bbb767

X86: Handle Hyper-V vmbus interrupts as special hypervisor interrupts

Starting with win8, vmbus interrupts can be delivered on any VCPU in the guest
and furthermore can be concurrently active on multiple VCPUs. Support this
interrupt delivery model by setting up a separate IDT entry for Hyper-V vmbus
interrupts. I would like to thank Jan Beulich <JBeulich@suse.com> and
Thomas Gleixner <tglx@linutronix.de> for their help.

In this version of the patch, based on the feedback, I have merged the IDT
vector for Xen and Hyper-V and made the necessary adjustments. Furthermore,
based on Jan's feedback, I have added the necessary compilation switches.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Link: http://lkml.kernel.org/r/1359940959-32168-3-git-send-email-kys@microsoft.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
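
The flow this patch introduces: the hypervisor raises HYPERVISOR_CALLBACK_VECTOR on
whichever VCPU it chooses, the IDT stub enters hyperv_vector_handler(), and that
handler dispatches the irq previously recorded via hv_register_vmbus_handler()
through the generic IRQ layer. A rough, hypothetical sketch of how the vmbus driver
side might plug into the new hook (the real driver wiring is a separate patch of
this series; the irq number, handler and function names below are placeholders):

#include <linux/interrupt.h>
#include <asm/mshyperv.h>

/* Placeholder ISR - the real one lives in the vmbus driver. */
static irqreturn_t vmbus_isr_stub(int irq, void *dev_id)
{
	/* Read the synIC message/event pages, schedule channel work, ... */
	return IRQ_HANDLED;
}

static int vmbus_irq_setup(int irq)
{
	int ret;

	/* Install the handler on the irq as for any other device. */
	ret = request_irq(irq, vmbus_isr_stub, 0, "vmbus", NULL);
	if (ret)
		return ret;

	/*
	 * Record the irq so hyperv_vector_handler() can look up its
	 * irq_desc and dispatch it whenever HYPERVISOR_CALLBACK_VECTOR
	 * fires, on whichever VCPU the hypervisor picked.
	 */
	hv_register_vmbus_handler(irq, vmbus_isr_stub);
	return 0;
}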

Showing 6 changed files with 68 additions and 7 deletions

arch/x86/include/asm/irq_vectors.h
1 #ifndef _ASM_X86_IRQ_VECTORS_H 1 #ifndef _ASM_X86_IRQ_VECTORS_H
2 #define _ASM_X86_IRQ_VECTORS_H 2 #define _ASM_X86_IRQ_VECTORS_H
3 3
4 #include <linux/threads.h> 4 #include <linux/threads.h>
5 /* 5 /*
6 * Linux IRQ vector layout. 6 * Linux IRQ vector layout.
7 * 7 *
8 * There are 256 IDT entries (per CPU - each entry is 8 bytes) which can 8 * There are 256 IDT entries (per CPU - each entry is 8 bytes) which can
9 * be defined by Linux. They are used as a jump table by the CPU when a 9 * be defined by Linux. They are used as a jump table by the CPU when a
10 * given vector is triggered - by a CPU-external, CPU-internal or 10 * given vector is triggered - by a CPU-external, CPU-internal or
11 * software-triggered event. 11 * software-triggered event.
12 * 12 *
13 * Linux sets the kernel code address each entry jumps to early during 13 * Linux sets the kernel code address each entry jumps to early during
14 * bootup, and never changes them. This is the general layout of the 14 * bootup, and never changes them. This is the general layout of the
15 * IDT entries: 15 * IDT entries:
16 * 16 *
17 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events 17 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
18 * Vectors 32 ... 127 : device interrupts 18 * Vectors 32 ... 127 : device interrupts
19 * Vector 128 : legacy int80 syscall interface 19 * Vector 128 : legacy int80 syscall interface
20 * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts 20 * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts
21 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts 21 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
22 * 22 *
23 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. 23 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
24 * 24 *
25 * This file enumerates the exact layout of them: 25 * This file enumerates the exact layout of them:
26 */ 26 */
27 27
28 #define NMI_VECTOR 0x02 28 #define NMI_VECTOR 0x02
29 #define MCE_VECTOR 0x12 29 #define MCE_VECTOR 0x12
30 30
31 /* 31 /*
32 * IDT vectors usable for external interrupt sources start at 0x20. 32 * IDT vectors usable for external interrupt sources start at 0x20.
33 * (0x80 is the syscall vector, 0x30-0x3f are for ISA) 33 * (0x80 is the syscall vector, 0x30-0x3f are for ISA)
34 */ 34 */
35 #define FIRST_EXTERNAL_VECTOR 0x20 35 #define FIRST_EXTERNAL_VECTOR 0x20
36 /* 36 /*
37 * We start allocating at 0x21 to spread out vectors evenly between 37 * We start allocating at 0x21 to spread out vectors evenly between
38 * priority levels. (0x80 is the syscall vector) 38 * priority levels. (0x80 is the syscall vector)
39 */ 39 */
40 #define VECTOR_OFFSET_START 1 40 #define VECTOR_OFFSET_START 1
41 41
42 /* 42 /*
43 * Reserve the lowest usable vector (and hence lowest priority) 0x20 for 43 * Reserve the lowest usable vector (and hence lowest priority) 0x20 for
44 * triggering cleanup after irq migration. 0x21-0x2f will still be used 44 * triggering cleanup after irq migration. 0x21-0x2f will still be used
45 * for device interrupts. 45 * for device interrupts.
46 */ 46 */
47 #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR 47 #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
48 48
49 #define IA32_SYSCALL_VECTOR 0x80 49 #define IA32_SYSCALL_VECTOR 0x80
50 #ifdef CONFIG_X86_32 50 #ifdef CONFIG_X86_32
51 # define SYSCALL_VECTOR 0x80 51 # define SYSCALL_VECTOR 0x80
52 #endif 52 #endif
53 53
54 /* 54 /*
55 * Vectors 0x30-0x3f are used for ISA interrupts. 55 * Vectors 0x30-0x3f are used for ISA interrupts.
56 * round up to the next 16-vector boundary 56 * round up to the next 16-vector boundary
57 */ 57 */
58 #define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15) 58 #define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15)
59 59
60 #define IRQ1_VECTOR (IRQ0_VECTOR + 1) 60 #define IRQ1_VECTOR (IRQ0_VECTOR + 1)
61 #define IRQ2_VECTOR (IRQ0_VECTOR + 2) 61 #define IRQ2_VECTOR (IRQ0_VECTOR + 2)
62 #define IRQ3_VECTOR (IRQ0_VECTOR + 3) 62 #define IRQ3_VECTOR (IRQ0_VECTOR + 3)
63 #define IRQ4_VECTOR (IRQ0_VECTOR + 4) 63 #define IRQ4_VECTOR (IRQ0_VECTOR + 4)
64 #define IRQ5_VECTOR (IRQ0_VECTOR + 5) 64 #define IRQ5_VECTOR (IRQ0_VECTOR + 5)
65 #define IRQ6_VECTOR (IRQ0_VECTOR + 6) 65 #define IRQ6_VECTOR (IRQ0_VECTOR + 6)
66 #define IRQ7_VECTOR (IRQ0_VECTOR + 7) 66 #define IRQ7_VECTOR (IRQ0_VECTOR + 7)
67 #define IRQ8_VECTOR (IRQ0_VECTOR + 8) 67 #define IRQ8_VECTOR (IRQ0_VECTOR + 8)
68 #define IRQ9_VECTOR (IRQ0_VECTOR + 9) 68 #define IRQ9_VECTOR (IRQ0_VECTOR + 9)
69 #define IRQ10_VECTOR (IRQ0_VECTOR + 10) 69 #define IRQ10_VECTOR (IRQ0_VECTOR + 10)
70 #define IRQ11_VECTOR (IRQ0_VECTOR + 11) 70 #define IRQ11_VECTOR (IRQ0_VECTOR + 11)
71 #define IRQ12_VECTOR (IRQ0_VECTOR + 12) 71 #define IRQ12_VECTOR (IRQ0_VECTOR + 12)
72 #define IRQ13_VECTOR (IRQ0_VECTOR + 13) 72 #define IRQ13_VECTOR (IRQ0_VECTOR + 13)
73 #define IRQ14_VECTOR (IRQ0_VECTOR + 14) 73 #define IRQ14_VECTOR (IRQ0_VECTOR + 14)
74 #define IRQ15_VECTOR (IRQ0_VECTOR + 15) 74 #define IRQ15_VECTOR (IRQ0_VECTOR + 15)
75 75
76 /* 76 /*
77 * Special IRQ vectors used by the SMP architecture, 0xf0-0xff 77 * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
78 * 78 *
79 * some of the following vectors are 'rare', they are merged 79 * some of the following vectors are 'rare', they are merged
80 * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. 80 * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
81 * TLB, reschedule and local APIC vectors are performance-critical. 81 * TLB, reschedule and local APIC vectors are performance-critical.
82 */ 82 */
83 83
84 #define SPURIOUS_APIC_VECTOR 0xff 84 #define SPURIOUS_APIC_VECTOR 0xff
85 /* 85 /*
86 * Sanity check 86 * Sanity check
87 */ 87 */
88 #if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) 88 #if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F)
89 # error SPURIOUS_APIC_VECTOR definition error 89 # error SPURIOUS_APIC_VECTOR definition error
90 #endif 90 #endif
91 91
92 #define ERROR_APIC_VECTOR 0xfe 92 #define ERROR_APIC_VECTOR 0xfe
93 #define RESCHEDULE_VECTOR 0xfd 93 #define RESCHEDULE_VECTOR 0xfd
94 #define CALL_FUNCTION_VECTOR 0xfc 94 #define CALL_FUNCTION_VECTOR 0xfc
95 #define CALL_FUNCTION_SINGLE_VECTOR 0xfb 95 #define CALL_FUNCTION_SINGLE_VECTOR 0xfb
96 #define THERMAL_APIC_VECTOR 0xfa 96 #define THERMAL_APIC_VECTOR 0xfa
97 #define THRESHOLD_APIC_VECTOR 0xf9 97 #define THRESHOLD_APIC_VECTOR 0xf9
98 #define REBOOT_VECTOR 0xf8 98 #define REBOOT_VECTOR 0xf8
99 99
100 /* 100 /*
101 * Generic system vector for platform specific use 101 * Generic system vector for platform specific use
102 */ 102 */
103 #define X86_PLATFORM_IPI_VECTOR 0xf7 103 #define X86_PLATFORM_IPI_VECTOR 0xf7
104 104
105 /* 105 /*
106 * IRQ work vector: 106 * IRQ work vector:
107 */ 107 */
108 #define IRQ_WORK_VECTOR 0xf6 108 #define IRQ_WORK_VECTOR 0xf6
109 109
110 #define UV_BAU_MESSAGE 0xf5 110 #define UV_BAU_MESSAGE 0xf5
111 111
112 /* Xen vector callback to receive events in a HVM domain */ 112 /* Vector on which hypervisor callbacks will be delivered */
113 #define XEN_HVM_EVTCHN_CALLBACK 0xf3 113 #define HYPERVISOR_CALLBACK_VECTOR 0xf3
114 114
115 /* 115 /*
116 * Local APIC timer IRQ vector is on a different priority level, 116 * Local APIC timer IRQ vector is on a different priority level,
117 * to work around the 'lost local interrupt if more than 2 IRQ 117 * to work around the 'lost local interrupt if more than 2 IRQ
118 * sources per level' errata. 118 * sources per level' errata.
119 */ 119 */
120 #define LOCAL_TIMER_VECTOR 0xef 120 #define LOCAL_TIMER_VECTOR 0xef
121 121
122 #define NR_VECTORS 256 122 #define NR_VECTORS 256
123 123
124 #define FPU_IRQ 13 124 #define FPU_IRQ 13
125 125
126 #define FIRST_VM86_IRQ 3 126 #define FIRST_VM86_IRQ 3
127 #define LAST_VM86_IRQ 15 127 #define LAST_VM86_IRQ 15
128 128
129 #ifndef __ASSEMBLY__ 129 #ifndef __ASSEMBLY__
130 static inline int invalid_vm86_irq(int irq) 130 static inline int invalid_vm86_irq(int irq)
131 { 131 {
132 return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; 132 return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
133 } 133 }
134 #endif 134 #endif
135 135
136 /* 136 /*
137 * Size the maximum number of interrupts. 137 * Size the maximum number of interrupts.
138 * 138 *
139 * If the irq_desc[] array has a sparse layout, we can size things 139 * If the irq_desc[] array has a sparse layout, we can size things
140 * generously - it scales up linearly with the maximum number of CPUs, 140 * generously - it scales up linearly with the maximum number of CPUs,
141 * and the maximum number of IO-APICs, whichever is higher. 141 * and the maximum number of IO-APICs, whichever is higher.
142 * 142 *
143 * In other cases we size more conservatively, to not create too large 143 * In other cases we size more conservatively, to not create too large
144 * static arrays. 144 * static arrays.
145 */ 145 */
146 146
147 #define NR_IRQS_LEGACY 16 147 #define NR_IRQS_LEGACY 16
148 148
149 #define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) 149 #define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
150 150
151 #ifdef CONFIG_X86_IO_APIC 151 #ifdef CONFIG_X86_IO_APIC
152 # define CPU_VECTOR_LIMIT (64 * NR_CPUS) 152 # define CPU_VECTOR_LIMIT (64 * NR_CPUS)
153 # define NR_IRQS \ 153 # define NR_IRQS \
154 (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ 154 (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
155 (NR_VECTORS + CPU_VECTOR_LIMIT) : \ 155 (NR_VECTORS + CPU_VECTOR_LIMIT) : \
156 (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) 156 (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
157 #else /* !CONFIG_X86_IO_APIC: */ 157 #else /* !CONFIG_X86_IO_APIC: */
158 # define NR_IRQS NR_IRQS_LEGACY 158 # define NR_IRQS NR_IRQS_LEGACY
159 #endif 159 #endif
160 160
161 #endif /* _ASM_X86_IRQ_VECTORS_H */ 161 #endif /* _ASM_X86_IRQ_VECTORS_H */
162 162
arch/x86/include/asm/mshyperv.h
1 #ifndef _ASM_X86_MSHYPER_H 1 #ifndef _ASM_X86_MSHYPER_H
2 #define _ASM_X86_MSHYPER_H 2 #define _ASM_X86_MSHYPER_H
3 3
4 #include <linux/types.h> 4 #include <linux/types.h>
5 #include <asm/hyperv.h> 5 #include <asm/hyperv.h>
6 6
7 struct ms_hyperv_info { 7 struct ms_hyperv_info {
8 u32 features; 8 u32 features;
9 u32 hints; 9 u32 hints;
10 }; 10 };
11 11
12 extern struct ms_hyperv_info ms_hyperv; 12 extern struct ms_hyperv_info ms_hyperv;
13 13
14 void hyperv_callback_vector(void);
15 void hyperv_vector_handler(struct pt_regs *regs);
16 void hv_register_vmbus_handler(int irq, irq_handler_t handler);
17
14 #endif 18 #endif
15 19
arch/x86/kernel/cpu/mshyperv.c
1 /* 1 /*
2 * HyperV Detection code. 2 * HyperV Detection code.
3 * 3 *
4 * Copyright (C) 2010, Novell, Inc. 4 * Copyright (C) 2010, Novell, Inc.
5 * Author : K. Y. Srinivasan <ksrinivasan@novell.com> 5 * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
6 * 6 *
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by 8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; version 2 of the License. 9 * the Free Software Foundation; version 2 of the License.
10 * 10 *
11 */ 11 */
12 12
13 #include <linux/types.h> 13 #include <linux/types.h>
14 #include <linux/time.h> 14 #include <linux/time.h>
15 #include <linux/clocksource.h> 15 #include <linux/clocksource.h>
16 #include <linux/module.h> 16 #include <linux/module.h>
17 #include <linux/hardirq.h>
18 #include <linux/interrupt.h>
17 #include <asm/processor.h> 19 #include <asm/processor.h>
18 #include <asm/hypervisor.h> 20 #include <asm/hypervisor.h>
19 #include <asm/hyperv.h> 21 #include <asm/hyperv.h>
20 #include <asm/mshyperv.h> 22 #include <asm/mshyperv.h>
23 #include <asm/desc.h>
24 #include <asm/idle.h>
25 #include <asm/irq_regs.h>
21 26
22 struct ms_hyperv_info ms_hyperv; 27 struct ms_hyperv_info ms_hyperv;
23 EXPORT_SYMBOL_GPL(ms_hyperv); 28 EXPORT_SYMBOL_GPL(ms_hyperv);
24 29
25 static bool __init ms_hyperv_platform(void) 30 static bool __init ms_hyperv_platform(void)
26 { 31 {
27 u32 eax; 32 u32 eax;
28 u32 hyp_signature[3]; 33 u32 hyp_signature[3];
29 34
30 if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) 35 if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
31 return false; 36 return false;
32 37
33 /* 38 /*
34 * Xen emulates Hyper-V to support enlightened Windows. 39 * Xen emulates Hyper-V to support enlightened Windows.
35 * Check to see first if we are on a Xen Hypervisor. 40 * Check to see first if we are on a Xen Hypervisor.
36 */ 41 */
37 if (xen_cpuid_base()) 42 if (xen_cpuid_base())
38 return false; 43 return false;
39 44
40 cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, 45 cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
41 &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); 46 &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
42 47
43 return eax >= HYPERV_CPUID_MIN && 48 return eax >= HYPERV_CPUID_MIN &&
44 eax <= HYPERV_CPUID_MAX && 49 eax <= HYPERV_CPUID_MAX &&
45 !memcmp("Microsoft Hv", hyp_signature, 12); 50 !memcmp("Microsoft Hv", hyp_signature, 12);
46 } 51 }
47 52
48 static cycle_t read_hv_clock(struct clocksource *arg) 53 static cycle_t read_hv_clock(struct clocksource *arg)
49 { 54 {
50 cycle_t current_tick; 55 cycle_t current_tick;
51 /* 56 /*
52 * Read the partition counter to get the current tick count. This count 57 * Read the partition counter to get the current tick count. This count
53 * is set to 0 when the partition is created and is incremented in 58 * is set to 0 when the partition is created and is incremented in
54 * 100 nanosecond units. 59 * 100 nanosecond units.
55 */ 60 */
56 rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); 61 rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
57 return current_tick; 62 return current_tick;
58 } 63 }
59 64
60 static struct clocksource hyperv_cs = { 65 static struct clocksource hyperv_cs = {
61 .name = "hyperv_clocksource", 66 .name = "hyperv_clocksource",
62 .rating = 400, /* use this when running on Hyperv*/ 67 .rating = 400, /* use this when running on Hyperv*/
63 .read = read_hv_clock, 68 .read = read_hv_clock,
64 .mask = CLOCKSOURCE_MASK(64), 69 .mask = CLOCKSOURCE_MASK(64),
65 }; 70 };
66 71
67 static void __init ms_hyperv_init_platform(void) 72 static void __init ms_hyperv_init_platform(void)
68 { 73 {
69 /* 74 /*
70 * Extract the features and hints 75 * Extract the features and hints
71 */ 76 */
72 ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); 77 ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
73 ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); 78 ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
74 79
75 printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", 80 printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
76 ms_hyperv.features, ms_hyperv.hints); 81 ms_hyperv.features, ms_hyperv.hints);
77 82
78 if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) 83 if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
79 clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); 84 clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
85 #if IS_ENABLED(CONFIG_HYPERV)
86 /*
87 * Setup the IDT for hypervisor callback.
88 */
89 alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
90 #endif
80 } 91 }
81 92
82 const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { 93 const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
83 .name = "Microsoft HyperV", 94 .name = "Microsoft HyperV",
84 .detect = ms_hyperv_platform, 95 .detect = ms_hyperv_platform,
85 .init_platform = ms_hyperv_init_platform, 96 .init_platform = ms_hyperv_init_platform,
86 }; 97 };
87 EXPORT_SYMBOL(x86_hyper_ms_hyperv); 98 EXPORT_SYMBOL(x86_hyper_ms_hyperv);
99
100 #if IS_ENABLED(CONFIG_HYPERV)
101 static int vmbus_irq = -1;
102 static irq_handler_t vmbus_isr;
103
104 void hv_register_vmbus_handler(int irq, irq_handler_t handler)
105 {
106 vmbus_irq = irq;
107 vmbus_isr = handler;
108 }
109
110 void hyperv_vector_handler(struct pt_regs *regs)
111 {
112 struct pt_regs *old_regs = set_irq_regs(regs);
113 struct irq_desc *desc;
114
115 irq_enter();
116 exit_idle();
117
118 desc = irq_to_desc(vmbus_irq);
119
120 if (desc)
121 generic_handle_irq_desc(vmbus_irq, desc);
122
123 irq_exit();
124 set_irq_regs(old_regs);
125 }
126 #else
127 void hv_register_vmbus_handler(int irq, irq_handler_t handler)
128 {
129 }
130 #endif
131 EXPORT_SYMBOL_GPL(hv_register_vmbus_handler);
88 132
arch/x86/kernel/entry_32.S
1 /* 1 /*
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 */ 4 */
5 5
6 /* 6 /*
7 * entry.S contains the system-call and fault low-level handling routines. 7 * entry.S contains the system-call and fault low-level handling routines.
8 * This also contains the timer-interrupt handler, as well as all interrupts 8 * This also contains the timer-interrupt handler, as well as all interrupts
9 * and faults that can result in a task-switch. 9 * and faults that can result in a task-switch.
10 * 10 *
11 * NOTE: This code handles signal-recognition, which happens every time 11 * NOTE: This code handles signal-recognition, which happens every time
12 * after a timer-interrupt and after each system call. 12 * after a timer-interrupt and after each system call.
13 * 13 *
14 * I changed all the .align's to 4 (16 byte alignment), as that's faster 14 * I changed all the .align's to 4 (16 byte alignment), as that's faster
15 * on a 486. 15 * on a 486.
16 * 16 *
17 * Stack layout in 'syscall_exit': 17 * Stack layout in 'syscall_exit':
18 * ptrace needs to have all regs on the stack. 18 * ptrace needs to have all regs on the stack.
19 * if the order here is changed, it needs to be 19 * if the order here is changed, it needs to be
20 * updated in fork.c:copy_process, signal.c:do_signal, 20 * updated in fork.c:copy_process, signal.c:do_signal,
21 * ptrace.c and ptrace.h 21 * ptrace.c and ptrace.h
22 * 22 *
23 * 0(%esp) - %ebx 23 * 0(%esp) - %ebx
24 * 4(%esp) - %ecx 24 * 4(%esp) - %ecx
25 * 8(%esp) - %edx 25 * 8(%esp) - %edx
26 * C(%esp) - %esi 26 * C(%esp) - %esi
27 * 10(%esp) - %edi 27 * 10(%esp) - %edi
28 * 14(%esp) - %ebp 28 * 14(%esp) - %ebp
29 * 18(%esp) - %eax 29 * 18(%esp) - %eax
30 * 1C(%esp) - %ds 30 * 1C(%esp) - %ds
31 * 20(%esp) - %es 31 * 20(%esp) - %es
32 * 24(%esp) - %fs 32 * 24(%esp) - %fs
33 * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS 33 * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
34 * 2C(%esp) - orig_eax 34 * 2C(%esp) - orig_eax
35 * 30(%esp) - %eip 35 * 30(%esp) - %eip
36 * 34(%esp) - %cs 36 * 34(%esp) - %cs
37 * 38(%esp) - %eflags 37 * 38(%esp) - %eflags
38 * 3C(%esp) - %oldesp 38 * 3C(%esp) - %oldesp
39 * 40(%esp) - %oldss 39 * 40(%esp) - %oldss
40 * 40 *
41 * "current" is in register %ebx during any slow entries. 41 * "current" is in register %ebx during any slow entries.
42 */ 42 */
43 43
44 #include <linux/linkage.h> 44 #include <linux/linkage.h>
45 #include <linux/err.h> 45 #include <linux/err.h>
46 #include <asm/thread_info.h> 46 #include <asm/thread_info.h>
47 #include <asm/irqflags.h> 47 #include <asm/irqflags.h>
48 #include <asm/errno.h> 48 #include <asm/errno.h>
49 #include <asm/segment.h> 49 #include <asm/segment.h>
50 #include <asm/smp.h> 50 #include <asm/smp.h>
51 #include <asm/page_types.h> 51 #include <asm/page_types.h>
52 #include <asm/percpu.h> 52 #include <asm/percpu.h>
53 #include <asm/dwarf2.h> 53 #include <asm/dwarf2.h>
54 #include <asm/processor-flags.h> 54 #include <asm/processor-flags.h>
55 #include <asm/ftrace.h> 55 #include <asm/ftrace.h>
56 #include <asm/irq_vectors.h> 56 #include <asm/irq_vectors.h>
57 #include <asm/cpufeature.h> 57 #include <asm/cpufeature.h>
58 #include <asm/alternative-asm.h> 58 #include <asm/alternative-asm.h>
59 #include <asm/asm.h> 59 #include <asm/asm.h>
60 #include <asm/smap.h> 60 #include <asm/smap.h>
61 61
62 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 62 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
63 #include <linux/elf-em.h> 63 #include <linux/elf-em.h>
64 #define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) 64 #define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
65 #define __AUDIT_ARCH_LE 0x40000000 65 #define __AUDIT_ARCH_LE 0x40000000
66 66
67 #ifndef CONFIG_AUDITSYSCALL 67 #ifndef CONFIG_AUDITSYSCALL
68 #define sysenter_audit syscall_trace_entry 68 #define sysenter_audit syscall_trace_entry
69 #define sysexit_audit syscall_exit_work 69 #define sysexit_audit syscall_exit_work
70 #endif 70 #endif
71 71
72 .section .entry.text, "ax" 72 .section .entry.text, "ax"
73 73
74 /* 74 /*
75 * We use macros for low-level operations which need to be overridden 75 * We use macros for low-level operations which need to be overridden
76 * for paravirtualization. The following will never clobber any registers: 76 * for paravirtualization. The following will never clobber any registers:
77 * INTERRUPT_RETURN (aka. "iret") 77 * INTERRUPT_RETURN (aka. "iret")
78 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") 78 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
79 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). 79 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
80 * 80 *
81 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must 81 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
82 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). 82 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
83 * Allowing a register to be clobbered can shrink the paravirt replacement 83 * Allowing a register to be clobbered can shrink the paravirt replacement
84 * enough to patch inline, increasing performance. 84 * enough to patch inline, increasing performance.
85 */ 85 */
86 86
87 #ifdef CONFIG_PREEMPT 87 #ifdef CONFIG_PREEMPT
88 #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF 88 #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
89 #else 89 #else
90 #define preempt_stop(clobbers) 90 #define preempt_stop(clobbers)
91 #define resume_kernel restore_all 91 #define resume_kernel restore_all
92 #endif 92 #endif
93 93
94 .macro TRACE_IRQS_IRET 94 .macro TRACE_IRQS_IRET
95 #ifdef CONFIG_TRACE_IRQFLAGS 95 #ifdef CONFIG_TRACE_IRQFLAGS
96 testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? 96 testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
97 jz 1f 97 jz 1f
98 TRACE_IRQS_ON 98 TRACE_IRQS_ON
99 1: 99 1:
100 #endif 100 #endif
101 .endm 101 .endm
102 102
103 /* 103 /*
104 * User gs save/restore 104 * User gs save/restore
105 * 105 *
106 * %gs is used for userland TLS and kernel only uses it for stack 106 * %gs is used for userland TLS and kernel only uses it for stack
107 * canary which is required to be at %gs:20 by gcc. Read the comment 107 * canary which is required to be at %gs:20 by gcc. Read the comment
108 * at the top of stackprotector.h for more info. 108 * at the top of stackprotector.h for more info.
109 * 109 *
110 * Local labels 98 and 99 are used. 110 * Local labels 98 and 99 are used.
111 */ 111 */
112 #ifdef CONFIG_X86_32_LAZY_GS 112 #ifdef CONFIG_X86_32_LAZY_GS
113 113
114 /* unfortunately push/pop can't be no-op */ 114 /* unfortunately push/pop can't be no-op */
115 .macro PUSH_GS 115 .macro PUSH_GS
116 pushl_cfi $0 116 pushl_cfi $0
117 .endm 117 .endm
118 .macro POP_GS pop=0 118 .macro POP_GS pop=0
119 addl $(4 + \pop), %esp 119 addl $(4 + \pop), %esp
120 CFI_ADJUST_CFA_OFFSET -(4 + \pop) 120 CFI_ADJUST_CFA_OFFSET -(4 + \pop)
121 .endm 121 .endm
122 .macro POP_GS_EX 122 .macro POP_GS_EX
123 .endm 123 .endm
124 124
125 /* all the rest are no-op */ 125 /* all the rest are no-op */
126 .macro PTGS_TO_GS 126 .macro PTGS_TO_GS
127 .endm 127 .endm
128 .macro PTGS_TO_GS_EX 128 .macro PTGS_TO_GS_EX
129 .endm 129 .endm
130 .macro GS_TO_REG reg 130 .macro GS_TO_REG reg
131 .endm 131 .endm
132 .macro REG_TO_PTGS reg 132 .macro REG_TO_PTGS reg
133 .endm 133 .endm
134 .macro SET_KERNEL_GS reg 134 .macro SET_KERNEL_GS reg
135 .endm 135 .endm
136 136
137 #else /* CONFIG_X86_32_LAZY_GS */ 137 #else /* CONFIG_X86_32_LAZY_GS */
138 138
139 .macro PUSH_GS 139 .macro PUSH_GS
140 pushl_cfi %gs 140 pushl_cfi %gs
141 /*CFI_REL_OFFSET gs, 0*/ 141 /*CFI_REL_OFFSET gs, 0*/
142 .endm 142 .endm
143 143
144 .macro POP_GS pop=0 144 .macro POP_GS pop=0
145 98: popl_cfi %gs 145 98: popl_cfi %gs
146 /*CFI_RESTORE gs*/ 146 /*CFI_RESTORE gs*/
147 .if \pop <> 0 147 .if \pop <> 0
148 add $\pop, %esp 148 add $\pop, %esp
149 CFI_ADJUST_CFA_OFFSET -\pop 149 CFI_ADJUST_CFA_OFFSET -\pop
150 .endif 150 .endif
151 .endm 151 .endm
152 .macro POP_GS_EX 152 .macro POP_GS_EX
153 .pushsection .fixup, "ax" 153 .pushsection .fixup, "ax"
154 99: movl $0, (%esp) 154 99: movl $0, (%esp)
155 jmp 98b 155 jmp 98b
156 .popsection 156 .popsection
157 _ASM_EXTABLE(98b,99b) 157 _ASM_EXTABLE(98b,99b)
158 .endm 158 .endm
159 159
160 .macro PTGS_TO_GS 160 .macro PTGS_TO_GS
161 98: mov PT_GS(%esp), %gs 161 98: mov PT_GS(%esp), %gs
162 .endm 162 .endm
163 .macro PTGS_TO_GS_EX 163 .macro PTGS_TO_GS_EX
164 .pushsection .fixup, "ax" 164 .pushsection .fixup, "ax"
165 99: movl $0, PT_GS(%esp) 165 99: movl $0, PT_GS(%esp)
166 jmp 98b 166 jmp 98b
167 .popsection 167 .popsection
168 _ASM_EXTABLE(98b,99b) 168 _ASM_EXTABLE(98b,99b)
169 .endm 169 .endm
170 170
171 .macro GS_TO_REG reg 171 .macro GS_TO_REG reg
172 movl %gs, \reg 172 movl %gs, \reg
173 /*CFI_REGISTER gs, \reg*/ 173 /*CFI_REGISTER gs, \reg*/
174 .endm 174 .endm
175 .macro REG_TO_PTGS reg 175 .macro REG_TO_PTGS reg
176 movl \reg, PT_GS(%esp) 176 movl \reg, PT_GS(%esp)
177 /*CFI_REL_OFFSET gs, PT_GS*/ 177 /*CFI_REL_OFFSET gs, PT_GS*/
178 .endm 178 .endm
179 .macro SET_KERNEL_GS reg 179 .macro SET_KERNEL_GS reg
180 movl $(__KERNEL_STACK_CANARY), \reg 180 movl $(__KERNEL_STACK_CANARY), \reg
181 movl \reg, %gs 181 movl \reg, %gs
182 .endm 182 .endm
183 183
184 #endif /* CONFIG_X86_32_LAZY_GS */ 184 #endif /* CONFIG_X86_32_LAZY_GS */
185 185
186 .macro SAVE_ALL 186 .macro SAVE_ALL
187 cld 187 cld
188 PUSH_GS 188 PUSH_GS
189 pushl_cfi %fs 189 pushl_cfi %fs
190 /*CFI_REL_OFFSET fs, 0;*/ 190 /*CFI_REL_OFFSET fs, 0;*/
191 pushl_cfi %es 191 pushl_cfi %es
192 /*CFI_REL_OFFSET es, 0;*/ 192 /*CFI_REL_OFFSET es, 0;*/
193 pushl_cfi %ds 193 pushl_cfi %ds
194 /*CFI_REL_OFFSET ds, 0;*/ 194 /*CFI_REL_OFFSET ds, 0;*/
195 pushl_cfi %eax 195 pushl_cfi %eax
196 CFI_REL_OFFSET eax, 0 196 CFI_REL_OFFSET eax, 0
197 pushl_cfi %ebp 197 pushl_cfi %ebp
198 CFI_REL_OFFSET ebp, 0 198 CFI_REL_OFFSET ebp, 0
199 pushl_cfi %edi 199 pushl_cfi %edi
200 CFI_REL_OFFSET edi, 0 200 CFI_REL_OFFSET edi, 0
201 pushl_cfi %esi 201 pushl_cfi %esi
202 CFI_REL_OFFSET esi, 0 202 CFI_REL_OFFSET esi, 0
203 pushl_cfi %edx 203 pushl_cfi %edx
204 CFI_REL_OFFSET edx, 0 204 CFI_REL_OFFSET edx, 0
205 pushl_cfi %ecx 205 pushl_cfi %ecx
206 CFI_REL_OFFSET ecx, 0 206 CFI_REL_OFFSET ecx, 0
207 pushl_cfi %ebx 207 pushl_cfi %ebx
208 CFI_REL_OFFSET ebx, 0 208 CFI_REL_OFFSET ebx, 0
209 movl $(__USER_DS), %edx 209 movl $(__USER_DS), %edx
210 movl %edx, %ds 210 movl %edx, %ds
211 movl %edx, %es 211 movl %edx, %es
212 movl $(__KERNEL_PERCPU), %edx 212 movl $(__KERNEL_PERCPU), %edx
213 movl %edx, %fs 213 movl %edx, %fs
214 SET_KERNEL_GS %edx 214 SET_KERNEL_GS %edx
215 .endm 215 .endm
216 216
217 .macro RESTORE_INT_REGS 217 .macro RESTORE_INT_REGS
218 popl_cfi %ebx 218 popl_cfi %ebx
219 CFI_RESTORE ebx 219 CFI_RESTORE ebx
220 popl_cfi %ecx 220 popl_cfi %ecx
221 CFI_RESTORE ecx 221 CFI_RESTORE ecx
222 popl_cfi %edx 222 popl_cfi %edx
223 CFI_RESTORE edx 223 CFI_RESTORE edx
224 popl_cfi %esi 224 popl_cfi %esi
225 CFI_RESTORE esi 225 CFI_RESTORE esi
226 popl_cfi %edi 226 popl_cfi %edi
227 CFI_RESTORE edi 227 CFI_RESTORE edi
228 popl_cfi %ebp 228 popl_cfi %ebp
229 CFI_RESTORE ebp 229 CFI_RESTORE ebp
230 popl_cfi %eax 230 popl_cfi %eax
231 CFI_RESTORE eax 231 CFI_RESTORE eax
232 .endm 232 .endm
233 233
234 .macro RESTORE_REGS pop=0 234 .macro RESTORE_REGS pop=0
235 RESTORE_INT_REGS 235 RESTORE_INT_REGS
236 1: popl_cfi %ds 236 1: popl_cfi %ds
237 /*CFI_RESTORE ds;*/ 237 /*CFI_RESTORE ds;*/
238 2: popl_cfi %es 238 2: popl_cfi %es
239 /*CFI_RESTORE es;*/ 239 /*CFI_RESTORE es;*/
240 3: popl_cfi %fs 240 3: popl_cfi %fs
241 /*CFI_RESTORE fs;*/ 241 /*CFI_RESTORE fs;*/
242 POP_GS \pop 242 POP_GS \pop
243 .pushsection .fixup, "ax" 243 .pushsection .fixup, "ax"
244 4: movl $0, (%esp) 244 4: movl $0, (%esp)
245 jmp 1b 245 jmp 1b
246 5: movl $0, (%esp) 246 5: movl $0, (%esp)
247 jmp 2b 247 jmp 2b
248 6: movl $0, (%esp) 248 6: movl $0, (%esp)
249 jmp 3b 249 jmp 3b
250 .popsection 250 .popsection
251 _ASM_EXTABLE(1b,4b) 251 _ASM_EXTABLE(1b,4b)
252 _ASM_EXTABLE(2b,5b) 252 _ASM_EXTABLE(2b,5b)
253 _ASM_EXTABLE(3b,6b) 253 _ASM_EXTABLE(3b,6b)
254 POP_GS_EX 254 POP_GS_EX
255 .endm 255 .endm
256 256
257 .macro RING0_INT_FRAME 257 .macro RING0_INT_FRAME
258 CFI_STARTPROC simple 258 CFI_STARTPROC simple
259 CFI_SIGNAL_FRAME 259 CFI_SIGNAL_FRAME
260 CFI_DEF_CFA esp, 3*4 260 CFI_DEF_CFA esp, 3*4
261 /*CFI_OFFSET cs, -2*4;*/ 261 /*CFI_OFFSET cs, -2*4;*/
262 CFI_OFFSET eip, -3*4 262 CFI_OFFSET eip, -3*4
263 .endm 263 .endm
264 264
265 .macro RING0_EC_FRAME 265 .macro RING0_EC_FRAME
266 CFI_STARTPROC simple 266 CFI_STARTPROC simple
267 CFI_SIGNAL_FRAME 267 CFI_SIGNAL_FRAME
268 CFI_DEF_CFA esp, 4*4 268 CFI_DEF_CFA esp, 4*4
269 /*CFI_OFFSET cs, -2*4;*/ 269 /*CFI_OFFSET cs, -2*4;*/
270 CFI_OFFSET eip, -3*4 270 CFI_OFFSET eip, -3*4
271 .endm 271 .endm
272 272
273 .macro RING0_PTREGS_FRAME 273 .macro RING0_PTREGS_FRAME
274 CFI_STARTPROC simple 274 CFI_STARTPROC simple
275 CFI_SIGNAL_FRAME 275 CFI_SIGNAL_FRAME
276 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX 276 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
277 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ 277 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
278 CFI_OFFSET eip, PT_EIP-PT_OLDESP 278 CFI_OFFSET eip, PT_EIP-PT_OLDESP
279 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ 279 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
280 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ 280 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
281 CFI_OFFSET eax, PT_EAX-PT_OLDESP 281 CFI_OFFSET eax, PT_EAX-PT_OLDESP
282 CFI_OFFSET ebp, PT_EBP-PT_OLDESP 282 CFI_OFFSET ebp, PT_EBP-PT_OLDESP
283 CFI_OFFSET edi, PT_EDI-PT_OLDESP 283 CFI_OFFSET edi, PT_EDI-PT_OLDESP
284 CFI_OFFSET esi, PT_ESI-PT_OLDESP 284 CFI_OFFSET esi, PT_ESI-PT_OLDESP
285 CFI_OFFSET edx, PT_EDX-PT_OLDESP 285 CFI_OFFSET edx, PT_EDX-PT_OLDESP
286 CFI_OFFSET ecx, PT_ECX-PT_OLDESP 286 CFI_OFFSET ecx, PT_ECX-PT_OLDESP
287 CFI_OFFSET ebx, PT_EBX-PT_OLDESP 287 CFI_OFFSET ebx, PT_EBX-PT_OLDESP
288 .endm 288 .endm
289 289
290 ENTRY(ret_from_fork) 290 ENTRY(ret_from_fork)
291 CFI_STARTPROC 291 CFI_STARTPROC
292 pushl_cfi %eax 292 pushl_cfi %eax
293 call schedule_tail 293 call schedule_tail
294 GET_THREAD_INFO(%ebp) 294 GET_THREAD_INFO(%ebp)
295 popl_cfi %eax 295 popl_cfi %eax
296 pushl_cfi $0x0202 # Reset kernel eflags 296 pushl_cfi $0x0202 # Reset kernel eflags
297 popfl_cfi 297 popfl_cfi
298 jmp syscall_exit 298 jmp syscall_exit
299 CFI_ENDPROC 299 CFI_ENDPROC
300 END(ret_from_fork) 300 END(ret_from_fork)
301 301
302 ENTRY(ret_from_kernel_thread) 302 ENTRY(ret_from_kernel_thread)
303 CFI_STARTPROC 303 CFI_STARTPROC
304 pushl_cfi %eax 304 pushl_cfi %eax
305 call schedule_tail 305 call schedule_tail
306 GET_THREAD_INFO(%ebp) 306 GET_THREAD_INFO(%ebp)
307 popl_cfi %eax 307 popl_cfi %eax
308 pushl_cfi $0x0202 # Reset kernel eflags 308 pushl_cfi $0x0202 # Reset kernel eflags
309 popfl_cfi 309 popfl_cfi
310 movl PT_EBP(%esp),%eax 310 movl PT_EBP(%esp),%eax
311 call *PT_EBX(%esp) 311 call *PT_EBX(%esp)
312 movl $0,PT_EAX(%esp) 312 movl $0,PT_EAX(%esp)
313 jmp syscall_exit 313 jmp syscall_exit
314 CFI_ENDPROC 314 CFI_ENDPROC
315 ENDPROC(ret_from_kernel_thread) 315 ENDPROC(ret_from_kernel_thread)
316 316
317 /* 317 /*
318 * Interrupt exit functions should be protected against kprobes 318 * Interrupt exit functions should be protected against kprobes
319 */ 319 */
320 .pushsection .kprobes.text, "ax" 320 .pushsection .kprobes.text, "ax"
321 /* 321 /*
322 * Return to user mode is not as complex as all this looks, 322 * Return to user mode is not as complex as all this looks,
323 * but we want the default path for a system call return to 323 * but we want the default path for a system call return to
324 * go as quickly as possible which is why some of this is 324 * go as quickly as possible which is why some of this is
325 * less clear than it otherwise should be. 325 * less clear than it otherwise should be.
326 */ 326 */
327 327
328 # userspace resumption stub bypassing syscall exit tracing 328 # userspace resumption stub bypassing syscall exit tracing
329 ALIGN 329 ALIGN
330 RING0_PTREGS_FRAME 330 RING0_PTREGS_FRAME
331 ret_from_exception: 331 ret_from_exception:
332 preempt_stop(CLBR_ANY) 332 preempt_stop(CLBR_ANY)
333 ret_from_intr: 333 ret_from_intr:
334 GET_THREAD_INFO(%ebp) 334 GET_THREAD_INFO(%ebp)
335 #ifdef CONFIG_VM86 335 #ifdef CONFIG_VM86
336 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS 336 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
337 movb PT_CS(%esp), %al 337 movb PT_CS(%esp), %al
338 andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax 338 andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
339 #else 339 #else
340 /* 340 /*
341 * We can be coming here from child spawned by kernel_thread(). 341 * We can be coming here from child spawned by kernel_thread().
342 */ 342 */
343 movl PT_CS(%esp), %eax 343 movl PT_CS(%esp), %eax
344 andl $SEGMENT_RPL_MASK, %eax 344 andl $SEGMENT_RPL_MASK, %eax
345 #endif 345 #endif
346 cmpl $USER_RPL, %eax 346 cmpl $USER_RPL, %eax
347 jb resume_kernel # not returning to v8086 or userspace 347 jb resume_kernel # not returning to v8086 or userspace
348 348
349 ENTRY(resume_userspace) 349 ENTRY(resume_userspace)
350 LOCKDEP_SYS_EXIT 350 LOCKDEP_SYS_EXIT
351 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt 351 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
352 # setting need_resched or sigpending 352 # setting need_resched or sigpending
353 # between sampling and the iret 353 # between sampling and the iret
354 TRACE_IRQS_OFF 354 TRACE_IRQS_OFF
355 movl TI_flags(%ebp), %ecx 355 movl TI_flags(%ebp), %ecx
356 andl $_TIF_WORK_MASK, %ecx # is there any work to be done on 356 andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
357 # int/exception return? 357 # int/exception return?
358 jne work_pending 358 jne work_pending
359 jmp restore_all 359 jmp restore_all
360 END(ret_from_exception) 360 END(ret_from_exception)
361 361
362 #ifdef CONFIG_PREEMPT 362 #ifdef CONFIG_PREEMPT
363 ENTRY(resume_kernel) 363 ENTRY(resume_kernel)
364 DISABLE_INTERRUPTS(CLBR_ANY) 364 DISABLE_INTERRUPTS(CLBR_ANY)
365 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 365 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
366 jnz restore_all 366 jnz restore_all
367 need_resched: 367 need_resched:
368 movl TI_flags(%ebp), %ecx # need_resched set ? 368 movl TI_flags(%ebp), %ecx # need_resched set ?
369 testb $_TIF_NEED_RESCHED, %cl 369 testb $_TIF_NEED_RESCHED, %cl
370 jz restore_all 370 jz restore_all
371 testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? 371 testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
372 jz restore_all 372 jz restore_all
373 call preempt_schedule_irq 373 call preempt_schedule_irq
374 jmp need_resched 374 jmp need_resched
375 END(resume_kernel) 375 END(resume_kernel)
376 #endif 376 #endif
377 CFI_ENDPROC 377 CFI_ENDPROC
378 /* 378 /*
379 * End of kprobes section 379 * End of kprobes section
380 */ 380 */
381 .popsection 381 .popsection
382 382
383 /* SYSENTER_RETURN points to after the "sysenter" instruction in 383 /* SYSENTER_RETURN points to after the "sysenter" instruction in
384 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ 384 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
385 385
386 # sysenter call handler stub 386 # sysenter call handler stub
387 ENTRY(ia32_sysenter_target) 387 ENTRY(ia32_sysenter_target)
388 CFI_STARTPROC simple 388 CFI_STARTPROC simple
389 CFI_SIGNAL_FRAME 389 CFI_SIGNAL_FRAME
390 CFI_DEF_CFA esp, 0 390 CFI_DEF_CFA esp, 0
391 CFI_REGISTER esp, ebp 391 CFI_REGISTER esp, ebp
392 movl TSS_sysenter_sp0(%esp),%esp 392 movl TSS_sysenter_sp0(%esp),%esp
393 sysenter_past_esp: 393 sysenter_past_esp:
394 /* 394 /*
395 * Interrupts are disabled here, but we can't trace it until 395 * Interrupts are disabled here, but we can't trace it until
396 * enough kernel state to call TRACE_IRQS_OFF can be called - but 396 * enough kernel state to call TRACE_IRQS_OFF can be called - but
397 * we immediately enable interrupts at that point anyway. 397 * we immediately enable interrupts at that point anyway.
398 */ 398 */
399 pushl_cfi $__USER_DS 399 pushl_cfi $__USER_DS
400 /*CFI_REL_OFFSET ss, 0*/ 400 /*CFI_REL_OFFSET ss, 0*/
401 pushl_cfi %ebp 401 pushl_cfi %ebp
402 CFI_REL_OFFSET esp, 0 402 CFI_REL_OFFSET esp, 0
403 pushfl_cfi 403 pushfl_cfi
404 orl $X86_EFLAGS_IF, (%esp) 404 orl $X86_EFLAGS_IF, (%esp)
405 pushl_cfi $__USER_CS 405 pushl_cfi $__USER_CS
406 /*CFI_REL_OFFSET cs, 0*/ 406 /*CFI_REL_OFFSET cs, 0*/
407 /* 407 /*
408 * Push current_thread_info()->sysenter_return to the stack. 408 * Push current_thread_info()->sysenter_return to the stack.
409 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 409 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
410 * pushed above; +8 corresponds to copy_thread's esp0 setting. 410 * pushed above; +8 corresponds to copy_thread's esp0 setting.
411 */ 411 */
412 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) 412 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
413 CFI_REL_OFFSET eip, 0 413 CFI_REL_OFFSET eip, 0
414 414
415 pushl_cfi %eax 415 pushl_cfi %eax
416 SAVE_ALL 416 SAVE_ALL
417 ENABLE_INTERRUPTS(CLBR_NONE) 417 ENABLE_INTERRUPTS(CLBR_NONE)
418 418
419 /* 419 /*
420 * Load the potential sixth argument from user stack. 420 * Load the potential sixth argument from user stack.
421 * Careful about security. 421 * Careful about security.
422 */ 422 */
423 cmpl $__PAGE_OFFSET-3,%ebp 423 cmpl $__PAGE_OFFSET-3,%ebp
424 jae syscall_fault 424 jae syscall_fault
425 ASM_STAC 425 ASM_STAC
426 1: movl (%ebp),%ebp 426 1: movl (%ebp),%ebp
427 ASM_CLAC 427 ASM_CLAC
428 movl %ebp,PT_EBP(%esp) 428 movl %ebp,PT_EBP(%esp)
429 _ASM_EXTABLE(1b,syscall_fault) 429 _ASM_EXTABLE(1b,syscall_fault)
430 430
431 GET_THREAD_INFO(%ebp) 431 GET_THREAD_INFO(%ebp)
432 432
433 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) 433 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
434 jnz sysenter_audit 434 jnz sysenter_audit
435 sysenter_do_call: 435 sysenter_do_call:
436 cmpl $(NR_syscalls), %eax 436 cmpl $(NR_syscalls), %eax
437 jae syscall_badsys 437 jae syscall_badsys
438 call *sys_call_table(,%eax,4) 438 call *sys_call_table(,%eax,4)
439 movl %eax,PT_EAX(%esp) 439 movl %eax,PT_EAX(%esp)
440 LOCKDEP_SYS_EXIT 440 LOCKDEP_SYS_EXIT
441 DISABLE_INTERRUPTS(CLBR_ANY) 441 DISABLE_INTERRUPTS(CLBR_ANY)
442 TRACE_IRQS_OFF 442 TRACE_IRQS_OFF
443 movl TI_flags(%ebp), %ecx 443 movl TI_flags(%ebp), %ecx
444 testl $_TIF_ALLWORK_MASK, %ecx 444 testl $_TIF_ALLWORK_MASK, %ecx
445 jne sysexit_audit 445 jne sysexit_audit
446 sysenter_exit: 446 sysenter_exit:
447 /* if something modifies registers it must also disable sysexit */ 447 /* if something modifies registers it must also disable sysexit */
448 movl PT_EIP(%esp), %edx 448 movl PT_EIP(%esp), %edx
449 movl PT_OLDESP(%esp), %ecx 449 movl PT_OLDESP(%esp), %ecx
450 xorl %ebp,%ebp 450 xorl %ebp,%ebp
451 TRACE_IRQS_ON 451 TRACE_IRQS_ON
452 1: mov PT_FS(%esp), %fs 452 1: mov PT_FS(%esp), %fs
453 PTGS_TO_GS 453 PTGS_TO_GS
454 ENABLE_INTERRUPTS_SYSEXIT 454 ENABLE_INTERRUPTS_SYSEXIT
455 455
456 #ifdef CONFIG_AUDITSYSCALL 456 #ifdef CONFIG_AUDITSYSCALL
457 sysenter_audit: 457 sysenter_audit:
458 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 458 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
459 jnz syscall_trace_entry 459 jnz syscall_trace_entry
460 addl $4,%esp 460 addl $4,%esp
461 CFI_ADJUST_CFA_OFFSET -4 461 CFI_ADJUST_CFA_OFFSET -4
462 /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ 462 /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
463 /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ 463 /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
464 /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ 464 /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
465 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ 465 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
466 movl %eax,%edx /* 2nd arg: syscall number */ 466 movl %eax,%edx /* 2nd arg: syscall number */
467 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ 467 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
468 call __audit_syscall_entry 468 call __audit_syscall_entry
469 pushl_cfi %ebx 469 pushl_cfi %ebx
470 movl PT_EAX(%esp),%eax /* reload syscall number */ 470 movl PT_EAX(%esp),%eax /* reload syscall number */
471 jmp sysenter_do_call 471 jmp sysenter_do_call
472 472
473 sysexit_audit: 473 sysexit_audit:
474 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx 474 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
475 jne syscall_exit_work 475 jne syscall_exit_work
476 TRACE_IRQS_ON 476 TRACE_IRQS_ON
477 ENABLE_INTERRUPTS(CLBR_ANY) 477 ENABLE_INTERRUPTS(CLBR_ANY)
478 movl %eax,%edx /* second arg, syscall return value */ 478 movl %eax,%edx /* second arg, syscall return value */
479 cmpl $-MAX_ERRNO,%eax /* is it an error ? */ 479 cmpl $-MAX_ERRNO,%eax /* is it an error ? */
480 setbe %al /* 1 if so, 0 if not */ 480 setbe %al /* 1 if so, 0 if not */
481 movzbl %al,%eax /* zero-extend that */ 481 movzbl %al,%eax /* zero-extend that */
482 call __audit_syscall_exit 482 call __audit_syscall_exit
483 DISABLE_INTERRUPTS(CLBR_ANY) 483 DISABLE_INTERRUPTS(CLBR_ANY)
484 TRACE_IRQS_OFF 484 TRACE_IRQS_OFF
485 movl TI_flags(%ebp), %ecx 485 movl TI_flags(%ebp), %ecx
486 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx 486 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
487 jne syscall_exit_work 487 jne syscall_exit_work
488 movl PT_EAX(%esp),%eax /* reload syscall return value */ 488 movl PT_EAX(%esp),%eax /* reload syscall return value */
489 jmp sysenter_exit 489 jmp sysenter_exit
490 #endif 490 #endif
491 491
492 CFI_ENDPROC 492 CFI_ENDPROC
493 .pushsection .fixup,"ax" 493 .pushsection .fixup,"ax"
494 2: movl $0,PT_FS(%esp) 494 2: movl $0,PT_FS(%esp)
495 jmp 1b 495 jmp 1b
496 .popsection 496 .popsection
497 _ASM_EXTABLE(1b,2b) 497 _ASM_EXTABLE(1b,2b)
498 PTGS_TO_GS_EX 498 PTGS_TO_GS_EX
499 ENDPROC(ia32_sysenter_target) 499 ENDPROC(ia32_sysenter_target)
500 500
501 /* 501 /*
502 * syscall stub including irq exit should be protected against kprobes 502 * syscall stub including irq exit should be protected against kprobes
503 */ 503 */
504 .pushsection .kprobes.text, "ax" 504 .pushsection .kprobes.text, "ax"
505 # system call handler stub 505 # system call handler stub
506 ENTRY(system_call) 506 ENTRY(system_call)
507 RING0_INT_FRAME # can't unwind into user space anyway 507 RING0_INT_FRAME # can't unwind into user space anyway
508 ASM_CLAC 508 ASM_CLAC
509 pushl_cfi %eax # save orig_eax 509 pushl_cfi %eax # save orig_eax
510 SAVE_ALL 510 SAVE_ALL
511 GET_THREAD_INFO(%ebp) 511 GET_THREAD_INFO(%ebp)
512 # system call tracing in operation / emulation 512 # system call tracing in operation / emulation
513 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) 513 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
514 jnz syscall_trace_entry 514 jnz syscall_trace_entry
515 cmpl $(NR_syscalls), %eax 515 cmpl $(NR_syscalls), %eax
516 jae syscall_badsys 516 jae syscall_badsys
517 syscall_call: 517 syscall_call:
518 call *sys_call_table(,%eax,4) 518 call *sys_call_table(,%eax,4)
519 movl %eax,PT_EAX(%esp) # store the return value 519 movl %eax,PT_EAX(%esp) # store the return value
520 syscall_exit: 520 syscall_exit:
521 LOCKDEP_SYS_EXIT 521 LOCKDEP_SYS_EXIT
522 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt 522 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
523 # setting need_resched or sigpending 523 # setting need_resched or sigpending
524 # between sampling and the iret 524 # between sampling and the iret
525 TRACE_IRQS_OFF 525 TRACE_IRQS_OFF
526 movl TI_flags(%ebp), %ecx 526 movl TI_flags(%ebp), %ecx
527 testl $_TIF_ALLWORK_MASK, %ecx # current->work 527 testl $_TIF_ALLWORK_MASK, %ecx # current->work
528 jne syscall_exit_work 528 jne syscall_exit_work
529 529
530 restore_all: 530 restore_all:
531 TRACE_IRQS_IRET 531 TRACE_IRQS_IRET
532 restore_all_notrace: 532 restore_all_notrace:
533 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS 533 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
534 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we 534 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
535 # are returning to the kernel. 535 # are returning to the kernel.
536 # See comments in process.c:copy_thread() for details. 536 # See comments in process.c:copy_thread() for details.
537 movb PT_OLDSS(%esp), %ah 537 movb PT_OLDSS(%esp), %ah
538 movb PT_CS(%esp), %al 538 movb PT_CS(%esp), %al
539 andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax 539 andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
540 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax 540 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
541 CFI_REMEMBER_STATE 541 CFI_REMEMBER_STATE
542 je ldt_ss # returning to user-space with LDT SS 542 je ldt_ss # returning to user-space with LDT SS
543 restore_nocheck: 543 restore_nocheck:
544 RESTORE_REGS 4 # skip orig_eax/error_code 544 RESTORE_REGS 4 # skip orig_eax/error_code
545 irq_return: 545 irq_return:
546 INTERRUPT_RETURN 546 INTERRUPT_RETURN
547 .section .fixup,"ax" 547 .section .fixup,"ax"
548 ENTRY(iret_exc) 548 ENTRY(iret_exc)
549 pushl $0 # no error code 549 pushl $0 # no error code
550 pushl $do_iret_error 550 pushl $do_iret_error
551 jmp error_code 551 jmp error_code
552 .previous 552 .previous
553 _ASM_EXTABLE(irq_return,iret_exc) 553 _ASM_EXTABLE(irq_return,iret_exc)
554 554
555 CFI_RESTORE_STATE 555 CFI_RESTORE_STATE
556 ldt_ss: 556 ldt_ss:
557 larl PT_OLDSS(%esp), %eax 557 larl PT_OLDSS(%esp), %eax
558 jnz restore_nocheck 558 jnz restore_nocheck
559 testl $0x00400000, %eax # returning to 32bit stack? 559 testl $0x00400000, %eax # returning to 32bit stack?
560 jnz restore_nocheck # allright, normal return 560 jnz restore_nocheck # allright, normal return
561 561
562 #ifdef CONFIG_PARAVIRT 562 #ifdef CONFIG_PARAVIRT
563 /* 563 /*
564 * The kernel can't run on a non-flat stack if paravirt mode 564 * The kernel can't run on a non-flat stack if paravirt mode
565 * is active. Rather than try to fixup the high bits of 565 * is active. Rather than try to fixup the high bits of
566 * ESP, bypass this code entirely. This may break DOSemu 566 * ESP, bypass this code entirely. This may break DOSemu
567 * and/or Wine support in a paravirt VM, although the option 567 * and/or Wine support in a paravirt VM, although the option
568 * is still available to implement the setting of the high 568 * is still available to implement the setting of the high
569 * 16-bits in the INTERRUPT_RETURN paravirt-op. 569 * 16-bits in the INTERRUPT_RETURN paravirt-op.
570 */ 570 */
571 cmpl $0, pv_info+PARAVIRT_enabled 571 cmpl $0, pv_info+PARAVIRT_enabled
572 jne restore_nocheck 572 jne restore_nocheck
573 #endif 573 #endif
574 574
575 /* 575 /*
576 * Setup and switch to ESPFIX stack 576 * Setup and switch to ESPFIX stack
577 * 577 *
578 * We're returning to userspace with a 16 bit stack. The CPU will not 578 * We're returning to userspace with a 16 bit stack. The CPU will not
579 * restore the high word of ESP for us on executing iret... This is an 579 * restore the high word of ESP for us on executing iret... This is an
580 * "official" bug of all the x86-compatible CPUs, which we can work 580 * "official" bug of all the x86-compatible CPUs, which we can work
581 * around to make dosemu and wine happy. We do this by preloading the 581 * around to make dosemu and wine happy. We do this by preloading the
582 * high word of ESP with the high word of the userspace ESP while 582 * high word of ESP with the high word of the userspace ESP while
583 * compensating for the offset by changing to the ESPFIX segment with 583 * compensating for the offset by changing to the ESPFIX segment with
584 * a base address that matches for the difference. 584 * a base address that matches for the difference.
585 */ 585 */
586 #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) 586 #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
587 mov %esp, %edx /* load kernel esp */ 587 mov %esp, %edx /* load kernel esp */
588 mov PT_OLDESP(%esp), %eax /* load userspace esp */ 588 mov PT_OLDESP(%esp), %eax /* load userspace esp */
589 mov %dx, %ax /* eax: new kernel esp */ 589 mov %dx, %ax /* eax: new kernel esp */
590 sub %eax, %edx /* offset (low word is 0) */ 590 sub %eax, %edx /* offset (low word is 0) */
591 shr $16, %edx 591 shr $16, %edx
592 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ 592 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
593 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ 593 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
594 pushl_cfi $__ESPFIX_SS 594 pushl_cfi $__ESPFIX_SS
595 pushl_cfi %eax /* new kernel esp */ 595 pushl_cfi %eax /* new kernel esp */
596 /* Disable interrupts, but do not irqtrace this section: we 596 /* Disable interrupts, but do not irqtrace this section: we
597 * will soon execute iret and the tracer was already set to 597 * will soon execute iret and the tracer was already set to
598 * the irqstate after the iret */ 598 * the irqstate after the iret */
599 DISABLE_INTERRUPTS(CLBR_EAX) 599 DISABLE_INTERRUPTS(CLBR_EAX)
600 lss (%esp), %esp /* switch to espfix segment */ 600 lss (%esp), %esp /* switch to espfix segment */
601 CFI_ADJUST_CFA_OFFSET -8 601 CFI_ADJUST_CFA_OFFSET -8
602 jmp restore_nocheck 602 jmp restore_nocheck
603 CFI_ENDPROC 603 CFI_ENDPROC
604 ENDPROC(system_call) 604 ENDPROC(system_call)
605 605
606 # perform work that needs to be done immediately before resumption 606 # perform work that needs to be done immediately before resumption
607 ALIGN 607 ALIGN
608 RING0_PTREGS_FRAME # can't unwind into user space anyway 608 RING0_PTREGS_FRAME # can't unwind into user space anyway
609 work_pending: 609 work_pending:
610 testb $_TIF_NEED_RESCHED, %cl 610 testb $_TIF_NEED_RESCHED, %cl
611 jz work_notifysig 611 jz work_notifysig
612 work_resched: 612 work_resched:
613 call schedule 613 call schedule
614 LOCKDEP_SYS_EXIT 614 LOCKDEP_SYS_EXIT
615 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt 615 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
616 # setting need_resched or sigpending 616 # setting need_resched or sigpending
617 # between sampling and the iret 617 # between sampling and the iret
618 TRACE_IRQS_OFF 618 TRACE_IRQS_OFF
619 movl TI_flags(%ebp), %ecx 619 movl TI_flags(%ebp), %ecx
620 andl $_TIF_WORK_MASK, %ecx # is there any work to be done other 620 andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
621 # than syscall tracing? 621 # than syscall tracing?
622 jz restore_all 622 jz restore_all
623 testb $_TIF_NEED_RESCHED, %cl 623 testb $_TIF_NEED_RESCHED, %cl
624 jnz work_resched 624 jnz work_resched
625 625
626 work_notifysig: # deal with pending signals and 626 work_notifysig: # deal with pending signals and
627 # notify-resume requests 627 # notify-resume requests
628 #ifdef CONFIG_VM86 628 #ifdef CONFIG_VM86
629 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) 629 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
630 movl %esp, %eax 630 movl %esp, %eax
631 jne work_notifysig_v86 # returning to kernel-space or 631 jne work_notifysig_v86 # returning to kernel-space or
632 # vm86-space 632 # vm86-space
633 1: 633 1:
634 #else 634 #else
635 movl %esp, %eax 635 movl %esp, %eax
636 #endif 636 #endif
637 TRACE_IRQS_ON 637 TRACE_IRQS_ON
638 ENABLE_INTERRUPTS(CLBR_NONE) 638 ENABLE_INTERRUPTS(CLBR_NONE)
639 movb PT_CS(%esp), %bl 639 movb PT_CS(%esp), %bl
640 andb $SEGMENT_RPL_MASK, %bl 640 andb $SEGMENT_RPL_MASK, %bl
641 cmpb $USER_RPL, %bl 641 cmpb $USER_RPL, %bl
642 jb resume_kernel 642 jb resume_kernel
643 xorl %edx, %edx 643 xorl %edx, %edx
644 call do_notify_resume 644 call do_notify_resume
645 jmp resume_userspace 645 jmp resume_userspace
646 646
647 #ifdef CONFIG_VM86 647 #ifdef CONFIG_VM86
648 ALIGN 648 ALIGN
649 work_notifysig_v86: 649 work_notifysig_v86:
650 pushl_cfi %ecx # save ti_flags for do_notify_resume 650 pushl_cfi %ecx # save ti_flags for do_notify_resume
651 call save_v86_state # %eax contains pt_regs pointer 651 call save_v86_state # %eax contains pt_regs pointer
652 popl_cfi %ecx 652 popl_cfi %ecx
653 movl %eax, %esp 653 movl %eax, %esp
654 jmp 1b 654 jmp 1b
655 #endif 655 #endif
656 END(work_pending) 656 END(work_pending)
657 657
658 # perform syscall exit tracing 658 # perform syscall exit tracing
659 ALIGN 659 ALIGN
660 syscall_trace_entry: 660 syscall_trace_entry:
661 movl $-ENOSYS,PT_EAX(%esp) 661 movl $-ENOSYS,PT_EAX(%esp)
662 movl %esp, %eax 662 movl %esp, %eax
663 call syscall_trace_enter 663 call syscall_trace_enter
664 /* What it returned is what we'll actually use. */ 664 /* What it returned is what we'll actually use. */
665 cmpl $(NR_syscalls), %eax 665 cmpl $(NR_syscalls), %eax
666 jnae syscall_call 666 jnae syscall_call
667 jmp syscall_exit 667 jmp syscall_exit
668 END(syscall_trace_entry) 668 END(syscall_trace_entry)
669 669
670 # perform syscall exit tracing 670 # perform syscall exit tracing
671 ALIGN 671 ALIGN
672 syscall_exit_work: 672 syscall_exit_work:
673 testl $_TIF_WORK_SYSCALL_EXIT, %ecx 673 testl $_TIF_WORK_SYSCALL_EXIT, %ecx
674 jz work_pending 674 jz work_pending
675 TRACE_IRQS_ON 675 TRACE_IRQS_ON
676 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call 676 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
677 # schedule() instead 677 # schedule() instead
678 movl %esp, %eax 678 movl %esp, %eax
679 call syscall_trace_leave 679 call syscall_trace_leave
680 jmp resume_userspace 680 jmp resume_userspace
681 END(syscall_exit_work) 681 END(syscall_exit_work)
682 CFI_ENDPROC 682 CFI_ENDPROC
683 683
684 RING0_INT_FRAME # can't unwind into user space anyway 684 RING0_INT_FRAME # can't unwind into user space anyway
685 syscall_fault: 685 syscall_fault:
686 ASM_CLAC 686 ASM_CLAC
687 GET_THREAD_INFO(%ebp) 687 GET_THREAD_INFO(%ebp)
688 movl $-EFAULT,PT_EAX(%esp) 688 movl $-EFAULT,PT_EAX(%esp)
689 jmp resume_userspace 689 jmp resume_userspace
690 END(syscall_fault) 690 END(syscall_fault)
691 691
692 syscall_badsys: 692 syscall_badsys:
693 movl $-ENOSYS,PT_EAX(%esp) 693 movl $-ENOSYS,PT_EAX(%esp)
694 jmp resume_userspace 694 jmp resume_userspace
695 END(syscall_badsys) 695 END(syscall_badsys)
696 CFI_ENDPROC 696 CFI_ENDPROC
697 /* 697 /*
698 * End of kprobes section 698 * End of kprobes section
699 */ 699 */
700 .popsection 700 .popsection
701 701
702 /* 702 /*
703 * System calls that need a pt_regs pointer. 703 * System calls that need a pt_regs pointer.
704 */ 704 */
705 #define PTREGSCALL0(name) \ 705 #define PTREGSCALL0(name) \
706 ENTRY(ptregs_##name) ; \ 706 ENTRY(ptregs_##name) ; \
707 leal 4(%esp),%eax; \ 707 leal 4(%esp),%eax; \
708 jmp sys_##name; \ 708 jmp sys_##name; \
709 ENDPROC(ptregs_##name) 709 ENDPROC(ptregs_##name)
710 710
711 #define PTREGSCALL1(name) \ 711 #define PTREGSCALL1(name) \
712 ENTRY(ptregs_##name) ; \ 712 ENTRY(ptregs_##name) ; \
713 leal 4(%esp),%edx; \ 713 leal 4(%esp),%edx; \
714 movl (PT_EBX+4)(%esp),%eax; \ 714 movl (PT_EBX+4)(%esp),%eax; \
715 jmp sys_##name; \ 715 jmp sys_##name; \
716 ENDPROC(ptregs_##name) 716 ENDPROC(ptregs_##name)
717 717
718 #define PTREGSCALL2(name) \ 718 #define PTREGSCALL2(name) \
719 ENTRY(ptregs_##name) ; \ 719 ENTRY(ptregs_##name) ; \
720 leal 4(%esp),%ecx; \ 720 leal 4(%esp),%ecx; \
721 movl (PT_ECX+4)(%esp),%edx; \ 721 movl (PT_ECX+4)(%esp),%edx; \
722 movl (PT_EBX+4)(%esp),%eax; \ 722 movl (PT_EBX+4)(%esp),%eax; \
723 jmp sys_##name; \ 723 jmp sys_##name; \
724 ENDPROC(ptregs_##name) 724 ENDPROC(ptregs_##name)
725 725
726 #define PTREGSCALL3(name) \ 726 #define PTREGSCALL3(name) \
727 ENTRY(ptregs_##name) ; \ 727 ENTRY(ptregs_##name) ; \
728 CFI_STARTPROC; \ 728 CFI_STARTPROC; \
729 leal 4(%esp),%eax; \ 729 leal 4(%esp),%eax; \
730 pushl_cfi %eax; \ 730 pushl_cfi %eax; \
731 movl PT_EDX(%eax),%ecx; \ 731 movl PT_EDX(%eax),%ecx; \
732 movl PT_ECX(%eax),%edx; \ 732 movl PT_ECX(%eax),%edx; \
733 movl PT_EBX(%eax),%eax; \ 733 movl PT_EBX(%eax),%eax; \
734 call sys_##name; \ 734 call sys_##name; \
735 addl $4,%esp; \ 735 addl $4,%esp; \
736 CFI_ADJUST_CFA_OFFSET -4; \ 736 CFI_ADJUST_CFA_OFFSET -4; \
737 ret; \ 737 ret; \
738 CFI_ENDPROC; \ 738 CFI_ENDPROC; \
739 ENDPROC(ptregs_##name) 739 ENDPROC(ptregs_##name)
740 740
741 PTREGSCALL1(iopl) 741 PTREGSCALL1(iopl)
742 PTREGSCALL0(sigreturn) 742 PTREGSCALL0(sigreturn)
743 PTREGSCALL0(rt_sigreturn) 743 PTREGSCALL0(rt_sigreturn)
744 PTREGSCALL2(vm86) 744 PTREGSCALL2(vm86)
745 PTREGSCALL1(vm86old) 745 PTREGSCALL1(vm86old)
746 746
747 .macro FIXUP_ESPFIX_STACK 747 .macro FIXUP_ESPFIX_STACK
748 /* 748 /*
749 * Switch back for ESPFIX stack to the normal zerobased stack 749 * Switch back for ESPFIX stack to the normal zerobased stack
750 * 750 *
751 * We can't call C functions using the ESPFIX stack. This code reads 751 * We can't call C functions using the ESPFIX stack. This code reads
752 * the high word of the segment base from the GDT and switches to the 752 * the high word of the segment base from the GDT and switches to the
753 * normal stack and adjusts ESP with the matching offset. 753 * normal stack and adjusts ESP with the matching offset.
754 */ 754 */
755 /* fixup the stack */ 755 /* fixup the stack */
756 mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ 756 mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
757 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ 757 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
758 shl $16, %eax 758 shl $16, %eax
759 addl %esp, %eax /* the adjusted stack pointer */ 759 addl %esp, %eax /* the adjusted stack pointer */
760 pushl_cfi $__KERNEL_DS 760 pushl_cfi $__KERNEL_DS
761 pushl_cfi %eax 761 pushl_cfi %eax
762 lss (%esp), %esp /* switch to the normal stack segment */ 762 lss (%esp), %esp /* switch to the normal stack segment */
763 CFI_ADJUST_CFA_OFFSET -8 763 CFI_ADJUST_CFA_OFFSET -8
764 .endm 764 .endm
765 .macro UNWIND_ESPFIX_STACK 765 .macro UNWIND_ESPFIX_STACK
766 movl %ss, %eax 766 movl %ss, %eax
767 /* see if on espfix stack */ 767 /* see if on espfix stack */
768 cmpw $__ESPFIX_SS, %ax 768 cmpw $__ESPFIX_SS, %ax
769 jne 27f 769 jne 27f
770 movl $__KERNEL_DS, %eax 770 movl $__KERNEL_DS, %eax
771 movl %eax, %ds 771 movl %eax, %ds
772 movl %eax, %es 772 movl %eax, %es
773 /* switch to normal stack */ 773 /* switch to normal stack */
774 FIXUP_ESPFIX_STACK 774 FIXUP_ESPFIX_STACK
775 27: 775 27:
776 .endm 776 .endm
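
The FIXUP_ESPFIX_STACK macro above rebuilds the linear stack pointer from two bytes of the espfix GDT descriptor plus the current %esp. A minimal user-space sketch of that arithmetic, with made-up descriptor bytes and a hypothetical helper name purely for illustration:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the mov/mov/shl/addl sequence in FIXUP_ESPFIX_STACK. */
static uint32_t fixup_espfix_esp(const uint8_t gdt_espfix_ss[8], uint32_t esp)
{
	uint32_t eax = gdt_espfix_ss[4];		/* mov GDT_ESPFIX_SS + 4, %al  (base bits 16..23) */
	eax |= (uint32_t)gdt_espfix_ss[7] << 8;		/* mov GDT_ESPFIX_SS + 7, %ah  (base bits 24..31) */
	eax <<= 16;					/* shl $16, %eax */
	return eax + esp;				/* addl %esp, %eax */
}

int main(void)
{
	uint8_t desc[8] = { 0 };
	desc[4] = 0x34;					/* invented value: base bits 16..23 */
	desc[7] = 0x12;					/* invented value: base bits 24..31 */
	printf("adjusted esp = 0x%08x\n",
	       (unsigned int)fixup_espfix_esp(desc, 0x0000ffecU));
	return 0;
}

With these invented bytes the segment base contributes 0x12340000, so the program prints 0x1234ffec, the value the macro's lss would then load as the normal stack pointer.
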
777 777
778 /* 778 /*
779 * Build the entry stubs and pointer table with some assembler magic. 779 * Build the entry stubs and pointer table with some assembler magic.
780 * We pack 7 stubs into a single 32-byte chunk, which will fit in a 780 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
781 * single cache line on all modern x86 implementations. 781 * single cache line on all modern x86 implementations.
782 */ 782 */
783 .section .init.rodata,"a" 783 .section .init.rodata,"a"
784 ENTRY(interrupt) 784 ENTRY(interrupt)
785 .section .entry.text, "ax" 785 .section .entry.text, "ax"
786 .p2align 5 786 .p2align 5
787 .p2align CONFIG_X86_L1_CACHE_SHIFT 787 .p2align CONFIG_X86_L1_CACHE_SHIFT
788 ENTRY(irq_entries_start) 788 ENTRY(irq_entries_start)
789 RING0_INT_FRAME 789 RING0_INT_FRAME
790 vector=FIRST_EXTERNAL_VECTOR 790 vector=FIRST_EXTERNAL_VECTOR
791 .rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 791 .rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
792 .balign 32 792 .balign 32
793 .rept 7 793 .rept 7
794 .if vector < NR_VECTORS 794 .if vector < NR_VECTORS
795 .if vector <> FIRST_EXTERNAL_VECTOR 795 .if vector <> FIRST_EXTERNAL_VECTOR
796 CFI_ADJUST_CFA_OFFSET -4 796 CFI_ADJUST_CFA_OFFSET -4
797 .endif 797 .endif
798 1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ 798 1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
799 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 799 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
800 jmp 2f 800 jmp 2f
801 .endif 801 .endif
802 .previous 802 .previous
803 .long 1b 803 .long 1b
804 .section .entry.text, "ax" 804 .section .entry.text, "ax"
805 vector=vector+1 805 vector=vector+1
806 .endif 806 .endif
807 .endr 807 .endr
808 2: jmp common_interrupt 808 2: jmp common_interrupt
809 .endr 809 .endr
810 END(irq_entries_start) 810 END(irq_entries_start)
811 811
812 .previous 812 .previous
813 END(interrupt) 813 END(interrupt)
814 .previous 814 .previous
815 815
816 /* 816 /*
817 * the CPU automatically disables interrupts when executing an IRQ vector, 817 * the CPU automatically disables interrupts when executing an IRQ vector,
818 * so IRQ-flags tracing has to follow that: 818 * so IRQ-flags tracing has to follow that:
819 */ 819 */
820 .p2align CONFIG_X86_L1_CACHE_SHIFT 820 .p2align CONFIG_X86_L1_CACHE_SHIFT
821 common_interrupt: 821 common_interrupt:
822 ASM_CLAC 822 ASM_CLAC
823 addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ 823 addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
824 SAVE_ALL 824 SAVE_ALL
825 TRACE_IRQS_OFF 825 TRACE_IRQS_OFF
826 movl %esp,%eax 826 movl %esp,%eax
827 call do_IRQ 827 call do_IRQ
828 jmp ret_from_intr 828 jmp ret_from_intr
829 ENDPROC(common_interrupt) 829 ENDPROC(common_interrupt)
830 CFI_ENDPROC 830 CFI_ENDPROC
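
Taken together, the stub table and common_interrupt implement a compact vector encoding: each stub pushes ~vector + 0x80, which is always a signed byte (keeping the pushl short enough to pack seven stubs into each 32-byte chunk), and the addl $-0x80 above turns the saved value into ~vector, i.e. something in the [-256, -1] range from which the C handler can recover the vector with another bitwise NOT. A small stand-alone program, written only to make that arithmetic concrete and not taken from the kernel sources, checks the round trip:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	for (int vector = 0x20; vector <= 0xff; vector++) {
		int pushed = ~vector + 0x80;		 /* value pushed by the stub */
		assert(pushed >= -128 && pushed <= 127); /* always fits in a signed byte */

		int orig_ax = pushed - 0x80;		 /* addl $-0x80,(%esp) */
		assert(orig_ax >= -256 && orig_ax <= -1);
		assert(~orig_ax == vector);		 /* handler recovers the vector by NOT */
	}
	puts("vector encoding round-trips for 0x20..0xff");
	return 0;
}
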
831 831
832 /* 832 /*
833 * Irq entries should be protected against kprobes 833 * Irq entries should be protected against kprobes
834 */ 834 */
835 .pushsection .kprobes.text, "ax" 835 .pushsection .kprobes.text, "ax"
836 #define BUILD_INTERRUPT3(name, nr, fn) \ 836 #define BUILD_INTERRUPT3(name, nr, fn) \
837 ENTRY(name) \ 837 ENTRY(name) \
838 RING0_INT_FRAME; \ 838 RING0_INT_FRAME; \
839 ASM_CLAC; \ 839 ASM_CLAC; \
840 pushl_cfi $~(nr); \ 840 pushl_cfi $~(nr); \
841 SAVE_ALL; \ 841 SAVE_ALL; \
842 TRACE_IRQS_OFF \ 842 TRACE_IRQS_OFF \
843 movl %esp,%eax; \ 843 movl %esp,%eax; \
844 call fn; \ 844 call fn; \
845 jmp ret_from_intr; \ 845 jmp ret_from_intr; \
846 CFI_ENDPROC; \ 846 CFI_ENDPROC; \
847 ENDPROC(name) 847 ENDPROC(name)
848 848
849 #define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name) 849 #define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
850 850
851 /* The include is where all of the SMP etc. interrupts come from */ 851 /* The include is where all of the SMP etc. interrupts come from */
852 #include <asm/entry_arch.h> 852 #include <asm/entry_arch.h>
853 853
854 ENTRY(coprocessor_error) 854 ENTRY(coprocessor_error)
855 RING0_INT_FRAME 855 RING0_INT_FRAME
856 ASM_CLAC 856 ASM_CLAC
857 pushl_cfi $0 857 pushl_cfi $0
858 pushl_cfi $do_coprocessor_error 858 pushl_cfi $do_coprocessor_error
859 jmp error_code 859 jmp error_code
860 CFI_ENDPROC 860 CFI_ENDPROC
861 END(coprocessor_error) 861 END(coprocessor_error)
862 862
863 ENTRY(simd_coprocessor_error) 863 ENTRY(simd_coprocessor_error)
864 RING0_INT_FRAME 864 RING0_INT_FRAME
865 ASM_CLAC 865 ASM_CLAC
866 pushl_cfi $0 866 pushl_cfi $0
867 #ifdef CONFIG_X86_INVD_BUG 867 #ifdef CONFIG_X86_INVD_BUG
868 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ 868 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
869 661: pushl_cfi $do_general_protection 869 661: pushl_cfi $do_general_protection
870 662: 870 662:
871 .section .altinstructions,"a" 871 .section .altinstructions,"a"
872 altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f 872 altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
873 .previous 873 .previous
874 .section .altinstr_replacement,"ax" 874 .section .altinstr_replacement,"ax"
875 663: pushl $do_simd_coprocessor_error 875 663: pushl $do_simd_coprocessor_error
876 664: 876 664:
877 .previous 877 .previous
878 #else 878 #else
879 pushl_cfi $do_simd_coprocessor_error 879 pushl_cfi $do_simd_coprocessor_error
880 #endif 880 #endif
881 jmp error_code 881 jmp error_code
882 CFI_ENDPROC 882 CFI_ENDPROC
883 END(simd_coprocessor_error) 883 END(simd_coprocessor_error)
884 884
885 ENTRY(device_not_available) 885 ENTRY(device_not_available)
886 RING0_INT_FRAME 886 RING0_INT_FRAME
887 ASM_CLAC 887 ASM_CLAC
888 pushl_cfi $-1 # mark this as an int 888 pushl_cfi $-1 # mark this as an int
889 pushl_cfi $do_device_not_available 889 pushl_cfi $do_device_not_available
890 jmp error_code 890 jmp error_code
891 CFI_ENDPROC 891 CFI_ENDPROC
892 END(device_not_available) 892 END(device_not_available)
893 893
894 #ifdef CONFIG_PARAVIRT 894 #ifdef CONFIG_PARAVIRT
895 ENTRY(native_iret) 895 ENTRY(native_iret)
896 iret 896 iret
897 _ASM_EXTABLE(native_iret, iret_exc) 897 _ASM_EXTABLE(native_iret, iret_exc)
898 END(native_iret) 898 END(native_iret)
899 899
900 ENTRY(native_irq_enable_sysexit) 900 ENTRY(native_irq_enable_sysexit)
901 sti 901 sti
902 sysexit 902 sysexit
903 END(native_irq_enable_sysexit) 903 END(native_irq_enable_sysexit)
904 #endif 904 #endif
905 905
906 ENTRY(overflow) 906 ENTRY(overflow)
907 RING0_INT_FRAME 907 RING0_INT_FRAME
908 ASM_CLAC 908 ASM_CLAC
909 pushl_cfi $0 909 pushl_cfi $0
910 pushl_cfi $do_overflow 910 pushl_cfi $do_overflow
911 jmp error_code 911 jmp error_code
912 CFI_ENDPROC 912 CFI_ENDPROC
913 END(overflow) 913 END(overflow)
914 914
915 ENTRY(bounds) 915 ENTRY(bounds)
916 RING0_INT_FRAME 916 RING0_INT_FRAME
917 ASM_CLAC 917 ASM_CLAC
918 pushl_cfi $0 918 pushl_cfi $0
919 pushl_cfi $do_bounds 919 pushl_cfi $do_bounds
920 jmp error_code 920 jmp error_code
921 CFI_ENDPROC 921 CFI_ENDPROC
922 END(bounds) 922 END(bounds)
923 923
924 ENTRY(invalid_op) 924 ENTRY(invalid_op)
925 RING0_INT_FRAME 925 RING0_INT_FRAME
926 ASM_CLAC 926 ASM_CLAC
927 pushl_cfi $0 927 pushl_cfi $0
928 pushl_cfi $do_invalid_op 928 pushl_cfi $do_invalid_op
929 jmp error_code 929 jmp error_code
930 CFI_ENDPROC 930 CFI_ENDPROC
931 END(invalid_op) 931 END(invalid_op)
932 932
933 ENTRY(coprocessor_segment_overrun) 933 ENTRY(coprocessor_segment_overrun)
934 RING0_INT_FRAME 934 RING0_INT_FRAME
935 ASM_CLAC 935 ASM_CLAC
936 pushl_cfi $0 936 pushl_cfi $0
937 pushl_cfi $do_coprocessor_segment_overrun 937 pushl_cfi $do_coprocessor_segment_overrun
938 jmp error_code 938 jmp error_code
939 CFI_ENDPROC 939 CFI_ENDPROC
940 END(coprocessor_segment_overrun) 940 END(coprocessor_segment_overrun)
941 941
942 ENTRY(invalid_TSS) 942 ENTRY(invalid_TSS)
943 RING0_EC_FRAME 943 RING0_EC_FRAME
944 ASM_CLAC 944 ASM_CLAC
945 pushl_cfi $do_invalid_TSS 945 pushl_cfi $do_invalid_TSS
946 jmp error_code 946 jmp error_code
947 CFI_ENDPROC 947 CFI_ENDPROC
948 END(invalid_TSS) 948 END(invalid_TSS)
949 949
950 ENTRY(segment_not_present) 950 ENTRY(segment_not_present)
951 RING0_EC_FRAME 951 RING0_EC_FRAME
952 ASM_CLAC 952 ASM_CLAC
953 pushl_cfi $do_segment_not_present 953 pushl_cfi $do_segment_not_present
954 jmp error_code 954 jmp error_code
955 CFI_ENDPROC 955 CFI_ENDPROC
956 END(segment_not_present) 956 END(segment_not_present)
957 957
958 ENTRY(stack_segment) 958 ENTRY(stack_segment)
959 RING0_EC_FRAME 959 RING0_EC_FRAME
960 ASM_CLAC 960 ASM_CLAC
961 pushl_cfi $do_stack_segment 961 pushl_cfi $do_stack_segment
962 jmp error_code 962 jmp error_code
963 CFI_ENDPROC 963 CFI_ENDPROC
964 END(stack_segment) 964 END(stack_segment)
965 965
966 ENTRY(alignment_check) 966 ENTRY(alignment_check)
967 RING0_EC_FRAME 967 RING0_EC_FRAME
968 ASM_CLAC 968 ASM_CLAC
969 pushl_cfi $do_alignment_check 969 pushl_cfi $do_alignment_check
970 jmp error_code 970 jmp error_code
971 CFI_ENDPROC 971 CFI_ENDPROC
972 END(alignment_check) 972 END(alignment_check)
973 973
974 ENTRY(divide_error) 974 ENTRY(divide_error)
975 RING0_INT_FRAME 975 RING0_INT_FRAME
976 ASM_CLAC 976 ASM_CLAC
977 pushl_cfi $0 # no error code 977 pushl_cfi $0 # no error code
978 pushl_cfi $do_divide_error 978 pushl_cfi $do_divide_error
979 jmp error_code 979 jmp error_code
980 CFI_ENDPROC 980 CFI_ENDPROC
981 END(divide_error) 981 END(divide_error)
982 982
983 #ifdef CONFIG_X86_MCE 983 #ifdef CONFIG_X86_MCE
984 ENTRY(machine_check) 984 ENTRY(machine_check)
985 RING0_INT_FRAME 985 RING0_INT_FRAME
986 ASM_CLAC 986 ASM_CLAC
987 pushl_cfi $0 987 pushl_cfi $0
988 pushl_cfi machine_check_vector 988 pushl_cfi machine_check_vector
989 jmp error_code 989 jmp error_code
990 CFI_ENDPROC 990 CFI_ENDPROC
991 END(machine_check) 991 END(machine_check)
992 #endif 992 #endif
993 993
994 ENTRY(spurious_interrupt_bug) 994 ENTRY(spurious_interrupt_bug)
995 RING0_INT_FRAME 995 RING0_INT_FRAME
996 ASM_CLAC 996 ASM_CLAC
997 pushl_cfi $0 997 pushl_cfi $0
998 pushl_cfi $do_spurious_interrupt_bug 998 pushl_cfi $do_spurious_interrupt_bug
999 jmp error_code 999 jmp error_code
1000 CFI_ENDPROC 1000 CFI_ENDPROC
1001 END(spurious_interrupt_bug) 1001 END(spurious_interrupt_bug)
1002 /* 1002 /*
1003 * End of kprobes section 1003 * End of kprobes section
1004 */ 1004 */
1005 .popsection 1005 .popsection
1006 1006
1007 #ifdef CONFIG_XEN 1007 #ifdef CONFIG_XEN
1008 /* Xen doesn't set %esp to be precisely what the normal sysenter 1008 /* Xen doesn't set %esp to be precisely what the normal sysenter
1009 entrypoint expects, so fix it up before using the normal path. */ 1009 entrypoint expects, so fix it up before using the normal path. */
1010 ENTRY(xen_sysenter_target) 1010 ENTRY(xen_sysenter_target)
1011 RING0_INT_FRAME 1011 RING0_INT_FRAME
1012 addl $5*4, %esp /* remove xen-provided frame */ 1012 addl $5*4, %esp /* remove xen-provided frame */
1013 CFI_ADJUST_CFA_OFFSET -5*4 1013 CFI_ADJUST_CFA_OFFSET -5*4
1014 jmp sysenter_past_esp 1014 jmp sysenter_past_esp
1015 CFI_ENDPROC 1015 CFI_ENDPROC
1016 1016
1017 ENTRY(xen_hypervisor_callback) 1017 ENTRY(xen_hypervisor_callback)
1018 CFI_STARTPROC 1018 CFI_STARTPROC
1019 pushl_cfi $-1 /* orig_ax = -1 => not a system call */ 1019 pushl_cfi $-1 /* orig_ax = -1 => not a system call */
1020 SAVE_ALL 1020 SAVE_ALL
1021 TRACE_IRQS_OFF 1021 TRACE_IRQS_OFF
1022 1022
1023 /* Check to see if we got the event in the critical 1023 /* Check to see if we got the event in the critical
1024 region in xen_iret_direct, after we've reenabled 1024 region in xen_iret_direct, after we've reenabled
1025 events and checked for pending events. This simulates 1025 events and checked for pending events. This simulates
1026 iret instruction's behaviour where it delivers a 1026 iret instruction's behaviour where it delivers a
1027 pending interrupt when enabling interrupts. */ 1027 pending interrupt when enabling interrupts. */
1028 movl PT_EIP(%esp),%eax 1028 movl PT_EIP(%esp),%eax
1029 cmpl $xen_iret_start_crit,%eax 1029 cmpl $xen_iret_start_crit,%eax
1030 jb 1f 1030 jb 1f
1031 cmpl $xen_iret_end_crit,%eax 1031 cmpl $xen_iret_end_crit,%eax
1032 jae 1f 1032 jae 1f
1033 1033
1034 jmp xen_iret_crit_fixup 1034 jmp xen_iret_crit_fixup
1035 1035
1036 ENTRY(xen_do_upcall) 1036 ENTRY(xen_do_upcall)
1037 1: mov %esp, %eax 1037 1: mov %esp, %eax
1038 call xen_evtchn_do_upcall 1038 call xen_evtchn_do_upcall
1039 jmp ret_from_intr 1039 jmp ret_from_intr
1040 CFI_ENDPROC 1040 CFI_ENDPROC
1041 ENDPROC(xen_hypervisor_callback) 1041 ENDPROC(xen_hypervisor_callback)
1042 1042
1043 # Hypervisor uses this for application faults while it executes. 1043 # Hypervisor uses this for application faults while it executes.
1044 # We get here for two reasons: 1044 # We get here for two reasons:
1045 # 1. Fault while reloading DS, ES, FS or GS 1045 # 1. Fault while reloading DS, ES, FS or GS
1046 # 2. Fault while executing IRET 1046 # 2. Fault while executing IRET
1047 # Category 1 we fix up by reattempting the load, and zeroing the segment 1047 # Category 1 we fix up by reattempting the load, and zeroing the segment
1048 # register if the load fails. 1048 # register if the load fails.
1049 # Category 2 we fix up by jumping to do_iret_error. We cannot use the 1049 # Category 2 we fix up by jumping to do_iret_error. We cannot use the
1050 # normal Linux return path in this case because if we use the IRET hypercall 1050 # normal Linux return path in this case because if we use the IRET hypercall
1051 # to pop the stack frame we end up in an infinite loop of failsafe callbacks. 1051 # to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1052 # We distinguish between categories by maintaining a status value in EAX. 1052 # We distinguish between categories by maintaining a status value in EAX.
1053 ENTRY(xen_failsafe_callback) 1053 ENTRY(xen_failsafe_callback)
1054 CFI_STARTPROC 1054 CFI_STARTPROC
1055 pushl_cfi %eax 1055 pushl_cfi %eax
1056 movl $1,%eax 1056 movl $1,%eax
1057 1: mov 4(%esp),%ds 1057 1: mov 4(%esp),%ds
1058 2: mov 8(%esp),%es 1058 2: mov 8(%esp),%es
1059 3: mov 12(%esp),%fs 1059 3: mov 12(%esp),%fs
1060 4: mov 16(%esp),%gs 1060 4: mov 16(%esp),%gs
1061 /* EAX == 0 => Category 1 (Bad segment) 1061 /* EAX == 0 => Category 1 (Bad segment)
1062 EAX != 0 => Category 2 (Bad IRET) */ 1062 EAX != 0 => Category 2 (Bad IRET) */
1063 testl %eax,%eax 1063 testl %eax,%eax
1064 popl_cfi %eax 1064 popl_cfi %eax
1065 lea 16(%esp),%esp 1065 lea 16(%esp),%esp
1066 CFI_ADJUST_CFA_OFFSET -16 1066 CFI_ADJUST_CFA_OFFSET -16
1067 jz 5f 1067 jz 5f
1068 jmp iret_exc 1068 jmp iret_exc
1069 5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ 1069 5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */
1070 SAVE_ALL 1070 SAVE_ALL
1071 jmp ret_from_exception 1071 jmp ret_from_exception
1072 CFI_ENDPROC 1072 CFI_ENDPROC
1073 1073
1074 .section .fixup,"ax" 1074 .section .fixup,"ax"
1075 6: xorl %eax,%eax 1075 6: xorl %eax,%eax
1076 movl %eax,4(%esp) 1076 movl %eax,4(%esp)
1077 jmp 1b 1077 jmp 1b
1078 7: xorl %eax,%eax 1078 7: xorl %eax,%eax
1079 movl %eax,8(%esp) 1079 movl %eax,8(%esp)
1080 jmp 2b 1080 jmp 2b
1081 8: xorl %eax,%eax 1081 8: xorl %eax,%eax
1082 movl %eax,12(%esp) 1082 movl %eax,12(%esp)
1083 jmp 3b 1083 jmp 3b
1084 9: xorl %eax,%eax 1084 9: xorl %eax,%eax
1085 movl %eax,16(%esp) 1085 movl %eax,16(%esp)
1086 jmp 4b 1086 jmp 4b
1087 .previous 1087 .previous
1088 _ASM_EXTABLE(1b,6b) 1088 _ASM_EXTABLE(1b,6b)
1089 _ASM_EXTABLE(2b,7b) 1089 _ASM_EXTABLE(2b,7b)
1090 _ASM_EXTABLE(3b,8b) 1090 _ASM_EXTABLE(3b,8b)
1091 _ASM_EXTABLE(4b,9b) 1091 _ASM_EXTABLE(4b,9b)
1092 ENDPROC(xen_failsafe_callback) 1092 ENDPROC(xen_failsafe_callback)
1093 1093
1094 BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, 1094 BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
1095 xen_evtchn_do_upcall) 1095 xen_evtchn_do_upcall)
1096 1096
1097 #endif /* CONFIG_XEN */ 1097 #endif /* CONFIG_XEN */
1098
1099 #if IS_ENABLED(CONFIG_HYPERV)
1100
1101 BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
1102 hyperv_vector_handler)
1103
1104 #endif /* CONFIG_HYPERV */
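
This hunk is the core of the patch on the 32-bit side: when CONFIG_HYPERV is enabled, BUILD_INTERRUPT3 emits a stub named hyperv_callback_vector that saves registers and calls hyperv_vector_handler with a pt_regs pointer, reusing the same HYPERVISOR_CALLBACK_VECTOR as the Xen stub above. The stub still has to be installed in the IDT from C code; a rough sketch of such a registration, assuming the usual alloc_intr_gate() helper from <asm/desc.h> and using a hypothetical function name, might look like this:

#include <asm/desc.h>
#include <asm/irq_vectors.h>

extern void hyperv_callback_vector(void);	/* stub emitted by BUILD_INTERRUPT3 above */

/* Hypothetical illustration, not copied from this patch. */
void hv_install_vmbus_gate_sketch(void)
{
	/* Route the shared callback vector to the new stub; the stub in
	 * turn hands a pt_regs pointer to hyperv_vector_handler(). */
	alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
}
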
1098 1105
1099 #ifdef CONFIG_FUNCTION_TRACER 1106 #ifdef CONFIG_FUNCTION_TRACER
1100 #ifdef CONFIG_DYNAMIC_FTRACE 1107 #ifdef CONFIG_DYNAMIC_FTRACE
1101 1108
1102 ENTRY(mcount) 1109 ENTRY(mcount)
1103 ret 1110 ret
1104 END(mcount) 1111 END(mcount)
1105 1112
1106 ENTRY(ftrace_caller) 1113 ENTRY(ftrace_caller)
1107 cmpl $0, function_trace_stop 1114 cmpl $0, function_trace_stop
1108 jne ftrace_stub 1115 jne ftrace_stub
1109 1116
1110 pushl %eax 1117 pushl %eax
1111 pushl %ecx 1118 pushl %ecx
1112 pushl %edx 1119 pushl %edx
1113 pushl $0 /* Pass NULL as regs pointer */ 1120 pushl $0 /* Pass NULL as regs pointer */
1114 movl 4*4(%esp), %eax 1121 movl 4*4(%esp), %eax
1115 movl 0x4(%ebp), %edx 1122 movl 0x4(%ebp), %edx
1116 leal function_trace_op, %ecx 1123 leal function_trace_op, %ecx
1117 subl $MCOUNT_INSN_SIZE, %eax 1124 subl $MCOUNT_INSN_SIZE, %eax
1118 1125
1119 .globl ftrace_call 1126 .globl ftrace_call
1120 ftrace_call: 1127 ftrace_call:
1121 call ftrace_stub 1128 call ftrace_stub
1122 1129
1123 addl $4,%esp /* skip NULL pointer */ 1130 addl $4,%esp /* skip NULL pointer */
1124 popl %edx 1131 popl %edx
1125 popl %ecx 1132 popl %ecx
1126 popl %eax 1133 popl %eax
1127 ftrace_ret: 1134 ftrace_ret:
1128 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1135 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
1129 .globl ftrace_graph_call 1136 .globl ftrace_graph_call
1130 ftrace_graph_call: 1137 ftrace_graph_call:
1131 jmp ftrace_stub 1138 jmp ftrace_stub
1132 #endif 1139 #endif
1133 1140
1134 .globl ftrace_stub 1141 .globl ftrace_stub
1135 ftrace_stub: 1142 ftrace_stub:
1136 ret 1143 ret
1137 END(ftrace_caller) 1144 END(ftrace_caller)
1138 1145
1139 ENTRY(ftrace_regs_caller) 1146 ENTRY(ftrace_regs_caller)
1140 pushf /* push flags before compare (in cs location) */ 1147 pushf /* push flags before compare (in cs location) */
1141 cmpl $0, function_trace_stop 1148 cmpl $0, function_trace_stop
1142 jne ftrace_restore_flags 1149 jne ftrace_restore_flags
1143 1150
1144 /* 1151 /*
1145 * i386 does not save SS and ESP when coming from kernel. 1152 * i386 does not save SS and ESP when coming from kernel.
1146 * Instead, to get sp, &regs->sp is used (see ptrace.h). 1153 * Instead, to get sp, &regs->sp is used (see ptrace.h).
1147 * Unfortunately, that means eflags must be at the same location 1154 * Unfortunately, that means eflags must be at the same location
1148 * as the current return ip is. We move the return ip into the 1155 * as the current return ip is. We move the return ip into the
1149 * ip location, and move flags into the return ip location. 1156 * ip location, and move flags into the return ip location.
1150 */ 1157 */
1151 pushl 4(%esp) /* save return ip into ip slot */ 1158 pushl 4(%esp) /* save return ip into ip slot */
1152 1159
1153 pushl $0 /* Load 0 into orig_ax */ 1160 pushl $0 /* Load 0 into orig_ax */
1154 pushl %gs 1161 pushl %gs
1155 pushl %fs 1162 pushl %fs
1156 pushl %es 1163 pushl %es
1157 pushl %ds 1164 pushl %ds
1158 pushl %eax 1165 pushl %eax
1159 pushl %ebp 1166 pushl %ebp
1160 pushl %edi 1167 pushl %edi
1161 pushl %esi 1168 pushl %esi
1162 pushl %edx 1169 pushl %edx
1163 pushl %ecx 1170 pushl %ecx
1164 pushl %ebx 1171 pushl %ebx
1165 1172
1166 movl 13*4(%esp), %eax /* Get the saved flags */ 1173 movl 13*4(%esp), %eax /* Get the saved flags */
1167 movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ 1174 movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
1168 /* clobbering return ip */ 1175 /* clobbering return ip */
1169 movl $__KERNEL_CS,13*4(%esp) 1176 movl $__KERNEL_CS,13*4(%esp)
1170 1177
1171 movl 12*4(%esp), %eax /* Load ip (1st parameter) */ 1178 movl 12*4(%esp), %eax /* Load ip (1st parameter) */
1172 subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ 1179 subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
1173 movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ 1180 movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
1174 leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ 1181 leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
1175 pushl %esp /* Save pt_regs as 4th parameter */ 1182 pushl %esp /* Save pt_regs as 4th parameter */
1176 1183
1177 GLOBAL(ftrace_regs_call) 1184 GLOBAL(ftrace_regs_call)
1178 call ftrace_stub 1185 call ftrace_stub
1179 1186
1180 addl $4, %esp /* Skip pt_regs */ 1187 addl $4, %esp /* Skip pt_regs */
1181 movl 14*4(%esp), %eax /* Move flags back into cs */ 1188 movl 14*4(%esp), %eax /* Move flags back into cs */
1182 movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ 1189 movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
1183 movl 12*4(%esp), %eax /* Get return ip from regs->ip */ 1190 movl 12*4(%esp), %eax /* Get return ip from regs->ip */
1184 movl %eax, 14*4(%esp) /* Put return ip back for ret */ 1191 movl %eax, 14*4(%esp) /* Put return ip back for ret */
1185 1192
1186 popl %ebx 1193 popl %ebx
1187 popl %ecx 1194 popl %ecx
1188 popl %edx 1195 popl %edx
1189 popl %esi 1196 popl %esi
1190 popl %edi 1197 popl %edi
1191 popl %ebp 1198 popl %ebp
1192 popl %eax 1199 popl %eax
1193 popl %ds 1200 popl %ds
1194 popl %es 1201 popl %es
1195 popl %fs 1202 popl %fs
1196 popl %gs 1203 popl %gs
1197 addl $8, %esp /* Skip orig_ax and ip */ 1204 addl $8, %esp /* Skip orig_ax and ip */
1198 popf /* Pop flags at end (no addl to corrupt flags) */ 1205 popf /* Pop flags at end (no addl to corrupt flags) */
1199 jmp ftrace_ret 1206 jmp ftrace_ret
1200 1207
1201 ftrace_restore_flags: 1208 ftrace_restore_flags:
1202 popf 1209 popf
1203 jmp ftrace_stub 1210 jmp ftrace_stub
1204 #else /* ! CONFIG_DYNAMIC_FTRACE */ 1211 #else /* ! CONFIG_DYNAMIC_FTRACE */
1205 1212
1206 ENTRY(mcount) 1213 ENTRY(mcount)
1207 cmpl $0, function_trace_stop 1214 cmpl $0, function_trace_stop
1208 jne ftrace_stub 1215 jne ftrace_stub
1209 1216
1210 cmpl $ftrace_stub, ftrace_trace_function 1217 cmpl $ftrace_stub, ftrace_trace_function
1211 jnz trace 1218 jnz trace
1212 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1219 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
1213 cmpl $ftrace_stub, ftrace_graph_return 1220 cmpl $ftrace_stub, ftrace_graph_return
1214 jnz ftrace_graph_caller 1221 jnz ftrace_graph_caller
1215 1222
1216 cmpl $ftrace_graph_entry_stub, ftrace_graph_entry 1223 cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
1217 jnz ftrace_graph_caller 1224 jnz ftrace_graph_caller
1218 #endif 1225 #endif
1219 .globl ftrace_stub 1226 .globl ftrace_stub
1220 ftrace_stub: 1227 ftrace_stub:
1221 ret 1228 ret
1222 1229
1223 /* taken from glibc */ 1230 /* taken from glibc */
1224 trace: 1231 trace:
1225 pushl %eax 1232 pushl %eax
1226 pushl %ecx 1233 pushl %ecx
1227 pushl %edx 1234 pushl %edx
1228 movl 0xc(%esp), %eax 1235 movl 0xc(%esp), %eax
1229 movl 0x4(%ebp), %edx 1236 movl 0x4(%ebp), %edx
1230 subl $MCOUNT_INSN_SIZE, %eax 1237 subl $MCOUNT_INSN_SIZE, %eax
1231 1238
1232 call *ftrace_trace_function 1239 call *ftrace_trace_function
1233 1240
1234 popl %edx 1241 popl %edx
1235 popl %ecx 1242 popl %ecx
1236 popl %eax 1243 popl %eax
1237 jmp ftrace_stub 1244 jmp ftrace_stub
1238 END(mcount) 1245 END(mcount)
1239 #endif /* CONFIG_DYNAMIC_FTRACE */ 1246 #endif /* CONFIG_DYNAMIC_FTRACE */
1240 #endif /* CONFIG_FUNCTION_TRACER */ 1247 #endif /* CONFIG_FUNCTION_TRACER */
1241 1248
1242 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1249 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
1243 ENTRY(ftrace_graph_caller) 1250 ENTRY(ftrace_graph_caller)
1244 pushl %eax 1251 pushl %eax
1245 pushl %ecx 1252 pushl %ecx
1246 pushl %edx 1253 pushl %edx
1247 movl 0xc(%esp), %edx 1254 movl 0xc(%esp), %edx
1248 lea 0x4(%ebp), %eax 1255 lea 0x4(%ebp), %eax
1249 movl (%ebp), %ecx 1256 movl (%ebp), %ecx
1250 subl $MCOUNT_INSN_SIZE, %edx 1257 subl $MCOUNT_INSN_SIZE, %edx
1251 call prepare_ftrace_return 1258 call prepare_ftrace_return
1252 popl %edx 1259 popl %edx
1253 popl %ecx 1260 popl %ecx
1254 popl %eax 1261 popl %eax
1255 ret 1262 ret
1256 END(ftrace_graph_caller) 1263 END(ftrace_graph_caller)
1257 1264
1258 .globl return_to_handler 1265 .globl return_to_handler
1259 return_to_handler: 1266 return_to_handler:
1260 pushl %eax 1267 pushl %eax
1261 pushl %edx 1268 pushl %edx
1262 movl %ebp, %eax 1269 movl %ebp, %eax
1263 call ftrace_return_to_handler 1270 call ftrace_return_to_handler
1264 movl %eax, %ecx 1271 movl %eax, %ecx
1265 popl %edx 1272 popl %edx
1266 popl %eax 1273 popl %eax
1267 jmp *%ecx 1274 jmp *%ecx
1268 #endif 1275 #endif
1269 1276
1270 /* 1277 /*
1271 * Some functions should be protected against kprobes 1278 * Some functions should be protected against kprobes
1272 */ 1279 */
1273 .pushsection .kprobes.text, "ax" 1280 .pushsection .kprobes.text, "ax"
1274 1281
1275 ENTRY(page_fault) 1282 ENTRY(page_fault)
1276 RING0_EC_FRAME 1283 RING0_EC_FRAME
1277 ASM_CLAC 1284 ASM_CLAC
1278 pushl_cfi $do_page_fault 1285 pushl_cfi $do_page_fault
1279 ALIGN 1286 ALIGN
1280 error_code: 1287 error_code:
1281 /* the function address is in %gs's slot on the stack */ 1288 /* the function address is in %gs's slot on the stack */
1282 pushl_cfi %fs 1289 pushl_cfi %fs
1283 /*CFI_REL_OFFSET fs, 0*/ 1290 /*CFI_REL_OFFSET fs, 0*/
1284 pushl_cfi %es 1291 pushl_cfi %es
1285 /*CFI_REL_OFFSET es, 0*/ 1292 /*CFI_REL_OFFSET es, 0*/
1286 pushl_cfi %ds 1293 pushl_cfi %ds
1287 /*CFI_REL_OFFSET ds, 0*/ 1294 /*CFI_REL_OFFSET ds, 0*/
1288 pushl_cfi %eax 1295 pushl_cfi %eax
1289 CFI_REL_OFFSET eax, 0 1296 CFI_REL_OFFSET eax, 0
1290 pushl_cfi %ebp 1297 pushl_cfi %ebp
1291 CFI_REL_OFFSET ebp, 0 1298 CFI_REL_OFFSET ebp, 0
1292 pushl_cfi %edi 1299 pushl_cfi %edi
1293 CFI_REL_OFFSET edi, 0 1300 CFI_REL_OFFSET edi, 0
1294 pushl_cfi %esi 1301 pushl_cfi %esi
1295 CFI_REL_OFFSET esi, 0 1302 CFI_REL_OFFSET esi, 0
1296 pushl_cfi %edx 1303 pushl_cfi %edx
1297 CFI_REL_OFFSET edx, 0 1304 CFI_REL_OFFSET edx, 0
1298 pushl_cfi %ecx 1305 pushl_cfi %ecx
1299 CFI_REL_OFFSET ecx, 0 1306 CFI_REL_OFFSET ecx, 0
1300 pushl_cfi %ebx 1307 pushl_cfi %ebx
1301 CFI_REL_OFFSET ebx, 0 1308 CFI_REL_OFFSET ebx, 0
1302 cld 1309 cld
1303 movl $(__KERNEL_PERCPU), %ecx 1310 movl $(__KERNEL_PERCPU), %ecx
1304 movl %ecx, %fs 1311 movl %ecx, %fs
1305 UNWIND_ESPFIX_STACK 1312 UNWIND_ESPFIX_STACK
1306 GS_TO_REG %ecx 1313 GS_TO_REG %ecx
1307 movl PT_GS(%esp), %edi # get the function address 1314 movl PT_GS(%esp), %edi # get the function address
1308 movl PT_ORIG_EAX(%esp), %edx # get the error code 1315 movl PT_ORIG_EAX(%esp), %edx # get the error code
1309 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart 1316 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
1310 REG_TO_PTGS %ecx 1317 REG_TO_PTGS %ecx
1311 SET_KERNEL_GS %ecx 1318 SET_KERNEL_GS %ecx
1312 movl $(__USER_DS), %ecx 1319 movl $(__USER_DS), %ecx
1313 movl %ecx, %ds 1320 movl %ecx, %ds
1314 movl %ecx, %es 1321 movl %ecx, %es
1315 TRACE_IRQS_OFF 1322 TRACE_IRQS_OFF
1316 movl %esp,%eax # pt_regs pointer 1323 movl %esp,%eax # pt_regs pointer
1317 call *%edi 1324 call *%edi
1318 jmp ret_from_exception 1325 jmp ret_from_exception
1319 CFI_ENDPROC 1326 CFI_ENDPROC
1320 END(page_fault) 1327 END(page_fault)
1321 1328
1322 /* 1329 /*
1323 * Debug traps and NMI can happen at the one SYSENTER instruction 1330 * Debug traps and NMI can happen at the one SYSENTER instruction
1324 * that sets up the real kernel stack. Check here, since we can't 1331 * that sets up the real kernel stack. Check here, since we can't
1325 * allow the wrong stack to be used. 1332 * allow the wrong stack to be used.
1326 * 1333 *
1327 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have 1334 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
1328 * already pushed 3 words if it hits on the sysenter instruction: 1335 * already pushed 3 words if it hits on the sysenter instruction:
1329 * eflags, cs and eip. 1336 * eflags, cs and eip.
1330 * 1337 *
1331 * We just load the right stack, and push the three (known) values 1338 * We just load the right stack, and push the three (known) values
1332 * by hand onto the new stack - while updating the return eip past 1339 * by hand onto the new stack - while updating the return eip past
1333 * the instruction that would have done it for sysenter. 1340 * the instruction that would have done it for sysenter.
1334 */ 1341 */
1335 .macro FIX_STACK offset ok label 1342 .macro FIX_STACK offset ok label
1336 cmpw $__KERNEL_CS, 4(%esp) 1343 cmpw $__KERNEL_CS, 4(%esp)
1337 jne \ok 1344 jne \ok
1338 \label: 1345 \label:
1339 movl TSS_sysenter_sp0 + \offset(%esp), %esp 1346 movl TSS_sysenter_sp0 + \offset(%esp), %esp
1340 CFI_DEF_CFA esp, 0 1347 CFI_DEF_CFA esp, 0
1341 CFI_UNDEFINED eip 1348 CFI_UNDEFINED eip
1342 pushfl_cfi 1349 pushfl_cfi
1343 pushl_cfi $__KERNEL_CS 1350 pushl_cfi $__KERNEL_CS
1344 pushl_cfi $sysenter_past_esp 1351 pushl_cfi $sysenter_past_esp
1345 CFI_REL_OFFSET eip, 0 1352 CFI_REL_OFFSET eip, 0
1346 .endm 1353 .endm
1347 1354
1348 ENTRY(debug) 1355 ENTRY(debug)
1349 RING0_INT_FRAME 1356 RING0_INT_FRAME
1350 ASM_CLAC 1357 ASM_CLAC
1351 cmpl $ia32_sysenter_target,(%esp) 1358 cmpl $ia32_sysenter_target,(%esp)
1352 jne debug_stack_correct 1359 jne debug_stack_correct
1353 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn 1360 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
1354 debug_stack_correct: 1361 debug_stack_correct:
1355 pushl_cfi $-1 # mark this as an int 1362 pushl_cfi $-1 # mark this as an int
1356 SAVE_ALL 1363 SAVE_ALL
1357 TRACE_IRQS_OFF 1364 TRACE_IRQS_OFF
1358 xorl %edx,%edx # error code 0 1365 xorl %edx,%edx # error code 0
1359 movl %esp,%eax # pt_regs pointer 1366 movl %esp,%eax # pt_regs pointer
1360 call do_debug 1367 call do_debug
1361 jmp ret_from_exception 1368 jmp ret_from_exception
1362 CFI_ENDPROC 1369 CFI_ENDPROC
1363 END(debug) 1370 END(debug)
1364 1371
1365 /* 1372 /*
1366 * NMI is doubly nasty. It can happen _while_ we're handling 1373 * NMI is doubly nasty. It can happen _while_ we're handling
1367 * a debug fault, and the debug fault hasn't yet been able to 1374 * a debug fault, and the debug fault hasn't yet been able to
1368 * clear up the stack. So we first check whether we got an 1375 * clear up the stack. So we first check whether we got an
1369 * NMI on the sysenter entry path, but after that we need to 1376 * NMI on the sysenter entry path, but after that we need to
1370 * check whether we got an NMI on the debug path where the debug 1377 * check whether we got an NMI on the debug path where the debug
1371 * fault happened on the sysenter path. 1378 * fault happened on the sysenter path.
1372 */ 1379 */
1373 ENTRY(nmi) 1380 ENTRY(nmi)
1374 RING0_INT_FRAME 1381 RING0_INT_FRAME
1375 ASM_CLAC 1382 ASM_CLAC
1376 pushl_cfi %eax 1383 pushl_cfi %eax
1377 movl %ss, %eax 1384 movl %ss, %eax
1378 cmpw $__ESPFIX_SS, %ax 1385 cmpw $__ESPFIX_SS, %ax
1379 popl_cfi %eax 1386 popl_cfi %eax
1380 je nmi_espfix_stack 1387 je nmi_espfix_stack
1381 cmpl $ia32_sysenter_target,(%esp) 1388 cmpl $ia32_sysenter_target,(%esp)
1382 je nmi_stack_fixup 1389 je nmi_stack_fixup
1383 pushl_cfi %eax 1390 pushl_cfi %eax
1384 movl %esp,%eax 1391 movl %esp,%eax
1385 /* Do not access memory above the end of our stack page, 1392 /* Do not access memory above the end of our stack page,
1386 * it might not exist. 1393 * it might not exist.
1387 */ 1394 */
1388 andl $(THREAD_SIZE-1),%eax 1395 andl $(THREAD_SIZE-1),%eax
1389 cmpl $(THREAD_SIZE-20),%eax 1396 cmpl $(THREAD_SIZE-20),%eax
1390 popl_cfi %eax 1397 popl_cfi %eax
1391 jae nmi_stack_correct 1398 jae nmi_stack_correct
1392 cmpl $ia32_sysenter_target,12(%esp) 1399 cmpl $ia32_sysenter_target,12(%esp)
1393 je nmi_debug_stack_check 1400 je nmi_debug_stack_check
1394 nmi_stack_correct: 1401 nmi_stack_correct:
1395 /* We have a RING0_INT_FRAME here */ 1402 /* We have a RING0_INT_FRAME here */
1396 pushl_cfi %eax 1403 pushl_cfi %eax
1397 SAVE_ALL 1404 SAVE_ALL
1398 xorl %edx,%edx # zero error code 1405 xorl %edx,%edx # zero error code
1399 movl %esp,%eax # pt_regs pointer 1406 movl %esp,%eax # pt_regs pointer
1400 call do_nmi 1407 call do_nmi
1401 jmp restore_all_notrace 1408 jmp restore_all_notrace
1402 CFI_ENDPROC 1409 CFI_ENDPROC
1403 1410
1404 nmi_stack_fixup: 1411 nmi_stack_fixup:
1405 RING0_INT_FRAME 1412 RING0_INT_FRAME
1406 FIX_STACK 12, nmi_stack_correct, 1 1413 FIX_STACK 12, nmi_stack_correct, 1
1407 jmp nmi_stack_correct 1414 jmp nmi_stack_correct
1408 1415
1409 nmi_debug_stack_check: 1416 nmi_debug_stack_check:
1410 /* We have a RING0_INT_FRAME here */ 1417 /* We have a RING0_INT_FRAME here */
1411 cmpw $__KERNEL_CS,16(%esp) 1418 cmpw $__KERNEL_CS,16(%esp)
1412 jne nmi_stack_correct 1419 jne nmi_stack_correct
1413 cmpl $debug,(%esp) 1420 cmpl $debug,(%esp)
1414 jb nmi_stack_correct 1421 jb nmi_stack_correct
1415 cmpl $debug_esp_fix_insn,(%esp) 1422 cmpl $debug_esp_fix_insn,(%esp)
1416 ja nmi_stack_correct 1423 ja nmi_stack_correct
1417 FIX_STACK 24, nmi_stack_correct, 1 1424 FIX_STACK 24, nmi_stack_correct, 1
1418 jmp nmi_stack_correct 1425 jmp nmi_stack_correct
1419 1426
1420 nmi_espfix_stack: 1427 nmi_espfix_stack:
1421 /* We have a RING0_INT_FRAME here. 1428 /* We have a RING0_INT_FRAME here.
1422 * 1429 *
1423 * create the pointer to lss back 1430 * create the pointer to lss back
1424 */ 1431 */
1425 pushl_cfi %ss 1432 pushl_cfi %ss
1426 pushl_cfi %esp 1433 pushl_cfi %esp
1427 addl $4, (%esp) 1434 addl $4, (%esp)
1428 /* copy the iret frame of 12 bytes */ 1435 /* copy the iret frame of 12 bytes */
1429 .rept 3 1436 .rept 3
1430 pushl_cfi 16(%esp) 1437 pushl_cfi 16(%esp)
1431 .endr 1438 .endr
1432 pushl_cfi %eax 1439 pushl_cfi %eax
1433 SAVE_ALL 1440 SAVE_ALL
1434 FIXUP_ESPFIX_STACK # %eax == %esp 1441 FIXUP_ESPFIX_STACK # %eax == %esp
1435 xorl %edx,%edx # zero error code 1442 xorl %edx,%edx # zero error code
1436 call do_nmi 1443 call do_nmi
1437 RESTORE_REGS 1444 RESTORE_REGS
1438 lss 12+4(%esp), %esp # back to espfix stack 1445 lss 12+4(%esp), %esp # back to espfix stack
1439 CFI_ADJUST_CFA_OFFSET -24 1446 CFI_ADJUST_CFA_OFFSET -24
1440 jmp irq_return 1447 jmp irq_return
1441 CFI_ENDPROC 1448 CFI_ENDPROC
1442 END(nmi) 1449 END(nmi)
1443 1450
1444 ENTRY(int3) 1451 ENTRY(int3)
1445 RING0_INT_FRAME 1452 RING0_INT_FRAME
1446 ASM_CLAC 1453 ASM_CLAC
1447 pushl_cfi $-1 # mark this as an int 1454 pushl_cfi $-1 # mark this as an int
1448 SAVE_ALL 1455 SAVE_ALL
1449 TRACE_IRQS_OFF 1456 TRACE_IRQS_OFF
1450 xorl %edx,%edx # zero error code 1457 xorl %edx,%edx # zero error code
1451 movl %esp,%eax # pt_regs pointer 1458 movl %esp,%eax # pt_regs pointer
1452 call do_int3 1459 call do_int3
1453 jmp ret_from_exception 1460 jmp ret_from_exception
1454 CFI_ENDPROC 1461 CFI_ENDPROC
1455 END(int3) 1462 END(int3)
1456 1463
1457 ENTRY(general_protection) 1464 ENTRY(general_protection)
1458 RING0_EC_FRAME 1465 RING0_EC_FRAME
1459 pushl_cfi $do_general_protection 1466 pushl_cfi $do_general_protection
1460 jmp error_code 1467 jmp error_code
1461 CFI_ENDPROC 1468 CFI_ENDPROC
1462 END(general_protection) 1469 END(general_protection)
1463 1470
1464 #ifdef CONFIG_KVM_GUEST 1471 #ifdef CONFIG_KVM_GUEST
1465 ENTRY(async_page_fault) 1472 ENTRY(async_page_fault)
1466 RING0_EC_FRAME 1473 RING0_EC_FRAME
1467 ASM_CLAC 1474 ASM_CLAC
1468 pushl_cfi $do_async_page_fault 1475 pushl_cfi $do_async_page_fault
1469 jmp error_code 1476 jmp error_code
1470 CFI_ENDPROC 1477 CFI_ENDPROC
1471 END(async_page_fault) 1478 END(async_page_fault)
1472 #endif 1479 #endif
1473 1480
1474 /* 1481 /*
1475 * End of kprobes section 1482 * End of kprobes section
1476 */ 1483 */
1477 .popsection 1484 .popsection
1478 1485
arch/x86/kernel/entry_64.S
1 /* 1 /*
2 * linux/arch/x86_64/entry.S 2 * linux/arch/x86_64/entry.S
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs 5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
7 */ 7 */
8 8
9 /* 9 /*
10 * entry.S contains the system-call and fault low-level handling routines. 10 * entry.S contains the system-call and fault low-level handling routines.
11 * 11 *
12 * Some of this is documented in Documentation/x86/entry_64.txt 12 * Some of this is documented in Documentation/x86/entry_64.txt
13 * 13 *
14 * NOTE: This code handles signal-recognition, which happens every time 14 * NOTE: This code handles signal-recognition, which happens every time
15 * after an interrupt and after each system call. 15 * after an interrupt and after each system call.
16 * 16 *
17 * Normal syscalls and interrupts don't save a full stack frame, this is 17 * Normal syscalls and interrupts don't save a full stack frame, this is
18 * only done for syscall tracing, signals or fork/exec et.al. 18 * only done for syscall tracing, signals or fork/exec et.al.
19 * 19 *
20 * A note on terminology: 20 * A note on terminology:
21 * - top of stack: Architecture defined interrupt frame from SS to RIP 21 * - top of stack: Architecture defined interrupt frame from SS to RIP
22 * at the top of the kernel process stack. 22 * at the top of the kernel process stack.
23 * - partial stack frame: partially saved registers up to R11. 23 * - partial stack frame: partially saved registers up to R11.
24 * - full stack frame: Like partial stack frame, but all registers saved. 24 * - full stack frame: Like partial stack frame, but all registers saved.
25 * 25 *
26 * Some macro usage: 26 * Some macro usage:
27 * - CFI macros are used to generate dwarf2 unwind information for better 27 * - CFI macros are used to generate dwarf2 unwind information for better
28 * backtraces. They don't change any code. 28 * backtraces. They don't change any code.
29 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers 29 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
30 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. 30 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
31 * There are unfortunately lots of special cases where some registers 31 * There are unfortunately lots of special cases where some registers
32 * are not touched. The macro is a big mess that should be cleaned up. 32 * are not touched. The macro is a big mess that should be cleaned up.
33 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. 33 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
34 * Gives a full stack frame. 34 * Gives a full stack frame.
35 * - ENTRY/END Define functions in the symbol table. 35 * - ENTRY/END Define functions in the symbol table.
36 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack 36 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
37 * frame that is otherwise undefined after a SYSCALL 37 * frame that is otherwise undefined after a SYSCALL
38 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. 38 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
39 * - errorentry/paranoidentry/zeroentry - Define exception entry points. 39 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
40 */ 40 */
41 41
42 #include <linux/linkage.h> 42 #include <linux/linkage.h>
43 #include <asm/segment.h> 43 #include <asm/segment.h>
44 #include <asm/cache.h> 44 #include <asm/cache.h>
45 #include <asm/errno.h> 45 #include <asm/errno.h>
46 #include <asm/dwarf2.h> 46 #include <asm/dwarf2.h>
47 #include <asm/calling.h> 47 #include <asm/calling.h>
48 #include <asm/asm-offsets.h> 48 #include <asm/asm-offsets.h>
49 #include <asm/msr.h> 49 #include <asm/msr.h>
50 #include <asm/unistd.h> 50 #include <asm/unistd.h>
51 #include <asm/thread_info.h> 51 #include <asm/thread_info.h>
52 #include <asm/hw_irq.h> 52 #include <asm/hw_irq.h>
53 #include <asm/page_types.h> 53 #include <asm/page_types.h>
54 #include <asm/irqflags.h> 54 #include <asm/irqflags.h>
55 #include <asm/paravirt.h> 55 #include <asm/paravirt.h>
56 #include <asm/ftrace.h> 56 #include <asm/ftrace.h>
57 #include <asm/percpu.h> 57 #include <asm/percpu.h>
58 #include <asm/asm.h> 58 #include <asm/asm.h>
59 #include <asm/context_tracking.h> 59 #include <asm/context_tracking.h>
60 #include <asm/smap.h> 60 #include <asm/smap.h>
61 #include <linux/err.h> 61 #include <linux/err.h>
62 62
63 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 63 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
64 #include <linux/elf-em.h> 64 #include <linux/elf-em.h>
65 #define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) 65 #define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
66 #define __AUDIT_ARCH_64BIT 0x80000000 66 #define __AUDIT_ARCH_64BIT 0x80000000
67 #define __AUDIT_ARCH_LE 0x40000000 67 #define __AUDIT_ARCH_LE 0x40000000
68 68
69 .code64 69 .code64
70 .section .entry.text, "ax" 70 .section .entry.text, "ax"
71 71
72 #ifdef CONFIG_FUNCTION_TRACER 72 #ifdef CONFIG_FUNCTION_TRACER
73 73
74 #ifdef CC_USING_FENTRY 74 #ifdef CC_USING_FENTRY
75 # define function_hook __fentry__ 75 # define function_hook __fentry__
76 #else 76 #else
77 # define function_hook mcount 77 # define function_hook mcount
78 #endif 78 #endif
79 79
80 #ifdef CONFIG_DYNAMIC_FTRACE 80 #ifdef CONFIG_DYNAMIC_FTRACE
81 81
82 ENTRY(function_hook) 82 ENTRY(function_hook)
83 retq 83 retq
84 END(function_hook) 84 END(function_hook)
85 85
86 /* skip is set if stack has been adjusted */ 86 /* skip is set if stack has been adjusted */
87 .macro ftrace_caller_setup skip=0 87 .macro ftrace_caller_setup skip=0
88 MCOUNT_SAVE_FRAME \skip 88 MCOUNT_SAVE_FRAME \skip
89 89
90 /* Load the ftrace_ops into the 3rd parameter */ 90 /* Load the ftrace_ops into the 3rd parameter */
91 leaq function_trace_op, %rdx 91 leaq function_trace_op, %rdx
92 92
93 /* Load ip into the first parameter */ 93 /* Load ip into the first parameter */
94 movq RIP(%rsp), %rdi 94 movq RIP(%rsp), %rdi
95 subq $MCOUNT_INSN_SIZE, %rdi 95 subq $MCOUNT_INSN_SIZE, %rdi
96 /* Load the parent_ip into the second parameter */ 96 /* Load the parent_ip into the second parameter */
97 #ifdef CC_USING_FENTRY 97 #ifdef CC_USING_FENTRY
98 movq SS+16(%rsp), %rsi 98 movq SS+16(%rsp), %rsi
99 #else 99 #else
100 movq 8(%rbp), %rsi 100 movq 8(%rbp), %rsi
101 #endif 101 #endif
102 .endm 102 .endm
103 103
104 ENTRY(ftrace_caller) 104 ENTRY(ftrace_caller)
105 /* Check if tracing was disabled (quick check) */ 105 /* Check if tracing was disabled (quick check) */
106 cmpl $0, function_trace_stop 106 cmpl $0, function_trace_stop
107 jne ftrace_stub 107 jne ftrace_stub
108 108
109 ftrace_caller_setup 109 ftrace_caller_setup
110 /* regs go into 4th parameter (but make it NULL) */ 110 /* regs go into 4th parameter (but make it NULL) */
111 movq $0, %rcx 111 movq $0, %rcx
112 112
113 GLOBAL(ftrace_call) 113 GLOBAL(ftrace_call)
114 call ftrace_stub 114 call ftrace_stub
115 115
116 MCOUNT_RESTORE_FRAME 116 MCOUNT_RESTORE_FRAME
117 ftrace_return: 117 ftrace_return:
118 118
119 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 119 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
120 GLOBAL(ftrace_graph_call) 120 GLOBAL(ftrace_graph_call)
121 jmp ftrace_stub 121 jmp ftrace_stub
122 #endif 122 #endif
123 123
124 GLOBAL(ftrace_stub) 124 GLOBAL(ftrace_stub)
125 retq 125 retq
126 END(ftrace_caller) 126 END(ftrace_caller)
127 127
128 ENTRY(ftrace_regs_caller) 128 ENTRY(ftrace_regs_caller)
129 /* Save the current flags before compare (in SS location)*/ 129 /* Save the current flags before compare (in SS location)*/
130 pushfq 130 pushfq
131 131
132 /* Check if tracing was disabled (quick check) */ 132 /* Check if tracing was disabled (quick check) */
133 cmpl $0, function_trace_stop 133 cmpl $0, function_trace_stop
134 jne ftrace_restore_flags 134 jne ftrace_restore_flags
135 135
136 /* skip=8 to skip flags saved in SS */ 136 /* skip=8 to skip flags saved in SS */
137 ftrace_caller_setup 8 137 ftrace_caller_setup 8
138 138
139 /* Save the rest of pt_regs */ 139 /* Save the rest of pt_regs */
140 movq %r15, R15(%rsp) 140 movq %r15, R15(%rsp)
141 movq %r14, R14(%rsp) 141 movq %r14, R14(%rsp)
142 movq %r13, R13(%rsp) 142 movq %r13, R13(%rsp)
143 movq %r12, R12(%rsp) 143 movq %r12, R12(%rsp)
144 movq %r11, R11(%rsp) 144 movq %r11, R11(%rsp)
145 movq %r10, R10(%rsp) 145 movq %r10, R10(%rsp)
146 movq %rbp, RBP(%rsp) 146 movq %rbp, RBP(%rsp)
147 movq %rbx, RBX(%rsp) 147 movq %rbx, RBX(%rsp)
148 /* Copy saved flags */ 148 /* Copy saved flags */
149 movq SS(%rsp), %rcx 149 movq SS(%rsp), %rcx
150 movq %rcx, EFLAGS(%rsp) 150 movq %rcx, EFLAGS(%rsp)
151 /* Kernel segments */ 151 /* Kernel segments */
152 movq $__KERNEL_DS, %rcx 152 movq $__KERNEL_DS, %rcx
153 movq %rcx, SS(%rsp) 153 movq %rcx, SS(%rsp)
154 movq $__KERNEL_CS, %rcx 154 movq $__KERNEL_CS, %rcx
155 movq %rcx, CS(%rsp) 155 movq %rcx, CS(%rsp)
156 /* Stack - skipping return address */ 156 /* Stack - skipping return address */
157 leaq SS+16(%rsp), %rcx 157 leaq SS+16(%rsp), %rcx
158 movq %rcx, RSP(%rsp) 158 movq %rcx, RSP(%rsp)
159 159
160 /* regs go into 4th parameter */ 160 /* regs go into 4th parameter */
161 leaq (%rsp), %rcx 161 leaq (%rsp), %rcx
162 162
163 GLOBAL(ftrace_regs_call) 163 GLOBAL(ftrace_regs_call)
164 call ftrace_stub 164 call ftrace_stub
165 165
166 /* Copy flags back to SS, to restore them */ 166 /* Copy flags back to SS, to restore them */
167 movq EFLAGS(%rsp), %rax 167 movq EFLAGS(%rsp), %rax
168 movq %rax, SS(%rsp) 168 movq %rax, SS(%rsp)
169 169
170 /* Handlers can change the RIP */ 170 /* Handlers can change the RIP */
171 movq RIP(%rsp), %rax 171 movq RIP(%rsp), %rax
172 movq %rax, SS+8(%rsp) 172 movq %rax, SS+8(%rsp)
173 173
174 /* restore the rest of pt_regs */ 174 /* restore the rest of pt_regs */
175 movq R15(%rsp), %r15 175 movq R15(%rsp), %r15
176 movq R14(%rsp), %r14 176 movq R14(%rsp), %r14
177 movq R13(%rsp), %r13 177 movq R13(%rsp), %r13
178 movq R12(%rsp), %r12 178 movq R12(%rsp), %r12
179 movq R10(%rsp), %r10 179 movq R10(%rsp), %r10
180 movq RBP(%rsp), %rbp 180 movq RBP(%rsp), %rbp
181 movq RBX(%rsp), %rbx 181 movq RBX(%rsp), %rbx
182 182
183 /* skip=8 to skip flags saved in SS */ 183 /* skip=8 to skip flags saved in SS */
184 MCOUNT_RESTORE_FRAME 8 184 MCOUNT_RESTORE_FRAME 8
185 185
186 /* Restore flags */ 186 /* Restore flags */
187 popfq 187 popfq
188 188
189 jmp ftrace_return 189 jmp ftrace_return
190 ftrace_restore_flags: 190 ftrace_restore_flags:
191 popfq 191 popfq
192 jmp ftrace_stub 192 jmp ftrace_stub
193 193
194 END(ftrace_regs_caller) 194 END(ftrace_regs_caller)
195 195
196 196
197 #else /* ! CONFIG_DYNAMIC_FTRACE */ 197 #else /* ! CONFIG_DYNAMIC_FTRACE */
198 198
199 ENTRY(function_hook) 199 ENTRY(function_hook)
200 cmpl $0, function_trace_stop 200 cmpl $0, function_trace_stop
201 jne ftrace_stub 201 jne ftrace_stub
202 202
203 cmpq $ftrace_stub, ftrace_trace_function 203 cmpq $ftrace_stub, ftrace_trace_function
204 jnz trace 204 jnz trace
205 205
206 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 206 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
207 cmpq $ftrace_stub, ftrace_graph_return 207 cmpq $ftrace_stub, ftrace_graph_return
208 jnz ftrace_graph_caller 208 jnz ftrace_graph_caller
209 209
210 cmpq $ftrace_graph_entry_stub, ftrace_graph_entry 210 cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
211 jnz ftrace_graph_caller 211 jnz ftrace_graph_caller
212 #endif 212 #endif
213 213
214 GLOBAL(ftrace_stub) 214 GLOBAL(ftrace_stub)
215 retq 215 retq
216 216
217 trace: 217 trace:
218 MCOUNT_SAVE_FRAME 218 MCOUNT_SAVE_FRAME
219 219
220 movq RIP(%rsp), %rdi 220 movq RIP(%rsp), %rdi
221 #ifdef CC_USING_FENTRY 221 #ifdef CC_USING_FENTRY
222 movq SS+16(%rsp), %rsi 222 movq SS+16(%rsp), %rsi
223 #else 223 #else
224 movq 8(%rbp), %rsi 224 movq 8(%rbp), %rsi
225 #endif 225 #endif
226 subq $MCOUNT_INSN_SIZE, %rdi 226 subq $MCOUNT_INSN_SIZE, %rdi
227 227
228 call *ftrace_trace_function 228 call *ftrace_trace_function
229 229
230 MCOUNT_RESTORE_FRAME 230 MCOUNT_RESTORE_FRAME
231 231
232 jmp ftrace_stub 232 jmp ftrace_stub
233 END(function_hook) 233 END(function_hook)
234 #endif /* CONFIG_DYNAMIC_FTRACE */ 234 #endif /* CONFIG_DYNAMIC_FTRACE */
235 #endif /* CONFIG_FUNCTION_TRACER */ 235 #endif /* CONFIG_FUNCTION_TRACER */
236 236
237 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 237 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
238 ENTRY(ftrace_graph_caller) 238 ENTRY(ftrace_graph_caller)
239 MCOUNT_SAVE_FRAME 239 MCOUNT_SAVE_FRAME
240 240
241 #ifdef CC_USING_FENTRY 241 #ifdef CC_USING_FENTRY
242 leaq SS+16(%rsp), %rdi 242 leaq SS+16(%rsp), %rdi
243 movq $0, %rdx /* No framepointers needed */ 243 movq $0, %rdx /* No framepointers needed */
244 #else 244 #else
245 leaq 8(%rbp), %rdi 245 leaq 8(%rbp), %rdi
246 movq (%rbp), %rdx 246 movq (%rbp), %rdx
247 #endif 247 #endif
248 movq RIP(%rsp), %rsi 248 movq RIP(%rsp), %rsi
249 subq $MCOUNT_INSN_SIZE, %rsi 249 subq $MCOUNT_INSN_SIZE, %rsi
250 250
251 call prepare_ftrace_return 251 call prepare_ftrace_return
252 252
253 MCOUNT_RESTORE_FRAME 253 MCOUNT_RESTORE_FRAME
254 254
255 retq 255 retq
256 END(ftrace_graph_caller) 256 END(ftrace_graph_caller)
257 257
258 GLOBAL(return_to_handler) 258 GLOBAL(return_to_handler)
259 subq $24, %rsp 259 subq $24, %rsp
260 260
261 /* Save the return values */ 261 /* Save the return values */
262 movq %rax, (%rsp) 262 movq %rax, (%rsp)
263 movq %rdx, 8(%rsp) 263 movq %rdx, 8(%rsp)
264 movq %rbp, %rdi 264 movq %rbp, %rdi
265 265
266 call ftrace_return_to_handler 266 call ftrace_return_to_handler
267 267
268 movq %rax, %rdi 268 movq %rax, %rdi
269 movq 8(%rsp), %rdx 269 movq 8(%rsp), %rdx
270 movq (%rsp), %rax 270 movq (%rsp), %rax
271 addq $24, %rsp 271 addq $24, %rsp
272 jmp *%rdi 272 jmp *%rdi
273 #endif 273 #endif
274 274
275 275
276 #ifndef CONFIG_PREEMPT 276 #ifndef CONFIG_PREEMPT
277 #define retint_kernel retint_restore_args 277 #define retint_kernel retint_restore_args
278 #endif 278 #endif
279 279
280 #ifdef CONFIG_PARAVIRT 280 #ifdef CONFIG_PARAVIRT
281 ENTRY(native_usergs_sysret64) 281 ENTRY(native_usergs_sysret64)
282 swapgs 282 swapgs
283 sysretq 283 sysretq
284 ENDPROC(native_usergs_sysret64) 284 ENDPROC(native_usergs_sysret64)
285 #endif /* CONFIG_PARAVIRT */ 285 #endif /* CONFIG_PARAVIRT */
286 286
287 287
288 .macro TRACE_IRQS_IRETQ offset=ARGOFFSET 288 .macro TRACE_IRQS_IRETQ offset=ARGOFFSET
289 #ifdef CONFIG_TRACE_IRQFLAGS 289 #ifdef CONFIG_TRACE_IRQFLAGS
290 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ 290 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
291 jnc 1f 291 jnc 1f
292 TRACE_IRQS_ON 292 TRACE_IRQS_ON
293 1: 293 1:
294 #endif 294 #endif
295 .endm 295 .endm
296 296
297 /* 297 /*
298 * When dynamic function tracer is enabled it will add a breakpoint 298 * When dynamic function tracer is enabled it will add a breakpoint
299 * to all locations that it is about to modify, sync CPUs, update 299 * to all locations that it is about to modify, sync CPUs, update
300 * all the code, sync CPUs, then remove the breakpoints. In this time 300 * all the code, sync CPUs, then remove the breakpoints. In this time
301 * if lockdep is enabled, it might jump back into the debug handler 301 * if lockdep is enabled, it might jump back into the debug handler
302 * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). 302 * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
303 * 303 *
304 * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to 304 * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
305 * make sure the stack pointer does not get reset back to the top 305 * make sure the stack pointer does not get reset back to the top
306 * of the debug stack, and instead just reuses the current stack. 306 * of the debug stack, and instead just reuses the current stack.
307 */ 307 */
308 #if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) 308 #if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
309 309
310 .macro TRACE_IRQS_OFF_DEBUG 310 .macro TRACE_IRQS_OFF_DEBUG
311 call debug_stack_set_zero 311 call debug_stack_set_zero
312 TRACE_IRQS_OFF 312 TRACE_IRQS_OFF
313 call debug_stack_reset 313 call debug_stack_reset
314 .endm 314 .endm
315 315
316 .macro TRACE_IRQS_ON_DEBUG 316 .macro TRACE_IRQS_ON_DEBUG
317 call debug_stack_set_zero 317 call debug_stack_set_zero
318 TRACE_IRQS_ON 318 TRACE_IRQS_ON
319 call debug_stack_reset 319 call debug_stack_reset
320 .endm 320 .endm
321 321
322 .macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET 322 .macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
323 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ 323 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
324 jnc 1f 324 jnc 1f
325 TRACE_IRQS_ON_DEBUG 325 TRACE_IRQS_ON_DEBUG
326 1: 326 1:
327 .endm 327 .endm
328 328
329 #else 329 #else
330 # define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF 330 # define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
331 # define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON 331 # define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
332 # define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ 332 # define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
333 #endif 333 #endif
334 334
335 /* 335 /*
336 * C code is not supposed to know about undefined top of stack. Every time 336 * C code is not supposed to know about undefined top of stack. Every time
337 * a C function with a pt_regs argument is called from the SYSCALL based 337 * a C function with a pt_regs argument is called from the SYSCALL based
338 * fast path FIXUP_TOP_OF_STACK is needed. 338 * fast path FIXUP_TOP_OF_STACK is needed.
339 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs 339 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
340 * manipulation. 340 * manipulation.
341 */ 341 */
342 342
343 /* %rsp:at FRAMEEND */ 343 /* %rsp:at FRAMEEND */
344 .macro FIXUP_TOP_OF_STACK tmp offset=0 344 .macro FIXUP_TOP_OF_STACK tmp offset=0
345 movq PER_CPU_VAR(old_rsp),\tmp 345 movq PER_CPU_VAR(old_rsp),\tmp
346 movq \tmp,RSP+\offset(%rsp) 346 movq \tmp,RSP+\offset(%rsp)
347 movq $__USER_DS,SS+\offset(%rsp) 347 movq $__USER_DS,SS+\offset(%rsp)
348 movq $__USER_CS,CS+\offset(%rsp) 348 movq $__USER_CS,CS+\offset(%rsp)
349 movq $-1,RCX+\offset(%rsp) 349 movq $-1,RCX+\offset(%rsp)
350 movq R11+\offset(%rsp),\tmp /* get eflags */ 350 movq R11+\offset(%rsp),\tmp /* get eflags */
351 movq \tmp,EFLAGS+\offset(%rsp) 351 movq \tmp,EFLAGS+\offset(%rsp)
352 .endm 352 .endm
353 353
354 .macro RESTORE_TOP_OF_STACK tmp offset=0 354 .macro RESTORE_TOP_OF_STACK tmp offset=0
355 movq RSP+\offset(%rsp),\tmp 355 movq RSP+\offset(%rsp),\tmp
356 movq \tmp,PER_CPU_VAR(old_rsp) 356 movq \tmp,PER_CPU_VAR(old_rsp)
357 movq EFLAGS+\offset(%rsp),\tmp 357 movq EFLAGS+\offset(%rsp),\tmp
358 movq \tmp,R11+\offset(%rsp) 358 movq \tmp,R11+\offset(%rsp)
359 .endm 359 .endm
360 360
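Expressed in C, FIXUP_TOP_OF_STACK fills in the pt_regs fields the SYSCALL fast path never saved, pulling the user stack pointer from the per-CPU old_rsp and the user flags from the r11 slot; RESTORE_TOP_OF_STACK moves them back after a ptregs call. A rough sketch, using an illustrative struct and assumed selector values rather than the kernel's real pt_regs:

    /* Illustrative only: field names mirror the RSP/SS/CS/RCX/R11/EFLAGS
     * offsets used above, but this is not the kernel's pt_regs. */
    struct fake_pt_regs {
        unsigned long r11, rcx, eflags, rip, cs, rsp, ss;
    };

    #define USER_DS_SEL 0x2b    /* assumed values, for illustration */
    #define USER_CS_SEL 0x33

    static unsigned long old_rsp;    /* stands in for PER_CPU_VAR(old_rsp) */

    static void fixup_top_of_stack(struct fake_pt_regs *regs)
    {
        regs->rsp    = old_rsp;        /* user %rsp saved at syscall entry */
        regs->ss     = USER_DS_SEL;
        regs->cs     = USER_CS_SEL;
        regs->rcx    = -1UL;           /* fast path does not preserve user %rcx */
        regs->eflags = regs->r11;      /* SYSCALL stashed user rflags in %r11 */
    }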
361 .macro FAKE_STACK_FRAME child_rip 361 .macro FAKE_STACK_FRAME child_rip
362 /* push in order ss, rsp, eflags, cs, rip */ 362 /* push in order ss, rsp, eflags, cs, rip */
363 xorl %eax, %eax 363 xorl %eax, %eax
364 pushq_cfi $__KERNEL_DS /* ss */ 364 pushq_cfi $__KERNEL_DS /* ss */
365 /*CFI_REL_OFFSET ss,0*/ 365 /*CFI_REL_OFFSET ss,0*/
366 pushq_cfi %rax /* rsp */ 366 pushq_cfi %rax /* rsp */
367 CFI_REL_OFFSET rsp,0 367 CFI_REL_OFFSET rsp,0
368 pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */ 368 pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
369 /*CFI_REL_OFFSET rflags,0*/ 369 /*CFI_REL_OFFSET rflags,0*/
370 pushq_cfi $__KERNEL_CS /* cs */ 370 pushq_cfi $__KERNEL_CS /* cs */
371 /*CFI_REL_OFFSET cs,0*/ 371 /*CFI_REL_OFFSET cs,0*/
372 pushq_cfi \child_rip /* rip */ 372 pushq_cfi \child_rip /* rip */
373 CFI_REL_OFFSET rip,0 373 CFI_REL_OFFSET rip,0
374 pushq_cfi %rax /* orig rax */ 374 pushq_cfi %rax /* orig rax */
375 .endm 375 .endm
376 376
377 .macro UNFAKE_STACK_FRAME 377 .macro UNFAKE_STACK_FRAME
378 addq $8*6, %rsp 378 addq $8*6, %rsp
379 CFI_ADJUST_CFA_OFFSET -(6*8) 379 CFI_ADJUST_CFA_OFFSET -(6*8)
380 .endm 380 .endm
381 381
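FAKE_STACK_FRAME lays down the five-word hardware iret frame plus an orig_rax slot, i.e. what the CPU would have pushed for an interrupt taken in kernel mode, and UNFAKE_STACK_FRAME pops those six words again. In struct form, lowest address (last push) first; an illustrative layout only:

    struct fake_iret_frame {
        unsigned long orig_rax;    /* 0, pushed last */
        unsigned long rip;         /* \child_rip */
        unsigned long cs;          /* __KERNEL_CS */
        unsigned long eflags;      /* X86_EFLAGS_IF | X86_EFLAGS_BIT1 */
        unsigned long rsp;         /* 0 */
        unsigned long ss;          /* __KERNEL_DS, pushed first */
    };                             /* 6 * 8 bytes, matching UNFAKE_STACK_FRAME */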
382 /* 382 /*
383 * initial frame state for interrupts (and exceptions without error code) 383 * initial frame state for interrupts (and exceptions without error code)
384 */ 384 */
385 .macro EMPTY_FRAME start=1 offset=0 385 .macro EMPTY_FRAME start=1 offset=0
386 .if \start 386 .if \start
387 CFI_STARTPROC simple 387 CFI_STARTPROC simple
388 CFI_SIGNAL_FRAME 388 CFI_SIGNAL_FRAME
389 CFI_DEF_CFA rsp,8+\offset 389 CFI_DEF_CFA rsp,8+\offset
390 .else 390 .else
391 CFI_DEF_CFA_OFFSET 8+\offset 391 CFI_DEF_CFA_OFFSET 8+\offset
392 .endif 392 .endif
393 .endm 393 .endm
394 394
395 /* 395 /*
396 * initial frame state for interrupts (and exceptions without error code) 396 * initial frame state for interrupts (and exceptions without error code)
397 */ 397 */
398 .macro INTR_FRAME start=1 offset=0 398 .macro INTR_FRAME start=1 offset=0
399 EMPTY_FRAME \start, SS+8+\offset-RIP 399 EMPTY_FRAME \start, SS+8+\offset-RIP
400 /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ 400 /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
401 CFI_REL_OFFSET rsp, RSP+\offset-RIP 401 CFI_REL_OFFSET rsp, RSP+\offset-RIP
402 /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ 402 /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
403 /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ 403 /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
404 CFI_REL_OFFSET rip, RIP+\offset-RIP 404 CFI_REL_OFFSET rip, RIP+\offset-RIP
405 .endm 405 .endm
406 406
407 /* 407 /*
408 * initial frame state for exceptions with error code (and interrupts 408 * initial frame state for exceptions with error code (and interrupts
409 * with vector already pushed) 409 * with vector already pushed)
410 */ 410 */
411 .macro XCPT_FRAME start=1 offset=0 411 .macro XCPT_FRAME start=1 offset=0
412 INTR_FRAME \start, RIP+\offset-ORIG_RAX 412 INTR_FRAME \start, RIP+\offset-ORIG_RAX
413 /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/ 413 /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
414 .endm 414 .endm
415 415
416 /* 416 /*
417 * frame that enables calling into C. 417 * frame that enables calling into C.
418 */ 418 */
419 .macro PARTIAL_FRAME start=1 offset=0 419 .macro PARTIAL_FRAME start=1 offset=0
420 XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET 420 XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
421 CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET 421 CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
422 CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET 422 CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
423 CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET 423 CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
424 CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET 424 CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
425 CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET 425 CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
426 CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET 426 CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
427 CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET 427 CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
428 CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET 428 CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
429 CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET 429 CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
430 .endm 430 .endm
431 431
432 /* 432 /*
433 * frame that enables passing a complete pt_regs to a C function. 433 * frame that enables passing a complete pt_regs to a C function.
434 */ 434 */
435 .macro DEFAULT_FRAME start=1 offset=0 435 .macro DEFAULT_FRAME start=1 offset=0
436 PARTIAL_FRAME \start, R11+\offset-R15 436 PARTIAL_FRAME \start, R11+\offset-R15
437 CFI_REL_OFFSET rbx, RBX+\offset 437 CFI_REL_OFFSET rbx, RBX+\offset
438 CFI_REL_OFFSET rbp, RBP+\offset 438 CFI_REL_OFFSET rbp, RBP+\offset
439 CFI_REL_OFFSET r12, R12+\offset 439 CFI_REL_OFFSET r12, R12+\offset
440 CFI_REL_OFFSET r13, R13+\offset 440 CFI_REL_OFFSET r13, R13+\offset
441 CFI_REL_OFFSET r14, R14+\offset 441 CFI_REL_OFFSET r14, R14+\offset
442 CFI_REL_OFFSET r15, R15+\offset 442 CFI_REL_OFFSET r15, R15+\offset
443 .endm 443 .endm
444 444
445 /* save partial stack frame */ 445 /* save partial stack frame */
446 .macro SAVE_ARGS_IRQ 446 .macro SAVE_ARGS_IRQ
447 cld 447 cld
448 /* start from rbp in pt_regs and jump over */ 448 /* start from rbp in pt_regs and jump over */
449 movq_cfi rdi, (RDI-RBP) 449 movq_cfi rdi, (RDI-RBP)
450 movq_cfi rsi, (RSI-RBP) 450 movq_cfi rsi, (RSI-RBP)
451 movq_cfi rdx, (RDX-RBP) 451 movq_cfi rdx, (RDX-RBP)
452 movq_cfi rcx, (RCX-RBP) 452 movq_cfi rcx, (RCX-RBP)
453 movq_cfi rax, (RAX-RBP) 453 movq_cfi rax, (RAX-RBP)
454 movq_cfi r8, (R8-RBP) 454 movq_cfi r8, (R8-RBP)
455 movq_cfi r9, (R9-RBP) 455 movq_cfi r9, (R9-RBP)
456 movq_cfi r10, (R10-RBP) 456 movq_cfi r10, (R10-RBP)
457 movq_cfi r11, (R11-RBP) 457 movq_cfi r11, (R11-RBP)
458 458
459 /* Save rbp so that we can unwind from get_irq_regs() */ 459 /* Save rbp so that we can unwind from get_irq_regs() */
460 movq_cfi rbp, 0 460 movq_cfi rbp, 0
461 461
462 /* Save previous stack value */ 462 /* Save previous stack value */
463 movq %rsp, %rsi 463 movq %rsp, %rsi
464 464
465 leaq -RBP(%rsp),%rdi /* arg1 for handler */ 465 leaq -RBP(%rsp),%rdi /* arg1 for handler */
466 testl $3, CS-RBP(%rsi) 466 testl $3, CS-RBP(%rsi)
467 je 1f 467 je 1f
468 SWAPGS 468 SWAPGS
469 /* 469 /*
470 * irq_count is used to check if a CPU is already on an interrupt stack 470 * irq_count is used to check if a CPU is already on an interrupt stack
471 * or not. While this is essentially redundant with preempt_count it is 471 * or not. While this is essentially redundant with preempt_count it is
472 * a little cheaper to use a separate counter in the PDA (short of 472 * a little cheaper to use a separate counter in the PDA (short of
473 * moving irq_enter into assembly, which would be too much work) 473 * moving irq_enter into assembly, which would be too much work)
474 */ 474 */
475 1: incl PER_CPU_VAR(irq_count) 475 1: incl PER_CPU_VAR(irq_count)
476 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp 476 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
477 CFI_DEF_CFA_REGISTER rsi 477 CFI_DEF_CFA_REGISTER rsi
478 478
479 /* Store previous stack value */ 479 /* Store previous stack value */
480 pushq %rsi 480 pushq %rsi
481 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ 481 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
482 0x77 /* DW_OP_breg7 */, 0, \ 482 0x77 /* DW_OP_breg7 */, 0, \
483 0x06 /* DW_OP_deref */, \ 483 0x06 /* DW_OP_deref */, \
484 0x08 /* DW_OP_const1u */, SS+8-RBP, \ 484 0x08 /* DW_OP_const1u */, SS+8-RBP, \
485 0x22 /* DW_OP_plus */ 485 0x22 /* DW_OP_plus */
486 /* We entered an interrupt context - irqs are off: */ 486 /* We entered an interrupt context - irqs are off: */
487 TRACE_IRQS_OFF 487 TRACE_IRQS_OFF
488 .endm 488 .endm
489 489
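The incl/cmovzq pair in SAVE_ARGS_IRQ switches to the per-CPU IRQ stack only for the outermost interrupt: irq_count sits at -1 when no interrupt is in progress, so the increment reaches zero (and the conditional move fires) only at the first nesting level. A rough C equivalent, with ordinary globals standing in for the per-CPU variables and pick_irq_stack() an invented helper:

    static long irq_count = -1;     /* per-CPU in the real kernel */
    static void *irq_stack_ptr;     /* top of the per-CPU IRQ stack */

    /* Return the stack pointer the handler should run on. */
    static void *pick_irq_stack(void *current_sp)
    {
        /* Only the outermost interrupt (count goes -1 -> 0) moves onto
         * the dedicated IRQ stack; nested interrupts stay where they are. */
        if (++irq_count == 0)
            return irq_stack_ptr;
        return current_sp;
    }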
490 ENTRY(save_rest) 490 ENTRY(save_rest)
491 PARTIAL_FRAME 1 (REST_SKIP+8) 491 PARTIAL_FRAME 1 (REST_SKIP+8)
492 movq 5*8+16(%rsp), %r11 /* save return address */ 492 movq 5*8+16(%rsp), %r11 /* save return address */
493 movq_cfi rbx, RBX+16 493 movq_cfi rbx, RBX+16
494 movq_cfi rbp, RBP+16 494 movq_cfi rbp, RBP+16
495 movq_cfi r12, R12+16 495 movq_cfi r12, R12+16
496 movq_cfi r13, R13+16 496 movq_cfi r13, R13+16
497 movq_cfi r14, R14+16 497 movq_cfi r14, R14+16
498 movq_cfi r15, R15+16 498 movq_cfi r15, R15+16
499 movq %r11, 8(%rsp) /* return address */ 499 movq %r11, 8(%rsp) /* return address */
500 FIXUP_TOP_OF_STACK %r11, 16 500 FIXUP_TOP_OF_STACK %r11, 16
501 ret 501 ret
502 CFI_ENDPROC 502 CFI_ENDPROC
503 END(save_rest) 503 END(save_rest)
504 504
505 /* save complete stack frame */ 505 /* save complete stack frame */
506 .pushsection .kprobes.text, "ax" 506 .pushsection .kprobes.text, "ax"
507 ENTRY(save_paranoid) 507 ENTRY(save_paranoid)
508 XCPT_FRAME 1 RDI+8 508 XCPT_FRAME 1 RDI+8
509 cld 509 cld
510 movq_cfi rdi, RDI+8 510 movq_cfi rdi, RDI+8
511 movq_cfi rsi, RSI+8 511 movq_cfi rsi, RSI+8
512 movq_cfi rdx, RDX+8 512 movq_cfi rdx, RDX+8
513 movq_cfi rcx, RCX+8 513 movq_cfi rcx, RCX+8
514 movq_cfi rax, RAX+8 514 movq_cfi rax, RAX+8
515 movq_cfi r8, R8+8 515 movq_cfi r8, R8+8
516 movq_cfi r9, R9+8 516 movq_cfi r9, R9+8
517 movq_cfi r10, R10+8 517 movq_cfi r10, R10+8
518 movq_cfi r11, R11+8 518 movq_cfi r11, R11+8
519 movq_cfi rbx, RBX+8 519 movq_cfi rbx, RBX+8
520 movq_cfi rbp, RBP+8 520 movq_cfi rbp, RBP+8
521 movq_cfi r12, R12+8 521 movq_cfi r12, R12+8
522 movq_cfi r13, R13+8 522 movq_cfi r13, R13+8
523 movq_cfi r14, R14+8 523 movq_cfi r14, R14+8
524 movq_cfi r15, R15+8 524 movq_cfi r15, R15+8
525 movl $1,%ebx 525 movl $1,%ebx
526 movl $MSR_GS_BASE,%ecx 526 movl $MSR_GS_BASE,%ecx
527 rdmsr 527 rdmsr
528 testl %edx,%edx 528 testl %edx,%edx
529 js 1f /* negative -> in kernel */ 529 js 1f /* negative -> in kernel */
530 SWAPGS 530 SWAPGS
531 xorl %ebx,%ebx 531 xorl %ebx,%ebx
532 1: ret 532 1: ret
533 CFI_ENDPROC 533 CFI_ENDPROC
534 END(save_paranoid) 534 END(save_paranoid)
535 .popsection 535 .popsection
536 536
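save_paranoid decides whether SWAPGS is still needed by reading MSR_GS_BASE and testing the sign of its upper 32 bits: the kernel's per-CPU GS base is an address in the upper (negative) half of the canonical space, while a user-supplied base is not, and %ebx records which case was found for the later paranoid_exit. A hedged sketch of the same check, with a hypothetical read_gs_base() standing in for rdmsr:

    #include <stdbool.h>
    #include <stdint.h>

    /* Placeholder for rdmsr(MSR_GS_BASE); not a real kernel helper. */
    extern uint64_t read_gs_base(void);

    /*
     * True when GS already points at the kernel's per-CPU area, i.e. bit 63
     * of the base is set (an address in the kernel half), in which case the
     * entry path must not execute SWAPGS again.
     */
    static bool gs_is_kernel(void)
    {
        return (int64_t)read_gs_base() < 0;
    }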
537 /* 537 /*
538 * A newly forked process directly context switches into this address. 538 * A newly forked process directly context switches into this address.
539 * 539 *
540 * rdi: prev task we switched from 540 * rdi: prev task we switched from
541 */ 541 */
542 ENTRY(ret_from_fork) 542 ENTRY(ret_from_fork)
543 DEFAULT_FRAME 543 DEFAULT_FRAME
544 544
545 LOCK ; btr $TIF_FORK,TI_flags(%r8) 545 LOCK ; btr $TIF_FORK,TI_flags(%r8)
546 546
547 pushq_cfi $0x0002 547 pushq_cfi $0x0002
548 popfq_cfi # reset kernel eflags 548 popfq_cfi # reset kernel eflags
549 549
550 call schedule_tail # rdi: 'prev' task parameter 550 call schedule_tail # rdi: 'prev' task parameter
551 551
552 GET_THREAD_INFO(%rcx) 552 GET_THREAD_INFO(%rcx)
553 553
554 RESTORE_REST 554 RESTORE_REST
555 555
556 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 556 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
557 jz 1f 557 jz 1f
558 558
559 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET 559 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
560 jnz int_ret_from_sys_call 560 jnz int_ret_from_sys_call
561 561
562 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET 562 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
563 jmp ret_from_sys_call # go to the SYSRET fastpath 563 jmp ret_from_sys_call # go to the SYSRET fastpath
564 564
565 1: 565 1:
566 subq $REST_SKIP, %rsp # leave space for volatiles 566 subq $REST_SKIP, %rsp # leave space for volatiles
567 CFI_ADJUST_CFA_OFFSET REST_SKIP 567 CFI_ADJUST_CFA_OFFSET REST_SKIP
568 movq %rbp, %rdi 568 movq %rbp, %rdi
569 call *%rbx 569 call *%rbx
570 movl $0, RAX(%rsp) 570 movl $0, RAX(%rsp)
571 RESTORE_REST 571 RESTORE_REST
572 jmp int_ret_from_sys_call 572 jmp int_ret_from_sys_call
573 CFI_ENDPROC 573 CFI_ENDPROC
574 END(ret_from_fork) 574 END(ret_from_fork)
575 575
576 /* 576 /*
577 * System call entry. Up to 6 arguments in registers are supported. 577 * System call entry. Up to 6 arguments in registers are supported.
578 * 578 *
579 * SYSCALL does not save anything on the stack and does not change the 579 * SYSCALL does not save anything on the stack and does not change the
580 * stack pointer. However, it does mask the flags register for us, so 580 * stack pointer. However, it does mask the flags register for us, so
581 * CLD and CLAC are not needed. 581 * CLD and CLAC are not needed.
582 */ 582 */
583 583
584 /* 584 /*
585 * Register setup: 585 * Register setup:
586 * rax system call number 586 * rax system call number
587 * rdi arg0 587 * rdi arg0
588 * rcx return address for syscall/sysret, C arg3 588 * rcx return address for syscall/sysret, C arg3
589 * rsi arg1 589 * rsi arg1
590 * rdx arg2 590 * rdx arg2
591 * r10 arg3 (--> moved to rcx for C) 591 * r10 arg3 (--> moved to rcx for C)
592 * r8 arg4 592 * r8 arg4
593 * r9 arg5 593 * r9 arg5
594 * r11 eflags for syscall/sysret, temporary for C 594 * r11 eflags for syscall/sysret, temporary for C
595 * r12-r15,rbp,rbx saved by C code, not touched. 595 * r12-r15,rbp,rbx saved by C code, not touched.
596 * 596 *
597 * Interrupts are off on entry. 597 * Interrupts are off on entry.
598 * Only called from user space. 598 * Only called from user space.
599 * 599 *
600 * XXX if we had a free scratch register we could save the RSP into the stack frame 600 * XXX if we had a free scratch register we could save the RSP into the stack frame
601 * and report it properly in ps. Unfortunately we don't have one. 601 * and report it properly in ps. Unfortunately we don't have one.
602 * 602 *
603 * When the user can change the frame, always force IRET. That is because 603 * When the user can change the frame, always force IRET. That is because
604 * it deals with non-canonical addresses better. SYSRET has trouble 604 * it deals with non-canonical addresses better. SYSRET has trouble
605 * with them due to bugs in both AMD and Intel CPUs. 605 * with them due to bugs in both AMD and Intel CPUs.
606 */ 606 */
607 607
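Seen from user space, the register setup above is the standard x86-64 SYSCALL convention: the number goes in %rax, arguments in %rdi, %rsi, %rdx, %r10, %r8, %r9, and %rcx/%r11 are clobbered by the instruction itself (which is why arg3 travels in %r10 rather than %rcx). A small illustrative snippet issuing write(2) directly; raw_write() is just an example name, and this is essentially what the libc syscall() wrapper boils down to:

    /* Issue write(fd, buf, len) with a raw SYSCALL on x86-64.
     * %rax = number, %rdi/%rsi/%rdx = args 0..2; the instruction
     * clobbers %rcx (return RIP) and %r11 (saved RFLAGS). */
    static long raw_write(int fd, const void *buf, unsigned long len)
    {
        long ret;

        /* A 4th argument would need: register long a3 asm("r10") = ...;
         * %rcx cannot carry a SYSCALL argument because the instruction
         * overwrites it with the return address. */
        asm volatile("syscall"
                     : "=a"(ret)
                     : "a"(1L /* __NR_write */), "D"((long)fd),
                       "S"(buf), "d"(len)
                     : "rcx", "r11", "memory");
        return ret;
    }

    int main(void)
    {
        return raw_write(1, "hello\n", 6) == 6 ? 0 : 1;
    }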
608 ENTRY(system_call) 608 ENTRY(system_call)
609 CFI_STARTPROC simple 609 CFI_STARTPROC simple
610 CFI_SIGNAL_FRAME 610 CFI_SIGNAL_FRAME
611 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET 611 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
612 CFI_REGISTER rip,rcx 612 CFI_REGISTER rip,rcx
613 /*CFI_REGISTER rflags,r11*/ 613 /*CFI_REGISTER rflags,r11*/
614 SWAPGS_UNSAFE_STACK 614 SWAPGS_UNSAFE_STACK
615 /* 615 /*
616 * A hypervisor implementation might want to use a label 616 * A hypervisor implementation might want to use a label
617 * after the swapgs, so that it can do the swapgs 617 * after the swapgs, so that it can do the swapgs
618 * for the guest and jump here on syscall. 618 * for the guest and jump here on syscall.
619 */ 619 */
620 GLOBAL(system_call_after_swapgs) 620 GLOBAL(system_call_after_swapgs)
621 621
622 movq %rsp,PER_CPU_VAR(old_rsp) 622 movq %rsp,PER_CPU_VAR(old_rsp)
623 movq PER_CPU_VAR(kernel_stack),%rsp 623 movq PER_CPU_VAR(kernel_stack),%rsp
624 /* 624 /*
625 * No need to follow this irqs off/on section - it's straight 625 * No need to follow this irqs off/on section - it's straight
626 * and short: 626 * and short:
627 */ 627 */
628 ENABLE_INTERRUPTS(CLBR_NONE) 628 ENABLE_INTERRUPTS(CLBR_NONE)
629 SAVE_ARGS 8,0 629 SAVE_ARGS 8,0
630 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 630 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
631 movq %rcx,RIP-ARGOFFSET(%rsp) 631 movq %rcx,RIP-ARGOFFSET(%rsp)
632 CFI_REL_OFFSET rip,RIP-ARGOFFSET 632 CFI_REL_OFFSET rip,RIP-ARGOFFSET
633 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 633 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
634 jnz tracesys 634 jnz tracesys
635 system_call_fastpath: 635 system_call_fastpath:
636 #if __SYSCALL_MASK == ~0 636 #if __SYSCALL_MASK == ~0
637 cmpq $__NR_syscall_max,%rax 637 cmpq $__NR_syscall_max,%rax
638 #else 638 #else
639 andl $__SYSCALL_MASK,%eax 639 andl $__SYSCALL_MASK,%eax
640 cmpl $__NR_syscall_max,%eax 640 cmpl $__NR_syscall_max,%eax
641 #endif 641 #endif
642 ja badsys 642 ja badsys
643 movq %r10,%rcx 643 movq %r10,%rcx
644 call *sys_call_table(,%rax,8) # XXX: rip relative 644 call *sys_call_table(,%rax,8) # XXX: rip relative
645 movq %rax,RAX-ARGOFFSET(%rsp) 645 movq %rax,RAX-ARGOFFSET(%rsp)
646 /* 646 /*
647 * Syscall return path ending with SYSRET (fast path) 647 * Syscall return path ending with SYSRET (fast path)
648 * Has incomplete stack frame and undefined top of stack. 648 * Has incomplete stack frame and undefined top of stack.
649 */ 649 */
650 ret_from_sys_call: 650 ret_from_sys_call:
651 movl $_TIF_ALLWORK_MASK,%edi 651 movl $_TIF_ALLWORK_MASK,%edi
652 /* edi: flagmask */ 652 /* edi: flagmask */
653 sysret_check: 653 sysret_check:
654 LOCKDEP_SYS_EXIT 654 LOCKDEP_SYS_EXIT
655 DISABLE_INTERRUPTS(CLBR_NONE) 655 DISABLE_INTERRUPTS(CLBR_NONE)
656 TRACE_IRQS_OFF 656 TRACE_IRQS_OFF
657 movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx 657 movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
658 andl %edi,%edx 658 andl %edi,%edx
659 jnz sysret_careful 659 jnz sysret_careful
660 CFI_REMEMBER_STATE 660 CFI_REMEMBER_STATE
661 /* 661 /*
662 * sysretq will re-enable interrupts: 662 * sysretq will re-enable interrupts:
663 */ 663 */
664 TRACE_IRQS_ON 664 TRACE_IRQS_ON
665 movq RIP-ARGOFFSET(%rsp),%rcx 665 movq RIP-ARGOFFSET(%rsp),%rcx
666 CFI_REGISTER rip,rcx 666 CFI_REGISTER rip,rcx
667 RESTORE_ARGS 1,-ARG_SKIP,0 667 RESTORE_ARGS 1,-ARG_SKIP,0
668 /*CFI_REGISTER rflags,r11*/ 668 /*CFI_REGISTER rflags,r11*/
669 movq PER_CPU_VAR(old_rsp), %rsp 669 movq PER_CPU_VAR(old_rsp), %rsp
670 USERGS_SYSRET64 670 USERGS_SYSRET64
671 671
672 CFI_RESTORE_STATE 672 CFI_RESTORE_STATE
673 /* Handle reschedules */ 673 /* Handle reschedules */
674 /* edx: work, edi: workmask */ 674 /* edx: work, edi: workmask */
675 sysret_careful: 675 sysret_careful:
676 bt $TIF_NEED_RESCHED,%edx 676 bt $TIF_NEED_RESCHED,%edx
677 jnc sysret_signal 677 jnc sysret_signal
678 TRACE_IRQS_ON 678 TRACE_IRQS_ON
679 ENABLE_INTERRUPTS(CLBR_NONE) 679 ENABLE_INTERRUPTS(CLBR_NONE)
680 pushq_cfi %rdi 680 pushq_cfi %rdi
681 SCHEDULE_USER 681 SCHEDULE_USER
682 popq_cfi %rdi 682 popq_cfi %rdi
683 jmp sysret_check 683 jmp sysret_check
684 684
685 /* Handle a signal */ 685 /* Handle a signal */
686 sysret_signal: 686 sysret_signal:
687 TRACE_IRQS_ON 687 TRACE_IRQS_ON
688 ENABLE_INTERRUPTS(CLBR_NONE) 688 ENABLE_INTERRUPTS(CLBR_NONE)
689 #ifdef CONFIG_AUDITSYSCALL 689 #ifdef CONFIG_AUDITSYSCALL
690 bt $TIF_SYSCALL_AUDIT,%edx 690 bt $TIF_SYSCALL_AUDIT,%edx
691 jc sysret_audit 691 jc sysret_audit
692 #endif 692 #endif
693 /* 693 /*
694 * We have a signal, or exit tracing or single-step. 694 * We have a signal, or exit tracing or single-step.
695 * These all wind up with the iret return path anyway, 695 * These all wind up with the iret return path anyway,
696 * so just join that path right now. 696 * so just join that path right now.
697 */ 697 */
698 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET 698 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
699 jmp int_check_syscall_exit_work 699 jmp int_check_syscall_exit_work
700 700
701 badsys: 701 badsys:
702 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 702 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
703 jmp ret_from_sys_call 703 jmp ret_from_sys_call
704 704
705 #ifdef CONFIG_AUDITSYSCALL 705 #ifdef CONFIG_AUDITSYSCALL
706 /* 706 /*
707 * Fast path for syscall audit without full syscall trace. 707 * Fast path for syscall audit without full syscall trace.
708 * We just call __audit_syscall_entry() directly, and then 708 * We just call __audit_syscall_entry() directly, and then
709 * jump back to the normal fast path. 709 * jump back to the normal fast path.
710 */ 710 */
711 auditsys: 711 auditsys:
712 movq %r10,%r9 /* 6th arg: 4th syscall arg */ 712 movq %r10,%r9 /* 6th arg: 4th syscall arg */
713 movq %rdx,%r8 /* 5th arg: 3rd syscall arg */ 713 movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
714 movq %rsi,%rcx /* 4th arg: 2nd syscall arg */ 714 movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
715 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ 715 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
716 movq %rax,%rsi /* 2nd arg: syscall number */ 716 movq %rax,%rsi /* 2nd arg: syscall number */
717 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ 717 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
718 call __audit_syscall_entry 718 call __audit_syscall_entry
719 LOAD_ARGS 0 /* reload call-clobbered registers */ 719 LOAD_ARGS 0 /* reload call-clobbered registers */
720 jmp system_call_fastpath 720 jmp system_call_fastpath
721 721
722 /* 722 /*
723 * Return fast path for syscall audit. Call __audit_syscall_exit() 723 * Return fast path for syscall audit. Call __audit_syscall_exit()
724 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT 724 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
725 * masked off. 725 * masked off.
726 */ 726 */
727 sysret_audit: 727 sysret_audit:
728 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ 728 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
729 cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ 729 cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
730 setbe %al /* 1 if so, 0 if not */ 730 setbe %al /* 1 if so, 0 if not */
731 movzbl %al,%edi /* zero-extend that into %edi */ 731 movzbl %al,%edi /* zero-extend that into %edi */
732 call __audit_syscall_exit 732 call __audit_syscall_exit
733 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi 733 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
734 jmp sysret_check 734 jmp sysret_check
735 #endif /* CONFIG_AUDITSYSCALL */ 735 #endif /* CONFIG_AUDITSYSCALL */
736 736
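sysret_audit derives the 'success' flag that __audit_syscall_exit() wants from the raw return value: the cmpq/setbe pair performs an unsigned comparison against -MAX_ERRNO and the resulting bit is widened into the first argument. Roughly, in C (MAX_ERRNO is 4095 in the kernel headers; syscall_succeeded() is an illustrative name):

    #include <stdbool.h>

    #define MAX_ERRNO 4095

    /* Mirrors the cmpq $-MAX_ERRNO / setbe (unsigned below-or-equal) test
     * above: large unsigned values corresponding to small negative errno
     * codes yield false, everything else yields true. */
    static bool syscall_succeeded(long ret)
    {
        return (unsigned long)ret <= (unsigned long)-MAX_ERRNO;
    }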
737 /* Do syscall tracing */ 737 /* Do syscall tracing */
738 tracesys: 738 tracesys:
739 #ifdef CONFIG_AUDITSYSCALL 739 #ifdef CONFIG_AUDITSYSCALL
740 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 740 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
741 jz auditsys 741 jz auditsys
742 #endif 742 #endif
743 SAVE_REST 743 SAVE_REST
744 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 744 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
745 FIXUP_TOP_OF_STACK %rdi 745 FIXUP_TOP_OF_STACK %rdi
746 movq %rsp,%rdi 746 movq %rsp,%rdi
747 call syscall_trace_enter 747 call syscall_trace_enter
748 /* 748 /*
749 * Reload arg registers from stack in case ptrace changed them. 749 * Reload arg registers from stack in case ptrace changed them.
750 * We don't reload %rax because syscall_trace_enter() returned 750 * We don't reload %rax because syscall_trace_enter() returned
751 * the value it wants us to use in the table lookup. 751 * the value it wants us to use in the table lookup.
752 */ 752 */
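The reload is needed because a ptrace-based tracer can rewrite any of the saved registers, including the syscall number, while the task is stopped at syscall entry. A minimal user-space illustration of that ability, assuming the tracee is already attached and stopped at a syscall-entry stop (rewrite_syscall() is just an example helper):

    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/user.h>

    /* Rewrite the pending syscall's number and first argument while the
     * tracee is stopped at syscall entry; the kernel's tracesys path then
     * reloads the (possibly modified) values from pt_regs. */
    static long rewrite_syscall(pid_t pid, long new_nr, long new_arg0)
    {
        struct user_regs_struct regs;

        if (ptrace(PTRACE_GETREGS, pid, NULL, &regs) == -1)
            return -1;
        regs.orig_rax = new_nr;      /* number used for the table lookup */
        regs.rdi      = new_arg0;    /* first argument seen by the handler */
        return ptrace(PTRACE_SETREGS, pid, NULL, &regs);
    }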
753 LOAD_ARGS ARGOFFSET, 1 753 LOAD_ARGS ARGOFFSET, 1
754 RESTORE_REST 754 RESTORE_REST
755 #if __SYSCALL_MASK == ~0 755 #if __SYSCALL_MASK == ~0
756 cmpq $__NR_syscall_max,%rax 756 cmpq $__NR_syscall_max,%rax
757 #else 757 #else
758 andl $__SYSCALL_MASK,%eax 758 andl $__SYSCALL_MASK,%eax
759 cmpl $__NR_syscall_max,%eax 759 cmpl $__NR_syscall_max,%eax
760 #endif 760 #endif
761 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ 761 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
762 movq %r10,%rcx /* fixup for C */ 762 movq %r10,%rcx /* fixup for C */
763 call *sys_call_table(,%rax,8) 763 call *sys_call_table(,%rax,8)
764 movq %rax,RAX-ARGOFFSET(%rsp) 764 movq %rax,RAX-ARGOFFSET(%rsp)
765 /* Use IRET because user could have changed frame */ 765 /* Use IRET because user could have changed frame */
766 766
767 /* 767 /*
768 * Syscall return path ending with IRET. 768 * Syscall return path ending with IRET.
769 * Has correct top of stack, but partial stack frame. 769 * Has correct top of stack, but partial stack frame.
770 */ 770 */
771 GLOBAL(int_ret_from_sys_call) 771 GLOBAL(int_ret_from_sys_call)
772 DISABLE_INTERRUPTS(CLBR_NONE) 772 DISABLE_INTERRUPTS(CLBR_NONE)
773 TRACE_IRQS_OFF 773 TRACE_IRQS_OFF
774 movl $_TIF_ALLWORK_MASK,%edi 774 movl $_TIF_ALLWORK_MASK,%edi
775 /* edi: mask to check */ 775 /* edi: mask to check */
776 GLOBAL(int_with_check) 776 GLOBAL(int_with_check)
777 LOCKDEP_SYS_EXIT_IRQ 777 LOCKDEP_SYS_EXIT_IRQ
778 GET_THREAD_INFO(%rcx) 778 GET_THREAD_INFO(%rcx)
779 movl TI_flags(%rcx),%edx 779 movl TI_flags(%rcx),%edx
780 andl %edi,%edx 780 andl %edi,%edx
781 jnz int_careful 781 jnz int_careful
782 andl $~TS_COMPAT,TI_status(%rcx) 782 andl $~TS_COMPAT,TI_status(%rcx)
783 jmp retint_swapgs 783 jmp retint_swapgs
784 784
785 /* Either reschedule or signal or syscall exit tracking needed. */ 785 /* Either reschedule or signal or syscall exit tracking needed. */
786 /* First do a reschedule test. */ 786 /* First do a reschedule test. */
787 /* edx: work, edi: workmask */ 787 /* edx: work, edi: workmask */
788 int_careful: 788 int_careful:
789 bt $TIF_NEED_RESCHED,%edx 789 bt $TIF_NEED_RESCHED,%edx
790 jnc int_very_careful 790 jnc int_very_careful
791 TRACE_IRQS_ON 791 TRACE_IRQS_ON
792 ENABLE_INTERRUPTS(CLBR_NONE) 792 ENABLE_INTERRUPTS(CLBR_NONE)
793 pushq_cfi %rdi 793 pushq_cfi %rdi
794 SCHEDULE_USER 794 SCHEDULE_USER
795 popq_cfi %rdi 795 popq_cfi %rdi
796 DISABLE_INTERRUPTS(CLBR_NONE) 796 DISABLE_INTERRUPTS(CLBR_NONE)
797 TRACE_IRQS_OFF 797 TRACE_IRQS_OFF
798 jmp int_with_check 798 jmp int_with_check
799 799
800 /* handle signals and tracing -- both require a full stack frame */ 800 /* handle signals and tracing -- both require a full stack frame */
801 int_very_careful: 801 int_very_careful:
802 TRACE_IRQS_ON 802 TRACE_IRQS_ON
803 ENABLE_INTERRUPTS(CLBR_NONE) 803 ENABLE_INTERRUPTS(CLBR_NONE)
804 int_check_syscall_exit_work: 804 int_check_syscall_exit_work:
805 SAVE_REST 805 SAVE_REST
806 /* Check for syscall exit trace */ 806 /* Check for syscall exit trace */
807 testl $_TIF_WORK_SYSCALL_EXIT,%edx 807 testl $_TIF_WORK_SYSCALL_EXIT,%edx
808 jz int_signal 808 jz int_signal
809 pushq_cfi %rdi 809 pushq_cfi %rdi
810 leaq 8(%rsp),%rdi # &ptregs -> arg1 810 leaq 8(%rsp),%rdi # &ptregs -> arg1
811 call syscall_trace_leave 811 call syscall_trace_leave
812 popq_cfi %rdi 812 popq_cfi %rdi
813 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi 813 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
814 jmp int_restore_rest 814 jmp int_restore_rest
815 815
816 int_signal: 816 int_signal:
817 testl $_TIF_DO_NOTIFY_MASK,%edx 817 testl $_TIF_DO_NOTIFY_MASK,%edx
818 jz 1f 818 jz 1f
819 movq %rsp,%rdi # &ptregs -> arg1 819 movq %rsp,%rdi # &ptregs -> arg1
820 xorl %esi,%esi # oldset -> arg2 820 xorl %esi,%esi # oldset -> arg2
821 call do_notify_resume 821 call do_notify_resume
822 1: movl $_TIF_WORK_MASK,%edi 822 1: movl $_TIF_WORK_MASK,%edi
823 int_restore_rest: 823 int_restore_rest:
824 RESTORE_REST 824 RESTORE_REST
825 DISABLE_INTERRUPTS(CLBR_NONE) 825 DISABLE_INTERRUPTS(CLBR_NONE)
826 TRACE_IRQS_OFF 826 TRACE_IRQS_OFF
827 jmp int_with_check 827 jmp int_with_check
828 CFI_ENDPROC 828 CFI_ENDPROC
829 END(system_call) 829 END(system_call)
830 830
831 /* 831 /*
832 * Certain special system calls need to save a complete pt_regs stack frame. 832 * Certain special system calls need to save a complete pt_regs stack frame.
833 */ 833 */
834 .macro PTREGSCALL label,func,arg 834 .macro PTREGSCALL label,func,arg
835 ENTRY(\label) 835 ENTRY(\label)
836 PARTIAL_FRAME 1 8 /* offset 8: return address */ 836 PARTIAL_FRAME 1 8 /* offset 8: return address */
837 subq $REST_SKIP, %rsp 837 subq $REST_SKIP, %rsp
838 CFI_ADJUST_CFA_OFFSET REST_SKIP 838 CFI_ADJUST_CFA_OFFSET REST_SKIP
839 call save_rest 839 call save_rest
840 DEFAULT_FRAME 0 8 /* offset 8: return address */ 840 DEFAULT_FRAME 0 8 /* offset 8: return address */
841 leaq 8(%rsp), \arg /* pt_regs pointer */ 841 leaq 8(%rsp), \arg /* pt_regs pointer */
842 call \func 842 call \func
843 jmp ptregscall_common 843 jmp ptregscall_common
844 CFI_ENDPROC 844 CFI_ENDPROC
845 END(\label) 845 END(\label)
846 .endm 846 .endm
847 847
848 .macro FORK_LIKE func 848 .macro FORK_LIKE func
849 ENTRY(stub_\func) 849 ENTRY(stub_\func)
850 CFI_STARTPROC 850 CFI_STARTPROC
851 popq %r11 /* save return address */ 851 popq %r11 /* save return address */
852 PARTIAL_FRAME 0 852 PARTIAL_FRAME 0
853 SAVE_REST 853 SAVE_REST
854 pushq %r11 /* put it back on stack */ 854 pushq %r11 /* put it back on stack */
855 FIXUP_TOP_OF_STACK %r11, 8 855 FIXUP_TOP_OF_STACK %r11, 8
856 DEFAULT_FRAME 0 8 /* offset 8: return address */ 856 DEFAULT_FRAME 0 8 /* offset 8: return address */
857 call sys_\func 857 call sys_\func
858 RESTORE_TOP_OF_STACK %r11, 8 858 RESTORE_TOP_OF_STACK %r11, 8
859 ret $REST_SKIP /* pop extended registers */ 859 ret $REST_SKIP /* pop extended registers */
860 CFI_ENDPROC 860 CFI_ENDPROC
861 END(stub_\func) 861 END(stub_\func)
862 .endm 862 .endm
863 863
864 FORK_LIKE clone 864 FORK_LIKE clone
865 FORK_LIKE fork 865 FORK_LIKE fork
866 FORK_LIKE vfork 866 FORK_LIKE vfork
867 PTREGSCALL stub_iopl, sys_iopl, %rsi 867 PTREGSCALL stub_iopl, sys_iopl, %rsi
868 868
869 ENTRY(ptregscall_common) 869 ENTRY(ptregscall_common)
870 DEFAULT_FRAME 1 8 /* offset 8: return address */ 870 DEFAULT_FRAME 1 8 /* offset 8: return address */
871 RESTORE_TOP_OF_STACK %r11, 8 871 RESTORE_TOP_OF_STACK %r11, 8
872 movq_cfi_restore R15+8, r15 872 movq_cfi_restore R15+8, r15
873 movq_cfi_restore R14+8, r14 873 movq_cfi_restore R14+8, r14
874 movq_cfi_restore R13+8, r13 874 movq_cfi_restore R13+8, r13
875 movq_cfi_restore R12+8, r12 875 movq_cfi_restore R12+8, r12
876 movq_cfi_restore RBP+8, rbp 876 movq_cfi_restore RBP+8, rbp
877 movq_cfi_restore RBX+8, rbx 877 movq_cfi_restore RBX+8, rbx
878 ret $REST_SKIP /* pop extended registers */ 878 ret $REST_SKIP /* pop extended registers */
879 CFI_ENDPROC 879 CFI_ENDPROC
880 END(ptregscall_common) 880 END(ptregscall_common)
881 881
882 ENTRY(stub_execve) 882 ENTRY(stub_execve)
883 CFI_STARTPROC 883 CFI_STARTPROC
884 addq $8, %rsp 884 addq $8, %rsp
885 PARTIAL_FRAME 0 885 PARTIAL_FRAME 0
886 SAVE_REST 886 SAVE_REST
887 FIXUP_TOP_OF_STACK %r11 887 FIXUP_TOP_OF_STACK %r11
888 call sys_execve 888 call sys_execve
889 RESTORE_TOP_OF_STACK %r11 889 RESTORE_TOP_OF_STACK %r11
890 movq %rax,RAX(%rsp) 890 movq %rax,RAX(%rsp)
891 RESTORE_REST 891 RESTORE_REST
892 jmp int_ret_from_sys_call 892 jmp int_ret_from_sys_call
893 CFI_ENDPROC 893 CFI_ENDPROC
894 END(stub_execve) 894 END(stub_execve)
895 895
896 /* 896 /*
897 * sigreturn is special because it needs to restore all registers on return. 897 * sigreturn is special because it needs to restore all registers on return.
898 * This cannot be done with SYSRET, so use the IRET return path instead. 898 * This cannot be done with SYSRET, so use the IRET return path instead.
899 */ 899 */
900 ENTRY(stub_rt_sigreturn) 900 ENTRY(stub_rt_sigreturn)
901 CFI_STARTPROC 901 CFI_STARTPROC
902 addq $8, %rsp 902 addq $8, %rsp
903 PARTIAL_FRAME 0 903 PARTIAL_FRAME 0
904 SAVE_REST 904 SAVE_REST
905 movq %rsp,%rdi 905 movq %rsp,%rdi
906 FIXUP_TOP_OF_STACK %r11 906 FIXUP_TOP_OF_STACK %r11
907 call sys_rt_sigreturn 907 call sys_rt_sigreturn
908 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer 908 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
909 RESTORE_REST 909 RESTORE_REST
910 jmp int_ret_from_sys_call 910 jmp int_ret_from_sys_call
911 CFI_ENDPROC 911 CFI_ENDPROC
912 END(stub_rt_sigreturn) 912 END(stub_rt_sigreturn)
913 913
914 #ifdef CONFIG_X86_X32_ABI 914 #ifdef CONFIG_X86_X32_ABI
915 ENTRY(stub_x32_rt_sigreturn) 915 ENTRY(stub_x32_rt_sigreturn)
916 CFI_STARTPROC 916 CFI_STARTPROC
917 addq $8, %rsp 917 addq $8, %rsp
918 PARTIAL_FRAME 0 918 PARTIAL_FRAME 0
919 SAVE_REST 919 SAVE_REST
920 movq %rsp,%rdi 920 movq %rsp,%rdi
921 FIXUP_TOP_OF_STACK %r11 921 FIXUP_TOP_OF_STACK %r11
922 call sys32_x32_rt_sigreturn 922 call sys32_x32_rt_sigreturn
923 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer 923 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
924 RESTORE_REST 924 RESTORE_REST
925 jmp int_ret_from_sys_call 925 jmp int_ret_from_sys_call
926 CFI_ENDPROC 926 CFI_ENDPROC
927 END(stub_x32_rt_sigreturn) 927 END(stub_x32_rt_sigreturn)
928 928
929 ENTRY(stub_x32_execve) 929 ENTRY(stub_x32_execve)
930 CFI_STARTPROC 930 CFI_STARTPROC
931 addq $8, %rsp 931 addq $8, %rsp
932 PARTIAL_FRAME 0 932 PARTIAL_FRAME 0
933 SAVE_REST 933 SAVE_REST
934 FIXUP_TOP_OF_STACK %r11 934 FIXUP_TOP_OF_STACK %r11
935 call compat_sys_execve 935 call compat_sys_execve
936 RESTORE_TOP_OF_STACK %r11 936 RESTORE_TOP_OF_STACK %r11
937 movq %rax,RAX(%rsp) 937 movq %rax,RAX(%rsp)
938 RESTORE_REST 938 RESTORE_REST
939 jmp int_ret_from_sys_call 939 jmp int_ret_from_sys_call
940 CFI_ENDPROC 940 CFI_ENDPROC
941 END(stub_x32_execve) 941 END(stub_x32_execve)
942 942
943 #endif 943 #endif
944 944
945 /* 945 /*
946 * Build the entry stubs and pointer table with some assembler magic. 946 * Build the entry stubs and pointer table with some assembler magic.
947 * We pack 7 stubs into a single 32-byte chunk, which will fit in a 947 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
948 * single cache line on all modern x86 implementations. 948 * single cache line on all modern x86 implementations.
949 */ 949 */
950 .section .init.rodata,"a" 950 .section .init.rodata,"a"
951 ENTRY(interrupt) 951 ENTRY(interrupt)
952 .section .entry.text 952 .section .entry.text
953 .p2align 5 953 .p2align 5
954 .p2align CONFIG_X86_L1_CACHE_SHIFT 954 .p2align CONFIG_X86_L1_CACHE_SHIFT
955 ENTRY(irq_entries_start) 955 ENTRY(irq_entries_start)
956 INTR_FRAME 956 INTR_FRAME
957 vector=FIRST_EXTERNAL_VECTOR 957 vector=FIRST_EXTERNAL_VECTOR
958 .rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 958 .rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
959 .balign 32 959 .balign 32
960 .rept 7 960 .rept 7
961 .if vector < NR_VECTORS 961 .if vector < NR_VECTORS
962 .if vector <> FIRST_EXTERNAL_VECTOR 962 .if vector <> FIRST_EXTERNAL_VECTOR
963 CFI_ADJUST_CFA_OFFSET -8 963 CFI_ADJUST_CFA_OFFSET -8
964 .endif 964 .endif
965 1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ 965 1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
966 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 966 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
967 jmp 2f 967 jmp 2f
968 .endif 968 .endif
969 .previous 969 .previous
970 .quad 1b 970 .quad 1b
971 .section .entry.text 971 .section .entry.text
972 vector=vector+1 972 vector=vector+1
973 .endif 973 .endif
974 .endr 974 .endr
975 2: jmp common_interrupt 975 2: jmp common_interrupt
976 .endr 976 .endr
977 CFI_ENDPROC 977 CFI_ENDPROC
978 END(irq_entries_start) 978 END(irq_entries_start)
979 979
980 .previous 980 .previous
981 END(interrupt) 981 END(interrupt)
982 .previous 982 .previous
983 983
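The generator above emits seven stubs per 32-byte chunk, each a one-byte-immediate pushq of ~vector + 0x80 followed by a shared jmp to common_interrupt; the +0x80 bias is what keeps every immediate inside the signed-byte range. The arithmetic can be checked with a short C program (constant values as defined in irq_vectors.h):

    #include <stdio.h>

    #define NR_VECTORS              256
    #define FIRST_EXTERNAL_VECTOR   0x20

    int main(void)
    {
        int chunks = (NR_VECTORS - FIRST_EXTERNAL_VECTOR + 6) / 7;
        int vector;

        printf("%d chunks of up to 7 stubs each\n", chunks);

        for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
            /* What the stub pushes: always fits in a signed byte. */
            int pushed = (signed char)(~vector + 0x80);
            /* common_interrupt's "addq $-0x80" turns that back into
             * ~vector, a value in [-256, -1]; do_IRQ inverts it again. */
            int recovered = ~(pushed - 0x80) & 0xff;

            if (pushed < -128 || pushed > 127 || recovered != vector)
                printf("encoding broken for vector %d\n", vector);
        }
        return 0;
    }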
984 /* 984 /*
985 * Interrupt entry/exit. 985 * Interrupt entry/exit.
986 * 986 *
987 * Interrupt entry points save only callee-clobbered registers in the fast path. 987 * Interrupt entry points save only callee-clobbered registers in the fast path.
988 * 988 *
989 * Entry runs with interrupts off. 989 * Entry runs with interrupts off.
990 */ 990 */
991 991
992 /* 0(%rsp): ~(interrupt number) */ 992 /* 0(%rsp): ~(interrupt number) */
993 .macro interrupt func 993 .macro interrupt func
994 /* reserve pt_regs for scratch regs and rbp */ 994 /* reserve pt_regs for scratch regs and rbp */
995 subq $ORIG_RAX-RBP, %rsp 995 subq $ORIG_RAX-RBP, %rsp
996 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP 996 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
997 SAVE_ARGS_IRQ 997 SAVE_ARGS_IRQ
998 call \func 998 call \func
999 .endm 999 .endm
1000 1000
1001 /* 1001 /*
1002 * Interrupt entry/exit should be protected against kprobes 1002 * Interrupt entry/exit should be protected against kprobes
1003 */ 1003 */
1004 .pushsection .kprobes.text, "ax" 1004 .pushsection .kprobes.text, "ax"
1005 /* 1005 /*
1006 * The interrupt stubs push (~vector+0x80) onto the stack and 1006 * The interrupt stubs push (~vector+0x80) onto the stack and
1007 * then jump to common_interrupt. 1007 * then jump to common_interrupt.
1008 */ 1008 */
1009 .p2align CONFIG_X86_L1_CACHE_SHIFT 1009 .p2align CONFIG_X86_L1_CACHE_SHIFT
1010 common_interrupt: 1010 common_interrupt:
1011 XCPT_FRAME 1011 XCPT_FRAME
1012 ASM_CLAC 1012 ASM_CLAC
1013 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 1013 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
1014 interrupt do_IRQ 1014 interrupt do_IRQ
1015 /* 0(%rsp): old_rsp-ARGOFFSET */ 1015 /* 0(%rsp): old_rsp-ARGOFFSET */
1016 ret_from_intr: 1016 ret_from_intr:
1017 DISABLE_INTERRUPTS(CLBR_NONE) 1017 DISABLE_INTERRUPTS(CLBR_NONE)
1018 TRACE_IRQS_OFF 1018 TRACE_IRQS_OFF
1019 decl PER_CPU_VAR(irq_count) 1019 decl PER_CPU_VAR(irq_count)
1020 1020
1021 /* Restore saved previous stack */ 1021 /* Restore saved previous stack */
1022 popq %rsi 1022 popq %rsi
1023 CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ 1023 CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
1024 leaq ARGOFFSET-RBP(%rsi), %rsp 1024 leaq ARGOFFSET-RBP(%rsi), %rsp
1025 CFI_DEF_CFA_REGISTER rsp 1025 CFI_DEF_CFA_REGISTER rsp
1026 CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET 1026 CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
1027 1027
1028 exit_intr: 1028 exit_intr:
1029 GET_THREAD_INFO(%rcx) 1029 GET_THREAD_INFO(%rcx)
1030 testl $3,CS-ARGOFFSET(%rsp) 1030 testl $3,CS-ARGOFFSET(%rsp)
1031 je retint_kernel 1031 je retint_kernel
1032 1032
1033 /* Interrupt came from user space */ 1033 /* Interrupt came from user space */
1034 /* 1034 /*
1035 * Has a correct top of stack, but a partial stack frame 1035 * Has a correct top of stack, but a partial stack frame
1036 * %rcx: thread info. Interrupts off. 1036 * %rcx: thread info. Interrupts off.
1037 */ 1037 */
1038 retint_with_reschedule: 1038 retint_with_reschedule:
1039 movl $_TIF_WORK_MASK,%edi 1039 movl $_TIF_WORK_MASK,%edi
1040 retint_check: 1040 retint_check:
1041 LOCKDEP_SYS_EXIT_IRQ 1041 LOCKDEP_SYS_EXIT_IRQ
1042 movl TI_flags(%rcx),%edx 1042 movl TI_flags(%rcx),%edx
1043 andl %edi,%edx 1043 andl %edi,%edx
1044 CFI_REMEMBER_STATE 1044 CFI_REMEMBER_STATE
1045 jnz retint_careful 1045 jnz retint_careful
1046 1046
1047 retint_swapgs: /* return to user-space */ 1047 retint_swapgs: /* return to user-space */
1048 /* 1048 /*
1049 * The iretq could re-enable interrupts: 1049 * The iretq could re-enable interrupts:
1050 */ 1050 */
1051 DISABLE_INTERRUPTS(CLBR_ANY) 1051 DISABLE_INTERRUPTS(CLBR_ANY)
1052 TRACE_IRQS_IRETQ 1052 TRACE_IRQS_IRETQ
1053 SWAPGS 1053 SWAPGS
1054 jmp restore_args 1054 jmp restore_args
1055 1055
1056 retint_restore_args: /* return to kernel space */ 1056 retint_restore_args: /* return to kernel space */
1057 DISABLE_INTERRUPTS(CLBR_ANY) 1057 DISABLE_INTERRUPTS(CLBR_ANY)
1058 /* 1058 /*
1059 * The iretq could re-enable interrupts: 1059 * The iretq could re-enable interrupts:
1060 */ 1060 */
1061 TRACE_IRQS_IRETQ 1061 TRACE_IRQS_IRETQ
1062 restore_args: 1062 restore_args:
1063 RESTORE_ARGS 1,8,1 1063 RESTORE_ARGS 1,8,1
1064 1064
1065 irq_return: 1065 irq_return:
1066 INTERRUPT_RETURN 1066 INTERRUPT_RETURN
1067 _ASM_EXTABLE(irq_return, bad_iret) 1067 _ASM_EXTABLE(irq_return, bad_iret)
1068 1068
1069 #ifdef CONFIG_PARAVIRT 1069 #ifdef CONFIG_PARAVIRT
1070 ENTRY(native_iret) 1070 ENTRY(native_iret)
1071 iretq 1071 iretq
1072 _ASM_EXTABLE(native_iret, bad_iret) 1072 _ASM_EXTABLE(native_iret, bad_iret)
1073 #endif 1073 #endif
1074 1074
1075 .section .fixup,"ax" 1075 .section .fixup,"ax"
1076 bad_iret: 1076 bad_iret:
1077 /* 1077 /*
1078 * The iret traps when the %cs or %ss being restored is bogus. 1078 * The iret traps when the %cs or %ss being restored is bogus.
1079 * We've lost the original trap vector and error code. 1079 * We've lost the original trap vector and error code.
1080 * #GPF is the most likely one to get for an invalid selector. 1080 * #GPF is the most likely one to get for an invalid selector.
1081 * So pretend we completed the iret and took the #GPF in user mode. 1081 * So pretend we completed the iret and took the #GPF in user mode.
1082 * 1082 *
1083 * We are now running with the kernel GS after exception recovery. 1083 * We are now running with the kernel GS after exception recovery.
1084 * But error_entry expects us to have user GS to match the user %cs, 1084 * But error_entry expects us to have user GS to match the user %cs,
1085 * so swap back. 1085 * so swap back.
1086 */ 1086 */
1087 pushq $0 1087 pushq $0
1088 1088
1089 SWAPGS 1089 SWAPGS
1090 jmp general_protection 1090 jmp general_protection
1091 1091
1092 .previous 1092 .previous
1093 1093
1094 /* edi: workmask, edx: work */ 1094 /* edi: workmask, edx: work */
1095 retint_careful: 1095 retint_careful:
1096 CFI_RESTORE_STATE 1096 CFI_RESTORE_STATE
1097 bt $TIF_NEED_RESCHED,%edx 1097 bt $TIF_NEED_RESCHED,%edx
1098 jnc retint_signal 1098 jnc retint_signal
1099 TRACE_IRQS_ON 1099 TRACE_IRQS_ON
1100 ENABLE_INTERRUPTS(CLBR_NONE) 1100 ENABLE_INTERRUPTS(CLBR_NONE)
1101 pushq_cfi %rdi 1101 pushq_cfi %rdi
1102 SCHEDULE_USER 1102 SCHEDULE_USER
1103 popq_cfi %rdi 1103 popq_cfi %rdi
1104 GET_THREAD_INFO(%rcx) 1104 GET_THREAD_INFO(%rcx)
1105 DISABLE_INTERRUPTS(CLBR_NONE) 1105 DISABLE_INTERRUPTS(CLBR_NONE)
1106 TRACE_IRQS_OFF 1106 TRACE_IRQS_OFF
1107 jmp retint_check 1107 jmp retint_check
1108 1108
1109 retint_signal: 1109 retint_signal:
1110 testl $_TIF_DO_NOTIFY_MASK,%edx 1110 testl $_TIF_DO_NOTIFY_MASK,%edx
1111 jz retint_swapgs 1111 jz retint_swapgs
1112 TRACE_IRQS_ON 1112 TRACE_IRQS_ON
1113 ENABLE_INTERRUPTS(CLBR_NONE) 1113 ENABLE_INTERRUPTS(CLBR_NONE)
1114 SAVE_REST 1114 SAVE_REST
1115 movq $-1,ORIG_RAX(%rsp) 1115 movq $-1,ORIG_RAX(%rsp)
1116 xorl %esi,%esi # oldset 1116 xorl %esi,%esi # oldset
1117 movq %rsp,%rdi # &pt_regs 1117 movq %rsp,%rdi # &pt_regs
1118 call do_notify_resume 1118 call do_notify_resume
1119 RESTORE_REST 1119 RESTORE_REST
1120 DISABLE_INTERRUPTS(CLBR_NONE) 1120 DISABLE_INTERRUPTS(CLBR_NONE)
1121 TRACE_IRQS_OFF 1121 TRACE_IRQS_OFF
1122 GET_THREAD_INFO(%rcx) 1122 GET_THREAD_INFO(%rcx)
1123 jmp retint_with_reschedule 1123 jmp retint_with_reschedule
1124 1124
1125 #ifdef CONFIG_PREEMPT 1125 #ifdef CONFIG_PREEMPT
1126 /* Returning to kernel space. Check if we need preemption */ 1126 /* Returning to kernel space. Check if we need preemption */
1127 /* rcx: threadinfo. interrupts off. */ 1127 /* rcx: threadinfo. interrupts off. */
1128 ENTRY(retint_kernel) 1128 ENTRY(retint_kernel)
1129 cmpl $0,TI_preempt_count(%rcx) 1129 cmpl $0,TI_preempt_count(%rcx)
1130 jnz retint_restore_args 1130 jnz retint_restore_args
1131 bt $TIF_NEED_RESCHED,TI_flags(%rcx) 1131 bt $TIF_NEED_RESCHED,TI_flags(%rcx)
1132 jnc retint_restore_args 1132 jnc retint_restore_args
1133 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ 1133 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
1134 jnc retint_restore_args 1134 jnc retint_restore_args
1135 call preempt_schedule_irq 1135 call preempt_schedule_irq
1136 jmp exit_intr 1136 jmp exit_intr
1137 #endif 1137 #endif
1138 1138
1139 CFI_ENDPROC 1139 CFI_ENDPROC
1140 END(common_interrupt) 1140 END(common_interrupt)
1141 /* 1141 /*
1142 * End of kprobes section 1142 * End of kprobes section
1143 */ 1143 */
1144 .popsection 1144 .popsection
1145 1145
1146 /* 1146 /*
1147 * APIC interrupts. 1147 * APIC interrupts.
1148 */ 1148 */
1149 .macro apicinterrupt num sym do_sym 1149 .macro apicinterrupt num sym do_sym
1150 ENTRY(\sym) 1150 ENTRY(\sym)
1151 INTR_FRAME 1151 INTR_FRAME
1152 ASM_CLAC 1152 ASM_CLAC
1153 pushq_cfi $~(\num) 1153 pushq_cfi $~(\num)
1154 .Lcommon_\sym: 1154 .Lcommon_\sym:
1155 interrupt \do_sym 1155 interrupt \do_sym
1156 jmp ret_from_intr 1156 jmp ret_from_intr
1157 CFI_ENDPROC 1157 CFI_ENDPROC
1158 END(\sym) 1158 END(\sym)
1159 .endm 1159 .endm
1160 1160
1161 #ifdef CONFIG_SMP 1161 #ifdef CONFIG_SMP
1162 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ 1162 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
1163 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt 1163 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
1164 apicinterrupt REBOOT_VECTOR \ 1164 apicinterrupt REBOOT_VECTOR \
1165 reboot_interrupt smp_reboot_interrupt 1165 reboot_interrupt smp_reboot_interrupt
1166 #endif 1166 #endif
1167 1167
1168 #ifdef CONFIG_X86_UV 1168 #ifdef CONFIG_X86_UV
1169 apicinterrupt UV_BAU_MESSAGE \ 1169 apicinterrupt UV_BAU_MESSAGE \
1170 uv_bau_message_intr1 uv_bau_message_interrupt 1170 uv_bau_message_intr1 uv_bau_message_interrupt
1171 #endif 1171 #endif
1172 apicinterrupt LOCAL_TIMER_VECTOR \ 1172 apicinterrupt LOCAL_TIMER_VECTOR \
1173 apic_timer_interrupt smp_apic_timer_interrupt 1173 apic_timer_interrupt smp_apic_timer_interrupt
1174 apicinterrupt X86_PLATFORM_IPI_VECTOR \ 1174 apicinterrupt X86_PLATFORM_IPI_VECTOR \
1175 x86_platform_ipi smp_x86_platform_ipi 1175 x86_platform_ipi smp_x86_platform_ipi
1176 1176
1177 apicinterrupt THRESHOLD_APIC_VECTOR \ 1177 apicinterrupt THRESHOLD_APIC_VECTOR \
1178 threshold_interrupt smp_threshold_interrupt 1178 threshold_interrupt smp_threshold_interrupt
1179 apicinterrupt THERMAL_APIC_VECTOR \ 1179 apicinterrupt THERMAL_APIC_VECTOR \
1180 thermal_interrupt smp_thermal_interrupt 1180 thermal_interrupt smp_thermal_interrupt
1181 1181
1182 #ifdef CONFIG_SMP 1182 #ifdef CONFIG_SMP
1183 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ 1183 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1184 call_function_single_interrupt smp_call_function_single_interrupt 1184 call_function_single_interrupt smp_call_function_single_interrupt
1185 apicinterrupt CALL_FUNCTION_VECTOR \ 1185 apicinterrupt CALL_FUNCTION_VECTOR \
1186 call_function_interrupt smp_call_function_interrupt 1186 call_function_interrupt smp_call_function_interrupt
1187 apicinterrupt RESCHEDULE_VECTOR \ 1187 apicinterrupt RESCHEDULE_VECTOR \
1188 reschedule_interrupt smp_reschedule_interrupt 1188 reschedule_interrupt smp_reschedule_interrupt
1189 #endif 1189 #endif
1190 1190
1191 apicinterrupt ERROR_APIC_VECTOR \ 1191 apicinterrupt ERROR_APIC_VECTOR \
1192 error_interrupt smp_error_interrupt 1192 error_interrupt smp_error_interrupt
1193 apicinterrupt SPURIOUS_APIC_VECTOR \ 1193 apicinterrupt SPURIOUS_APIC_VECTOR \
1194 spurious_interrupt smp_spurious_interrupt 1194 spurious_interrupt smp_spurious_interrupt
1195 1195
1196 #ifdef CONFIG_IRQ_WORK 1196 #ifdef CONFIG_IRQ_WORK
1197 apicinterrupt IRQ_WORK_VECTOR \ 1197 apicinterrupt IRQ_WORK_VECTOR \
1198 irq_work_interrupt smp_irq_work_interrupt 1198 irq_work_interrupt smp_irq_work_interrupt
1199 #endif 1199 #endif
1200 1200
1201 /* 1201 /*
1202 * Exception entry points. 1202 * Exception entry points.
1203 */ 1203 */
1204 .macro zeroentry sym do_sym 1204 .macro zeroentry sym do_sym
1205 ENTRY(\sym) 1205 ENTRY(\sym)
1206 INTR_FRAME 1206 INTR_FRAME
1207 ASM_CLAC 1207 ASM_CLAC
1208 PARAVIRT_ADJUST_EXCEPTION_FRAME 1208 PARAVIRT_ADJUST_EXCEPTION_FRAME
1209 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1209 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1210 subq $ORIG_RAX-R15, %rsp 1210 subq $ORIG_RAX-R15, %rsp
1211 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1211 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1212 call error_entry 1212 call error_entry
1213 DEFAULT_FRAME 0 1213 DEFAULT_FRAME 0
1214 movq %rsp,%rdi /* pt_regs pointer */ 1214 movq %rsp,%rdi /* pt_regs pointer */
1215 xorl %esi,%esi /* no error code */ 1215 xorl %esi,%esi /* no error code */
1216 call \do_sym 1216 call \do_sym
1217 jmp error_exit /* %ebx: no swapgs flag */ 1217 jmp error_exit /* %ebx: no swapgs flag */
1218 CFI_ENDPROC 1218 CFI_ENDPROC
1219 END(\sym) 1219 END(\sym)
1220 .endm 1220 .endm
1221 1221
1222 .macro paranoidzeroentry sym do_sym 1222 .macro paranoidzeroentry sym do_sym
1223 ENTRY(\sym) 1223 ENTRY(\sym)
1224 INTR_FRAME 1224 INTR_FRAME
1225 ASM_CLAC 1225 ASM_CLAC
1226 PARAVIRT_ADJUST_EXCEPTION_FRAME 1226 PARAVIRT_ADJUST_EXCEPTION_FRAME
1227 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1227 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1228 subq $ORIG_RAX-R15, %rsp 1228 subq $ORIG_RAX-R15, %rsp
1229 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1229 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1230 call save_paranoid 1230 call save_paranoid
1231 TRACE_IRQS_OFF 1231 TRACE_IRQS_OFF
1232 movq %rsp,%rdi /* pt_regs pointer */ 1232 movq %rsp,%rdi /* pt_regs pointer */
1233 xorl %esi,%esi /* no error code */ 1233 xorl %esi,%esi /* no error code */
1234 call \do_sym 1234 call \do_sym
1235 jmp paranoid_exit /* %ebx: no swapgs flag */ 1235 jmp paranoid_exit /* %ebx: no swapgs flag */
1236 CFI_ENDPROC 1236 CFI_ENDPROC
1237 END(\sym) 1237 END(\sym)
1238 .endm 1238 .endm
1239 1239
1240 #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) 1240 #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
1241 .macro paranoidzeroentry_ist sym do_sym ist 1241 .macro paranoidzeroentry_ist sym do_sym ist
1242 ENTRY(\sym) 1242 ENTRY(\sym)
1243 INTR_FRAME 1243 INTR_FRAME
1244 ASM_CLAC 1244 ASM_CLAC
1245 PARAVIRT_ADJUST_EXCEPTION_FRAME 1245 PARAVIRT_ADJUST_EXCEPTION_FRAME
1246 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1246 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1247 subq $ORIG_RAX-R15, %rsp 1247 subq $ORIG_RAX-R15, %rsp
1248 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1248 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1249 call save_paranoid 1249 call save_paranoid
1250 TRACE_IRQS_OFF_DEBUG 1250 TRACE_IRQS_OFF_DEBUG
1251 movq %rsp,%rdi /* pt_regs pointer */ 1251 movq %rsp,%rdi /* pt_regs pointer */
1252 xorl %esi,%esi /* no error code */ 1252 xorl %esi,%esi /* no error code */
1253 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) 1253 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1254 call \do_sym 1254 call \do_sym
1255 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) 1255 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1256 jmp paranoid_exit /* %ebx: no swapgs flag */ 1256 jmp paranoid_exit /* %ebx: no swapgs flag */
1257 CFI_ENDPROC 1257 CFI_ENDPROC
1258 END(\sym) 1258 END(\sym)
1259 .endm 1259 .endm
1260 1260
1261 .macro errorentry sym do_sym 1261 .macro errorentry sym do_sym
1262 ENTRY(\sym) 1262 ENTRY(\sym)
1263 XCPT_FRAME 1263 XCPT_FRAME
1264 ASM_CLAC 1264 ASM_CLAC
1265 PARAVIRT_ADJUST_EXCEPTION_FRAME 1265 PARAVIRT_ADJUST_EXCEPTION_FRAME
1266 subq $ORIG_RAX-R15, %rsp 1266 subq $ORIG_RAX-R15, %rsp
1267 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1267 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1268 call error_entry 1268 call error_entry
1269 DEFAULT_FRAME 0 1269 DEFAULT_FRAME 0
1270 movq %rsp,%rdi /* pt_regs pointer */ 1270 movq %rsp,%rdi /* pt_regs pointer */
1271 movq ORIG_RAX(%rsp),%rsi /* get error code */ 1271 movq ORIG_RAX(%rsp),%rsi /* get error code */
1272 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ 1272 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
1273 call \do_sym 1273 call \do_sym
1274 jmp error_exit /* %ebx: no swapgs flag */ 1274 jmp error_exit /* %ebx: no swapgs flag */
1275 CFI_ENDPROC 1275 CFI_ENDPROC
1276 END(\sym) 1276 END(\sym)
1277 .endm 1277 .endm
1278 1278
1279 /* error code is on the stack already */ 1279 /* error code is on the stack already */
1280 .macro paranoiderrorentry sym do_sym 1280 .macro paranoiderrorentry sym do_sym
1281 ENTRY(\sym) 1281 ENTRY(\sym)
1282 XCPT_FRAME 1282 XCPT_FRAME
1283 ASM_CLAC 1283 ASM_CLAC
1284 PARAVIRT_ADJUST_EXCEPTION_FRAME 1284 PARAVIRT_ADJUST_EXCEPTION_FRAME
1285 subq $ORIG_RAX-R15, %rsp 1285 subq $ORIG_RAX-R15, %rsp
1286 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1286 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1287 call save_paranoid 1287 call save_paranoid
1288 DEFAULT_FRAME 0 1288 DEFAULT_FRAME 0
1289 TRACE_IRQS_OFF 1289 TRACE_IRQS_OFF
1290 movq %rsp,%rdi /* pt_regs pointer */ 1290 movq %rsp,%rdi /* pt_regs pointer */
1291 movq ORIG_RAX(%rsp),%rsi /* get error code */ 1291 movq ORIG_RAX(%rsp),%rsi /* get error code */
1292 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ 1292 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
1293 call \do_sym 1293 call \do_sym
1294 jmp paranoid_exit /* %ebx: no swapgs flag */ 1294 jmp paranoid_exit /* %ebx: no swapgs flag */
1295 CFI_ENDPROC 1295 CFI_ENDPROC
1296 END(\sym) 1296 END(\sym)
1297 .endm 1297 .endm
1298 1298
1299 zeroentry divide_error do_divide_error 1299 zeroentry divide_error do_divide_error
1300 zeroentry overflow do_overflow 1300 zeroentry overflow do_overflow
1301 zeroentry bounds do_bounds 1301 zeroentry bounds do_bounds
1302 zeroentry invalid_op do_invalid_op 1302 zeroentry invalid_op do_invalid_op
1303 zeroentry device_not_available do_device_not_available 1303 zeroentry device_not_available do_device_not_available
1304 paranoiderrorentry double_fault do_double_fault 1304 paranoiderrorentry double_fault do_double_fault
1305 zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun 1305 zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
1306 errorentry invalid_TSS do_invalid_TSS 1306 errorentry invalid_TSS do_invalid_TSS
1307 errorentry segment_not_present do_segment_not_present 1307 errorentry segment_not_present do_segment_not_present
1308 zeroentry spurious_interrupt_bug do_spurious_interrupt_bug 1308 zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
1309 zeroentry coprocessor_error do_coprocessor_error 1309 zeroentry coprocessor_error do_coprocessor_error
1310 errorentry alignment_check do_alignment_check 1310 errorentry alignment_check do_alignment_check
1311 zeroentry simd_coprocessor_error do_simd_coprocessor_error 1311 zeroentry simd_coprocessor_error do_simd_coprocessor_error
1312 1312
1313 1313
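All of these entries funnel into a C handler with the pt_regs pointer in %rdi and an error code in %rsi: zeroentry passes 0 (and stores -1 as the "no syscall to restart" marker), while errorentry forwards the code the CPU pushed. A self-contained sketch of that calling shape, with an invented do_example_trap() rather than the real traps.c handlers:

    #include <stdio.h>

    struct pt_regs;    /* opaque here; the real layout lives in asm/ptrace.h */

    /* Stand-in for the do_* handlers the macros dispatch to. */
    static void do_example_trap(struct pt_regs *regs, long error_code)
    {
        printf("trap: regs=%p error_code=%ld\n", (void *)regs, error_code);
    }

    int main(void)
    {
        do_example_trap(NULL, 0);     /* zeroentry-style: no hardware error code */
        do_example_trap(NULL, 13);    /* errorentry-style: CPU-pushed error code */
        return 0;
    }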
1314 /* Reload gs selector with exception handling */ 1314 /* Reload gs selector with exception handling */
1315 /* edi: new selector */ 1315 /* edi: new selector */
1316 ENTRY(native_load_gs_index) 1316 ENTRY(native_load_gs_index)
1317 CFI_STARTPROC 1317 CFI_STARTPROC
1318 pushfq_cfi 1318 pushfq_cfi
1319 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) 1319 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1320 SWAPGS 1320 SWAPGS
1321 gs_change: 1321 gs_change:
1322 movl %edi,%gs 1322 movl %edi,%gs
1323 2: mfence /* workaround */ 1323 2: mfence /* workaround */
1324 SWAPGS 1324 SWAPGS
1325 popfq_cfi 1325 popfq_cfi
1326 ret 1326 ret
1327 CFI_ENDPROC 1327 CFI_ENDPROC
1328 END(native_load_gs_index) 1328 END(native_load_gs_index)
1329 1329
1330 _ASM_EXTABLE(gs_change,bad_gs) 1330 _ASM_EXTABLE(gs_change,bad_gs)
1331 .section .fixup,"ax" 1331 .section .fixup,"ax"
1332 /* running with kernelgs */ 1332 /* running with kernelgs */
1333 bad_gs: 1333 bad_gs:
1334 SWAPGS /* switch back to user gs */ 1334 SWAPGS /* switch back to user gs */
1335 xorl %eax,%eax 1335 xorl %eax,%eax
1336 movl %eax,%gs 1336 movl %eax,%gs
1337 jmp 2b 1337 jmp 2b
1338 .previous 1338 .previous
1339 1339
1340 /* Call softirq on interrupt stack. Interrupts are off. */ 1340 /* Call softirq on interrupt stack. Interrupts are off. */
1341 ENTRY(call_softirq) 1341 ENTRY(call_softirq)
1342 CFI_STARTPROC 1342 CFI_STARTPROC
1343 pushq_cfi %rbp 1343 pushq_cfi %rbp
1344 CFI_REL_OFFSET rbp,0 1344 CFI_REL_OFFSET rbp,0
1345 mov %rsp,%rbp 1345 mov %rsp,%rbp
1346 CFI_DEF_CFA_REGISTER rbp 1346 CFI_DEF_CFA_REGISTER rbp
1347 incl PER_CPU_VAR(irq_count) 1347 incl PER_CPU_VAR(irq_count)
1348 cmove PER_CPU_VAR(irq_stack_ptr),%rsp 1348 cmove PER_CPU_VAR(irq_stack_ptr),%rsp
1349 push %rbp # backlink for old unwinder 1349 push %rbp # backlink for old unwinder
1350 call __do_softirq 1350 call __do_softirq
1351 leaveq 1351 leaveq
1352 CFI_RESTORE rbp 1352 CFI_RESTORE rbp
1353 CFI_DEF_CFA_REGISTER rsp 1353 CFI_DEF_CFA_REGISTER rsp
1354 CFI_ADJUST_CFA_OFFSET -8 1354 CFI_ADJUST_CFA_OFFSET -8
1355 decl PER_CPU_VAR(irq_count) 1355 decl PER_CPU_VAR(irq_count)
1356 ret 1356 ret
1357 CFI_ENDPROC 1357 CFI_ENDPROC
1358 END(call_softirq) 1358 END(call_softirq)
1359 1359
1360 #ifdef CONFIG_XEN 1360 #ifdef CONFIG_XEN
1361 zeroentry xen_hypervisor_callback xen_do_hypervisor_callback 1361 zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
1362 1362
1363 /* 1363 /*
1364 * A note on the "critical region" in our callback handler. 1364 * A note on the "critical region" in our callback handler.
1365 * We want to avoid stacking callback handlers due to events occurring 1365 * We want to avoid stacking callback handlers due to events occurring
1366 * during handling of the last event. To do this, we keep events disabled 1366 * during handling of the last event. To do this, we keep events disabled
1367 * until we've done all processing. HOWEVER, we must enable events before 1367 * until we've done all processing. HOWEVER, we must enable events before
1368 * popping the stack frame (can't be done atomically) and so it would still 1368 * popping the stack frame (can't be done atomically) and so it would still
1369 * be possible to get enough handler activations to overflow the stack. 1369 * be possible to get enough handler activations to overflow the stack.
1370 * Although unlikely, bugs of that kind are hard to track down, so we'd 1370 * Although unlikely, bugs of that kind are hard to track down, so we'd
1371 * like to avoid the possibility. 1371 * like to avoid the possibility.
1372 * So, on entry to the handler we detect whether we interrupted an 1372 * So, on entry to the handler we detect whether we interrupted an
1373 * existing activation in its critical region -- if so, we pop the current 1373 * existing activation in its critical region -- if so, we pop the current
1374 * activation and restart the handler using the previous one. 1374 * activation and restart the handler using the previous one.
1375 */ 1375 */
1376 ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) 1376 ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1377 CFI_STARTPROC 1377 CFI_STARTPROC
1378 /* 1378 /*
1379 * Since we don't modify %rdi, evtchn_do_upcall(struct *pt_regs) will 1379 * Since we don't modify %rdi, evtchn_do_upcall(struct *pt_regs) will
1380 * see the correct pointer to the pt_regs 1380 * see the correct pointer to the pt_regs
1381 */ 1381 */
1382 movq %rdi, %rsp # we don't return, adjust the stack frame 1382 movq %rdi, %rsp # we don't return, adjust the stack frame
1383 CFI_ENDPROC 1383 CFI_ENDPROC
1384 DEFAULT_FRAME 1384 DEFAULT_FRAME
1385 11: incl PER_CPU_VAR(irq_count) 1385 11: incl PER_CPU_VAR(irq_count)
1386 movq %rsp,%rbp 1386 movq %rsp,%rbp
1387 CFI_DEF_CFA_REGISTER rbp 1387 CFI_DEF_CFA_REGISTER rbp
1388 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp 1388 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
1389 pushq %rbp # backlink for old unwinder 1389 pushq %rbp # backlink for old unwinder
1390 call xen_evtchn_do_upcall 1390 call xen_evtchn_do_upcall
1391 popq %rsp 1391 popq %rsp
1392 CFI_DEF_CFA_REGISTER rsp 1392 CFI_DEF_CFA_REGISTER rsp
1393 decl PER_CPU_VAR(irq_count) 1393 decl PER_CPU_VAR(irq_count)
1394 jmp error_exit 1394 jmp error_exit
1395 CFI_ENDPROC 1395 CFI_ENDPROC
1396 END(xen_do_hypervisor_callback) 1396 END(xen_do_hypervisor_callback)
1397 1397
1398 /* 1398 /*
1399 * Hypervisor uses this for application faults while it executes. 1399 * Hypervisor uses this for application faults while it executes.
1400 * We get here for two reasons: 1400 * We get here for two reasons:
1401 * 1. Fault while reloading DS, ES, FS or GS 1401 * 1. Fault while reloading DS, ES, FS or GS
1402 * 2. Fault while executing IRET 1402 * 2. Fault while executing IRET
1403 * Category 1 we do not need to fix up as Xen has already reloaded all segment 1403 * Category 1 we do not need to fix up as Xen has already reloaded all segment
1404 * registers that could be reloaded and zeroed the others. 1404 * registers that could be reloaded and zeroed the others.
1405 * Category 2 we fix up by killing the current process. We cannot use the 1405 * Category 2 we fix up by killing the current process. We cannot use the
1406 * normal Linux return path in this case because if we use the IRET hypercall 1406 * normal Linux return path in this case because if we use the IRET hypercall
1407 * to pop the stack frame we end up in an infinite loop of failsafe callbacks. 1407 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1408 * We distinguish between categories by comparing each saved segment register 1408 * We distinguish between categories by comparing each saved segment register
1409 * with its current contents: any discrepancy means we are in category 1. 1409 * with its current contents: any discrepancy means we are in category 1.
1410 */ 1410 */
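The classification described above is easy to restate in C. The sketch below is purely illustrative (the function name and array layout are invented for explanation) and is not code from this file:

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Illustrative model of the test performed by xen_failsafe_callback:
     * compare each selector saved by Xen with the live value. Any mismatch
     * means Xen already reloaded/zeroed a segment (category 1); if all four
     * still match, the fault must have come from the IRET itself (category 2).
     */
    static bool fault_was_bad_iret(const uint16_t saved[4], const uint16_t live[4])
    {
            for (int i = 0; i < 4; i++)
                    if (saved[i] != live[i])
                            return false;   /* category 1: retry the IRET */
            return true;                    /* category 2: kill the current task */
    }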
1411 ENTRY(xen_failsafe_callback) 1411 ENTRY(xen_failsafe_callback)
1412 INTR_FRAME 1 (6*8) 1412 INTR_FRAME 1 (6*8)
1413 /*CFI_REL_OFFSET gs,GS*/ 1413 /*CFI_REL_OFFSET gs,GS*/
1414 /*CFI_REL_OFFSET fs,FS*/ 1414 /*CFI_REL_OFFSET fs,FS*/
1415 /*CFI_REL_OFFSET es,ES*/ 1415 /*CFI_REL_OFFSET es,ES*/
1416 /*CFI_REL_OFFSET ds,DS*/ 1416 /*CFI_REL_OFFSET ds,DS*/
1417 CFI_REL_OFFSET r11,8 1417 CFI_REL_OFFSET r11,8
1418 CFI_REL_OFFSET rcx,0 1418 CFI_REL_OFFSET rcx,0
1419 movw %ds,%cx 1419 movw %ds,%cx
1420 cmpw %cx,0x10(%rsp) 1420 cmpw %cx,0x10(%rsp)
1421 CFI_REMEMBER_STATE 1421 CFI_REMEMBER_STATE
1422 jne 1f 1422 jne 1f
1423 movw %es,%cx 1423 movw %es,%cx
1424 cmpw %cx,0x18(%rsp) 1424 cmpw %cx,0x18(%rsp)
1425 jne 1f 1425 jne 1f
1426 movw %fs,%cx 1426 movw %fs,%cx
1427 cmpw %cx,0x20(%rsp) 1427 cmpw %cx,0x20(%rsp)
1428 jne 1f 1428 jne 1f
1429 movw %gs,%cx 1429 movw %gs,%cx
1430 cmpw %cx,0x28(%rsp) 1430 cmpw %cx,0x28(%rsp)
1431 jne 1f 1431 jne 1f
1432 /* All segments match their saved values => Category 2 (Bad IRET). */ 1432 /* All segments match their saved values => Category 2 (Bad IRET). */
1433 movq (%rsp),%rcx 1433 movq (%rsp),%rcx
1434 CFI_RESTORE rcx 1434 CFI_RESTORE rcx
1435 movq 8(%rsp),%r11 1435 movq 8(%rsp),%r11
1436 CFI_RESTORE r11 1436 CFI_RESTORE r11
1437 addq $0x30,%rsp 1437 addq $0x30,%rsp
1438 CFI_ADJUST_CFA_OFFSET -0x30 1438 CFI_ADJUST_CFA_OFFSET -0x30
1439 pushq_cfi $0 /* RIP */ 1439 pushq_cfi $0 /* RIP */
1440 pushq_cfi %r11 1440 pushq_cfi %r11
1441 pushq_cfi %rcx 1441 pushq_cfi %rcx
1442 jmp general_protection 1442 jmp general_protection
1443 CFI_RESTORE_STATE 1443 CFI_RESTORE_STATE
1444 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ 1444 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1445 movq (%rsp),%rcx 1445 movq (%rsp),%rcx
1446 CFI_RESTORE rcx 1446 CFI_RESTORE rcx
1447 movq 8(%rsp),%r11 1447 movq 8(%rsp),%r11
1448 CFI_RESTORE r11 1448 CFI_RESTORE r11
1449 addq $0x30,%rsp 1449 addq $0x30,%rsp
1450 CFI_ADJUST_CFA_OFFSET -0x30 1450 CFI_ADJUST_CFA_OFFSET -0x30
1451 pushq_cfi $-1 /* orig_ax = -1 => not a system call */ 1451 pushq_cfi $-1 /* orig_ax = -1 => not a system call */
1452 SAVE_ALL 1452 SAVE_ALL
1453 jmp error_exit 1453 jmp error_exit
1454 CFI_ENDPROC 1454 CFI_ENDPROC
1455 END(xen_failsafe_callback) 1455 END(xen_failsafe_callback)
1456 1456
1457 apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ 1457 apicinterrupt HYPERVISOR_CALLBACK_VECTOR \
1458 xen_hvm_callback_vector xen_evtchn_do_upcall 1458 xen_hvm_callback_vector xen_evtchn_do_upcall
1459 1459
1460 #endif /* CONFIG_XEN */ 1460 #endif /* CONFIG_XEN */
1461
1462 #if IS_ENABLED(CONFIG_HYPERV)
1463 apicinterrupt HYPERVISOR_CALLBACK_VECTOR \
1464 hyperv_callback_vector hyperv_vector_handler
1465 #endif /* CONFIG_HYPERV */
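This hunk is the heart of the patch: the Xen HVM callback now uses the shared HYPERVISOR_CALLBACK_VECTOR, and when CONFIG_HYPERV is enabled the same vector gets a Hyper-V entry stub, hyperv_callback_vector, whose asm thunk hands off to hyperv_vector_handler(). A minimal sketch of how the C side could install that stub is shown below; alloc_intr_gate() mirrors what the Xen HVM path already does, while the hv_register_vmbus_handler() name and the bookkeeping variables are assumptions made for illustration, not necessarily the exact code elsewhere in this series.

    #include <linux/interrupt.h>
    #include <asm/desc.h>
    #include <asm/irq_vectors.h>

    /* asm stub generated by the apicinterrupt line above */
    extern void hyperv_callback_vector(void);

    static int vmbus_irq;                   /* assumed bookkeeping, for illustration */
    static irq_handler_t vmbus_handler;     /* assumed bookkeeping, for illustration */

    /* Assumed helper name: how the vmbus driver could hook the shared vector. */
    void hv_register_vmbus_handler(int irq, irq_handler_t handler)
    {
            vmbus_irq = irq;
            vmbus_handler = handler;
            /* Install the asm stub at HYPERVISOR_CALLBACK_VECTOR, mirroring
             * what the Xen HVM path does for its callback vector. */
            alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
    }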
1461 1466
1462 /* 1467 /*
1463 * Some functions should be protected against kprobes 1468 * Some functions should be protected against kprobes
1464 */ 1469 */
1465 .pushsection .kprobes.text, "ax" 1470 .pushsection .kprobes.text, "ax"
1466 1471
1467 paranoidzeroentry_ist debug do_debug DEBUG_STACK 1472 paranoidzeroentry_ist debug do_debug DEBUG_STACK
1468 paranoidzeroentry_ist int3 do_int3 DEBUG_STACK 1473 paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1469 paranoiderrorentry stack_segment do_stack_segment 1474 paranoiderrorentry stack_segment do_stack_segment
1470 #ifdef CONFIG_XEN 1475 #ifdef CONFIG_XEN
1471 zeroentry xen_debug do_debug 1476 zeroentry xen_debug do_debug
1472 zeroentry xen_int3 do_int3 1477 zeroentry xen_int3 do_int3
1473 errorentry xen_stack_segment do_stack_segment 1478 errorentry xen_stack_segment do_stack_segment
1474 #endif 1479 #endif
1475 errorentry general_protection do_general_protection 1480 errorentry general_protection do_general_protection
1476 errorentry page_fault do_page_fault 1481 errorentry page_fault do_page_fault
1477 #ifdef CONFIG_KVM_GUEST 1482 #ifdef CONFIG_KVM_GUEST
1478 errorentry async_page_fault do_async_page_fault 1483 errorentry async_page_fault do_async_page_fault
1479 #endif 1484 #endif
1480 #ifdef CONFIG_X86_MCE 1485 #ifdef CONFIG_X86_MCE
1481 paranoidzeroentry machine_check *machine_check_vector(%rip) 1486 paranoidzeroentry machine_check *machine_check_vector(%rip)
1482 #endif 1487 #endif
1483 1488
1484 /* 1489 /*
1485 * "Paranoid" exit path from exception stack. 1490 * "Paranoid" exit path from exception stack.
1486 * Paranoid because this is used by NMIs and cannot take 1491 * Paranoid because this is used by NMIs and cannot take
1487 * any kernel state for granted. 1492 * any kernel state for granted.
1488 * We don't do kernel preemption checks here, because only 1493 * We don't do kernel preemption checks here, because only
1489 * NMI should be common and it does not enable IRQs and 1494 * NMI should be common and it does not enable IRQs and
1490 * cannot get reschedule ticks. 1495 * cannot get reschedule ticks.
1491 * 1496 *
1492 * "trace" is 0 for the NMI handler only, because irq-tracing 1497 * "trace" is 0 for the NMI handler only, because irq-tracing
1493 * is fundamentally NMI-unsafe. (we cannot change the soft and 1498 * is fundamentally NMI-unsafe. (we cannot change the soft and
1494 * hard flags at once, atomically) 1499 * hard flags at once, atomically)
1495 */ 1500 */
1496 1501
1497 /* ebx: no swapgs flag */ 1502 /* ebx: no swapgs flag */
1498 ENTRY(paranoid_exit) 1503 ENTRY(paranoid_exit)
1499 DEFAULT_FRAME 1504 DEFAULT_FRAME
1500 DISABLE_INTERRUPTS(CLBR_NONE) 1505 DISABLE_INTERRUPTS(CLBR_NONE)
1501 TRACE_IRQS_OFF_DEBUG 1506 TRACE_IRQS_OFF_DEBUG
1502 testl %ebx,%ebx /* swapgs needed? */ 1507 testl %ebx,%ebx /* swapgs needed? */
1503 jnz paranoid_restore 1508 jnz paranoid_restore
1504 testl $3,CS(%rsp) 1509 testl $3,CS(%rsp)
1505 jnz paranoid_userspace 1510 jnz paranoid_userspace
1506 paranoid_swapgs: 1511 paranoid_swapgs:
1507 TRACE_IRQS_IRETQ 0 1512 TRACE_IRQS_IRETQ 0
1508 SWAPGS_UNSAFE_STACK 1513 SWAPGS_UNSAFE_STACK
1509 RESTORE_ALL 8 1514 RESTORE_ALL 8
1510 jmp irq_return 1515 jmp irq_return
1511 paranoid_restore: 1516 paranoid_restore:
1512 TRACE_IRQS_IRETQ_DEBUG 0 1517 TRACE_IRQS_IRETQ_DEBUG 0
1513 RESTORE_ALL 8 1518 RESTORE_ALL 8
1514 jmp irq_return 1519 jmp irq_return
1515 paranoid_userspace: 1520 paranoid_userspace:
1516 GET_THREAD_INFO(%rcx) 1521 GET_THREAD_INFO(%rcx)
1517 movl TI_flags(%rcx),%ebx 1522 movl TI_flags(%rcx),%ebx
1518 andl $_TIF_WORK_MASK,%ebx 1523 andl $_TIF_WORK_MASK,%ebx
1519 jz paranoid_swapgs 1524 jz paranoid_swapgs
1520 movq %rsp,%rdi /* &pt_regs */ 1525 movq %rsp,%rdi /* &pt_regs */
1521 call sync_regs 1526 call sync_regs
1522 movq %rax,%rsp /* switch stack for scheduling */ 1527 movq %rax,%rsp /* switch stack for scheduling */
1523 testl $_TIF_NEED_RESCHED,%ebx 1528 testl $_TIF_NEED_RESCHED,%ebx
1524 jnz paranoid_schedule 1529 jnz paranoid_schedule
1525 movl %ebx,%edx /* arg3: thread flags */ 1530 movl %ebx,%edx /* arg3: thread flags */
1526 TRACE_IRQS_ON 1531 TRACE_IRQS_ON
1527 ENABLE_INTERRUPTS(CLBR_NONE) 1532 ENABLE_INTERRUPTS(CLBR_NONE)
1528 xorl %esi,%esi /* arg2: oldset */ 1533 xorl %esi,%esi /* arg2: oldset */
1529 movq %rsp,%rdi /* arg1: &pt_regs */ 1534 movq %rsp,%rdi /* arg1: &pt_regs */
1530 call do_notify_resume 1535 call do_notify_resume
1531 DISABLE_INTERRUPTS(CLBR_NONE) 1536 DISABLE_INTERRUPTS(CLBR_NONE)
1532 TRACE_IRQS_OFF 1537 TRACE_IRQS_OFF
1533 jmp paranoid_userspace 1538 jmp paranoid_userspace
1534 paranoid_schedule: 1539 paranoid_schedule:
1535 TRACE_IRQS_ON 1540 TRACE_IRQS_ON
1536 ENABLE_INTERRUPTS(CLBR_ANY) 1541 ENABLE_INTERRUPTS(CLBR_ANY)
1537 SCHEDULE_USER 1542 SCHEDULE_USER
1538 DISABLE_INTERRUPTS(CLBR_ANY) 1543 DISABLE_INTERRUPTS(CLBR_ANY)
1539 TRACE_IRQS_OFF 1544 TRACE_IRQS_OFF
1540 jmp paranoid_userspace 1545 jmp paranoid_userspace
1541 CFI_ENDPROC 1546 CFI_ENDPROC
1542 END(paranoid_exit) 1547 END(paranoid_exit)
1543 1548
1544 /* 1549 /*
1545 * Exception entry point. This expects an error code/orig_rax on the stack. 1550 * Exception entry point. This expects an error code/orig_rax on the stack.
1546 * returns in "no swapgs flag" in %ebx. 1551 * returns in "no swapgs flag" in %ebx.
1547 */ 1552 */
1548 ENTRY(error_entry) 1553 ENTRY(error_entry)
1549 XCPT_FRAME 1554 XCPT_FRAME
1550 CFI_ADJUST_CFA_OFFSET 15*8 1555 CFI_ADJUST_CFA_OFFSET 15*8
1551 /* oldrax contains error code */ 1556 /* oldrax contains error code */
1552 cld 1557 cld
1553 movq_cfi rdi, RDI+8 1558 movq_cfi rdi, RDI+8
1554 movq_cfi rsi, RSI+8 1559 movq_cfi rsi, RSI+8
1555 movq_cfi rdx, RDX+8 1560 movq_cfi rdx, RDX+8
1556 movq_cfi rcx, RCX+8 1561 movq_cfi rcx, RCX+8
1557 movq_cfi rax, RAX+8 1562 movq_cfi rax, RAX+8
1558 movq_cfi r8, R8+8 1563 movq_cfi r8, R8+8
1559 movq_cfi r9, R9+8 1564 movq_cfi r9, R9+8
1560 movq_cfi r10, R10+8 1565 movq_cfi r10, R10+8
1561 movq_cfi r11, R11+8 1566 movq_cfi r11, R11+8
1562 movq_cfi rbx, RBX+8 1567 movq_cfi rbx, RBX+8
1563 movq_cfi rbp, RBP+8 1568 movq_cfi rbp, RBP+8
1564 movq_cfi r12, R12+8 1569 movq_cfi r12, R12+8
1565 movq_cfi r13, R13+8 1570 movq_cfi r13, R13+8
1566 movq_cfi r14, R14+8 1571 movq_cfi r14, R14+8
1567 movq_cfi r15, R15+8 1572 movq_cfi r15, R15+8
1568 xorl %ebx,%ebx 1573 xorl %ebx,%ebx
1569 testl $3,CS+8(%rsp) 1574 testl $3,CS+8(%rsp)
1570 je error_kernelspace 1575 je error_kernelspace
1571 error_swapgs: 1576 error_swapgs:
1572 SWAPGS 1577 SWAPGS
1573 error_sti: 1578 error_sti:
1574 TRACE_IRQS_OFF 1579 TRACE_IRQS_OFF
1575 ret 1580 ret
1576 1581
1577 /* 1582 /*
1578 * There are two places in the kernel that can potentially fault with 1583 * There are two places in the kernel that can potentially fault with
1579 * usergs. Handle them here. The exception handlers after iret run with 1584 * usergs. Handle them here. The exception handlers after iret run with
1580 * kernel gs again, so don't set the user space flag. B stepping K8s 1585 * kernel gs again, so don't set the user space flag. B stepping K8s
1581 * sometimes report a truncated RIP for IRET exceptions returning to 1586 * sometimes report a truncated RIP for IRET exceptions returning to
1582 * compat mode. Check for these here too. 1587 * compat mode. Check for these here too.
1583 */ 1588 */
1584 error_kernelspace: 1589 error_kernelspace:
1585 incl %ebx 1590 incl %ebx
1586 leaq irq_return(%rip),%rcx 1591 leaq irq_return(%rip),%rcx
1587 cmpq %rcx,RIP+8(%rsp) 1592 cmpq %rcx,RIP+8(%rsp)
1588 je error_swapgs 1593 je error_swapgs
1589 movl %ecx,%eax /* zero extend */ 1594 movl %ecx,%eax /* zero extend */
1590 cmpq %rax,RIP+8(%rsp) 1595 cmpq %rax,RIP+8(%rsp)
1591 je bstep_iret 1596 je bstep_iret
1592 cmpq $gs_change,RIP+8(%rsp) 1597 cmpq $gs_change,RIP+8(%rsp)
1593 je error_swapgs 1598 je error_swapgs
1594 jmp error_sti 1599 jmp error_sti
1595 1600
1596 bstep_iret: 1601 bstep_iret:
1597 /* Fix truncated RIP */ 1602 /* Fix truncated RIP */
1598 movq %rcx,RIP+8(%rsp) 1603 movq %rcx,RIP+8(%rsp)
1599 jmp error_swapgs 1604 jmp error_swapgs
1600 CFI_ENDPROC 1605 CFI_ENDPROC
1601 END(error_entry) 1606 END(error_entry)
1602 1607
1603 1608
1604 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ 1609 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
1605 ENTRY(error_exit) 1610 ENTRY(error_exit)
1606 DEFAULT_FRAME 1611 DEFAULT_FRAME
1607 movl %ebx,%eax 1612 movl %ebx,%eax
1608 RESTORE_REST 1613 RESTORE_REST
1609 DISABLE_INTERRUPTS(CLBR_NONE) 1614 DISABLE_INTERRUPTS(CLBR_NONE)
1610 TRACE_IRQS_OFF 1615 TRACE_IRQS_OFF
1611 GET_THREAD_INFO(%rcx) 1616 GET_THREAD_INFO(%rcx)
1612 testl %eax,%eax 1617 testl %eax,%eax
1613 jne retint_kernel 1618 jne retint_kernel
1614 LOCKDEP_SYS_EXIT_IRQ 1619 LOCKDEP_SYS_EXIT_IRQ
1615 movl TI_flags(%rcx),%edx 1620 movl TI_flags(%rcx),%edx
1616 movl $_TIF_WORK_MASK,%edi 1621 movl $_TIF_WORK_MASK,%edi
1617 andl %edi,%edx 1622 andl %edi,%edx
1618 jnz retint_careful 1623 jnz retint_careful
1619 jmp retint_swapgs 1624 jmp retint_swapgs
1620 CFI_ENDPROC 1625 CFI_ENDPROC
1621 END(error_exit) 1626 END(error_exit)
1622 1627
1623 /* 1628 /*
1624 * Test if a given stack is an NMI stack or not. 1629 * Test if a given stack is an NMI stack or not.
1625 */ 1630 */
1626 .macro test_in_nmi reg stack nmi_ret normal_ret 1631 .macro test_in_nmi reg stack nmi_ret normal_ret
1627 cmpq %\reg, \stack 1632 cmpq %\reg, \stack
1628 ja \normal_ret 1633 ja \normal_ret
1629 subq $EXCEPTION_STKSZ, %\reg 1634 subq $EXCEPTION_STKSZ, %\reg
1630 cmpq %\reg, \stack 1635 cmpq %\reg, \stack
1631 jb \normal_ret 1636 jb \normal_ret
1632 jmp \nmi_ret 1637 jmp \nmi_ret
1633 .endm 1638 .endm
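For reference, the unsigned range check that test_in_nmi performs can be modelled in C as below. This is an explanatory sketch, not part of the patch; the function name, types and the EXCEPTION_STKSZ value used here are assumptions for illustration.

    #include <stdbool.h>
    #include <stdint.h>

    #define EXCEPTION_STKSZ 4096UL  /* illustrative value; the real size is arch-defined */

    /*
     * "top" plays the role of %\reg (the highest candidate NMI-stack address
     * loaded just before the macro is used); "sp" is the saved stack pointer
     * being tested. The comparisons match the ja/jb (unsigned) branches above.
     */
    static bool stack_is_nmi_stack(uint64_t top, uint64_t sp)
    {
            if (sp > top)                           /* ja \normal_ret */
                    return false;
            if (sp < top - EXCEPTION_STKSZ)         /* jb \normal_ret */
                    return false;
            return true;                            /* jmp \nmi_ret */
    }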
1634 1639
1635 /* runs on exception stack */ 1640 /* runs on exception stack */
1636 ENTRY(nmi) 1641 ENTRY(nmi)
1637 INTR_FRAME 1642 INTR_FRAME
1638 PARAVIRT_ADJUST_EXCEPTION_FRAME 1643 PARAVIRT_ADJUST_EXCEPTION_FRAME
1639 /* 1644 /*
1640 * We allow breakpoints in NMIs. If a breakpoint occurs, then 1645 * We allow breakpoints in NMIs. If a breakpoint occurs, then
1641 * the iretq it performs will take us out of NMI context. 1646 * the iretq it performs will take us out of NMI context.
1642 * This means that we can have nested NMIs where the next 1647 * This means that we can have nested NMIs where the next
1643 * NMI is using the top of the stack of the previous NMI. We 1648 * NMI is using the top of the stack of the previous NMI. We
1644 * can't let it execute because the nested NMI will corrupt the 1649 * can't let it execute because the nested NMI will corrupt the
1645 * stack of the previous NMI. NMI handlers are not re-entrant 1650 * stack of the previous NMI. NMI handlers are not re-entrant
1646 * anyway. 1651 * anyway.
1647 * 1652 *
1648 * To handle this case we do the following: 1653 * To handle this case we do the following:
1649 * Check a special location on the stack that contains 1654 * Check a special location on the stack that contains
1650 * a variable that is set when NMIs are executing. 1655 * a variable that is set when NMIs are executing.
1651 * The interrupted task's stack is also checked to see if it 1656 * The interrupted task's stack is also checked to see if it
1652 * is an NMI stack. 1657 * is an NMI stack.
1653 * If the variable is not set and the stack is not the NMI 1658 * If the variable is not set and the stack is not the NMI
1654 * stack then: 1659 * stack then:
1655 * o Set the special variable on the stack 1660 * o Set the special variable on the stack
1656 * o Copy the interrupt frame into a "saved" location on the stack 1661 * o Copy the interrupt frame into a "saved" location on the stack
1657 * o Copy the interrupt frame into a "copy" location on the stack 1662 * o Copy the interrupt frame into a "copy" location on the stack
1658 * o Continue processing the NMI 1663 * o Continue processing the NMI
1659 * If the variable is set or the previous stack is the NMI stack: 1664 * If the variable is set or the previous stack is the NMI stack:
1660 * o Modify the "copy" location to jump to the repeat_nmi 1665 * o Modify the "copy" location to jump to the repeat_nmi
1661 * o return back to the first NMI 1666 * o return back to the first NMI
1662 * 1667 *
1663 * Now on exit of the first NMI, we first clear the stack variable. 1668 * Now on exit of the first NMI, we first clear the stack variable.
1664 * The NMI stack will tell any nested NMIs at that point that it is 1669 * The NMI stack will tell any nested NMIs at that point that it is
1665 * nested. Then we pop the stack normally with iret, and if there was 1670 * nested. Then we pop the stack normally with iret, and if there was
1666 * a nested NMI that updated the copy interrupt stack frame, a 1671 * a nested NMI that updated the copy interrupt stack frame, a
1667 * jump will be made to the repeat_nmi code that will handle the second 1672 * jump will be made to the repeat_nmi code that will handle the second
1668 * NMI. 1673 * NMI.
1669 */ 1674 */
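The decision spelled out in the comment above reduces to two boolean tests. A compact, purely illustrative C model (all names invented) is:

    #include <stdbool.h>

    enum nmi_path { FIRST_NMI, NESTED_NMI };

    /*
     * "executing" models the on-stack "NMI executing" variable and
     * "on_nmi_stack" the test_in_nmi check on the interrupted RSP.
     */
    static enum nmi_path classify_nmi(bool executing, bool on_nmi_stack)
    {
            if (executing || on_nmi_stack)
                    return NESTED_NMI;      /* patch the copied frame so the first
                                               NMI's iret lands in repeat_nmi */
            return FIRST_NMI;               /* set the variable, save/copy the
                                               frame, handle the NMI normally */
    }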
1670 1675
1671 /* Use %rdx as our temp variable throughout */ 1676 /* Use %rdx as our temp variable throughout */
1672 pushq_cfi %rdx 1677 pushq_cfi %rdx
1673 CFI_REL_OFFSET rdx, 0 1678 CFI_REL_OFFSET rdx, 0
1674 1679
1675 /* 1680 /*
1676 * If %cs was not the kernel segment, then the NMI triggered in user 1681 * If %cs was not the kernel segment, then the NMI triggered in user
1677 * space, which means it is definitely not nested. 1682 * space, which means it is definitely not nested.
1678 */ 1683 */
1679 cmpl $__KERNEL_CS, 16(%rsp) 1684 cmpl $__KERNEL_CS, 16(%rsp)
1680 jne first_nmi 1685 jne first_nmi
1681 1686
1682 /* 1687 /*
1683 * Check the special variable on the stack to see if NMIs are 1688 * Check the special variable on the stack to see if NMIs are
1684 * executing. 1689 * executing.
1685 */ 1690 */
1686 cmpl $1, -8(%rsp) 1691 cmpl $1, -8(%rsp)
1687 je nested_nmi 1692 je nested_nmi
1688 1693
1689 /* 1694 /*
1690 * Now test if the previous stack was an NMI stack. 1695 * Now test if the previous stack was an NMI stack.
1691 * We need the double check. We check the NMI stack to satisfy the 1696 * We need the double check. We check the NMI stack to satisfy the
1692 * race when the first NMI clears the variable before returning. 1697 * race when the first NMI clears the variable before returning.
1693 * We check the variable because the first NMI could be in a 1698 * We check the variable because the first NMI could be in a
1694 * breakpoint routine using a breakpoint stack. 1699 * breakpoint routine using a breakpoint stack.
1695 */ 1700 */
1696 lea 6*8(%rsp), %rdx 1701 lea 6*8(%rsp), %rdx
1697 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi 1702 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
1698 CFI_REMEMBER_STATE 1703 CFI_REMEMBER_STATE
1699 1704
1700 nested_nmi: 1705 nested_nmi:
1701 /* 1706 /*
1702 * Do nothing if we interrupted the fixup in repeat_nmi. 1707 * Do nothing if we interrupted the fixup in repeat_nmi.
1703 * It's about to repeat the NMI handler, so we are fine 1708 * It's about to repeat the NMI handler, so we are fine
1704 * with ignoring this one. 1709 * with ignoring this one.
1705 */ 1710 */
1706 movq $repeat_nmi, %rdx 1711 movq $repeat_nmi, %rdx
1707 cmpq 8(%rsp), %rdx 1712 cmpq 8(%rsp), %rdx
1708 ja 1f 1713 ja 1f
1709 movq $end_repeat_nmi, %rdx 1714 movq $end_repeat_nmi, %rdx
1710 cmpq 8(%rsp), %rdx 1715 cmpq 8(%rsp), %rdx
1711 ja nested_nmi_out 1716 ja nested_nmi_out
1712 1717
1713 1: 1718 1:
1714 /* Set up the interrupted NMI's stack to jump to repeat_nmi */ 1719 /* Set up the interrupted NMI's stack to jump to repeat_nmi */
1715 leaq -1*8(%rsp), %rdx 1720 leaq -1*8(%rsp), %rdx
1716 movq %rdx, %rsp 1721 movq %rdx, %rsp
1717 CFI_ADJUST_CFA_OFFSET 1*8 1722 CFI_ADJUST_CFA_OFFSET 1*8
1718 leaq -10*8(%rsp), %rdx 1723 leaq -10*8(%rsp), %rdx
1719 pushq_cfi $__KERNEL_DS 1724 pushq_cfi $__KERNEL_DS
1720 pushq_cfi %rdx 1725 pushq_cfi %rdx
1721 pushfq_cfi 1726 pushfq_cfi
1722 pushq_cfi $__KERNEL_CS 1727 pushq_cfi $__KERNEL_CS
1723 pushq_cfi $repeat_nmi 1728 pushq_cfi $repeat_nmi
1724 1729
1725 /* Put stack back */ 1730 /* Put stack back */
1726 addq $(6*8), %rsp 1731 addq $(6*8), %rsp
1727 CFI_ADJUST_CFA_OFFSET -6*8 1732 CFI_ADJUST_CFA_OFFSET -6*8
1728 1733
1729 nested_nmi_out: 1734 nested_nmi_out:
1730 popq_cfi %rdx 1735 popq_cfi %rdx
1731 CFI_RESTORE rdx 1736 CFI_RESTORE rdx
1732 1737
1733 /* No need to check faults here */ 1738 /* No need to check faults here */
1734 INTERRUPT_RETURN 1739 INTERRUPT_RETURN
1735 1740
1736 CFI_RESTORE_STATE 1741 CFI_RESTORE_STATE
1737 first_nmi: 1742 first_nmi:
1738 /* 1743 /*
1739 * Because nested NMIs will use the pushed location that we 1744 * Because nested NMIs will use the pushed location that we
1740 * stored in rdx, we must keep that space available. 1745 * stored in rdx, we must keep that space available.
1741 * Here's what our stack frame will look like: 1746 * Here's what our stack frame will look like:
1742 * +-------------------------+ 1747 * +-------------------------+
1743 * | original SS | 1748 * | original SS |
1744 * | original Return RSP | 1749 * | original Return RSP |
1745 * | original RFLAGS | 1750 * | original RFLAGS |
1746 * | original CS | 1751 * | original CS |
1747 * | original RIP | 1752 * | original RIP |
1748 * +-------------------------+ 1753 * +-------------------------+
1749 * | temp storage for rdx | 1754 * | temp storage for rdx |
1750 * +-------------------------+ 1755 * +-------------------------+
1751 * | NMI executing variable | 1756 * | NMI executing variable |
1752 * +-------------------------+ 1757 * +-------------------------+
1753 * | copied SS | 1758 * | copied SS |
1754 * | copied Return RSP | 1759 * | copied Return RSP |
1755 * | copied RFLAGS | 1760 * | copied RFLAGS |
1756 * | copied CS | 1761 * | copied CS |
1757 * | copied RIP | 1762 * | copied RIP |
1758 * +-------------------------+ 1763 * +-------------------------+
1759 * | Saved SS | 1764 * | Saved SS |
1760 * | Saved Return RSP | 1765 * | Saved Return RSP |
1761 * | Saved RFLAGS | 1766 * | Saved RFLAGS |
1762 * | Saved CS | 1767 * | Saved CS |
1763 * | Saved RIP | 1768 * | Saved RIP |
1764 * +-------------------------+ 1769 * +-------------------------+
1765 * | pt_regs | 1770 * | pt_regs |
1766 * +-------------------------+ 1771 * +-------------------------+
1767 * 1772 *
1768 * The saved stack frame is used to fix up the copied stack frame 1773 * The saved stack frame is used to fix up the copied stack frame
1769 * that a nested NMI may change to make the interrupted NMI iret jump 1774 * that a nested NMI may change to make the interrupted NMI iret jump
1770 * to the repeat_nmi. The original stack frame and the temp storage 1775 * to the repeat_nmi. The original stack frame and the temp storage
1771 * are also used by nested NMIs and cannot be trusted on exit. 1776 * are also used by nested NMIs and cannot be trusted on exit.
1772 */ 1777 */
1773 /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ 1778 /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
1774 movq (%rsp), %rdx 1779 movq (%rsp), %rdx
1775 CFI_RESTORE rdx 1780 CFI_RESTORE rdx
1776 1781
1777 /* Set the NMI executing variable on the stack. */ 1782 /* Set the NMI executing variable on the stack. */
1778 pushq_cfi $1 1783 pushq_cfi $1
1779 1784
1780 /* 1785 /*
1781 * Leave room for the "copied" frame 1786 * Leave room for the "copied" frame
1782 */ 1787 */
1783 subq $(5*8), %rsp 1788 subq $(5*8), %rsp
1784 CFI_ADJUST_CFA_OFFSET 5*8 1789 CFI_ADJUST_CFA_OFFSET 5*8
1785 1790
1786 /* Copy the stack frame to the Saved frame */ 1791 /* Copy the stack frame to the Saved frame */
1787 .rept 5 1792 .rept 5
1788 pushq_cfi 11*8(%rsp) 1793 pushq_cfi 11*8(%rsp)
1789 .endr 1794 .endr
1790 CFI_DEF_CFA_OFFSET SS+8-RIP 1795 CFI_DEF_CFA_OFFSET SS+8-RIP
1791 1796
1792 /* Everything up to here is safe from nested NMIs */ 1797 /* Everything up to here is safe from nested NMIs */
1793 1798
1794 /* 1799 /*
1795 * If there was a nested NMI, the first NMI's iret will return 1800 * If there was a nested NMI, the first NMI's iret will return
1796 * here. But NMIs are still enabled and we can take another 1801 * here. But NMIs are still enabled and we can take another
1797 * nested NMI. The nested NMI checks the interrupted RIP to see 1802 * nested NMI. The nested NMI checks the interrupted RIP to see
1798 * if it is between repeat_nmi and end_repeat_nmi, and if so 1803 * if it is between repeat_nmi and end_repeat_nmi, and if so
1799 * it will just return, as we are about to repeat an NMI anyway. 1804 * it will just return, as we are about to repeat an NMI anyway.
1800 * This makes it safe to copy to the stack frame that a nested 1805 * This makes it safe to copy to the stack frame that a nested
1801 * NMI will update. 1806 * NMI will update.
1802 */ 1807 */
1803 repeat_nmi: 1808 repeat_nmi:
1804 /* 1809 /*
1805 * Update the stack variable to say we are still in NMI (the update 1810 * Update the stack variable to say we are still in NMI (the update
1806 * is benign for the non-repeat case, where 1 was pushed just above 1811 * is benign for the non-repeat case, where 1 was pushed just above
1807 * to this very stack slot). 1812 * to this very stack slot).
1808 */ 1813 */
1809 movq $1, 10*8(%rsp) 1814 movq $1, 10*8(%rsp)
1810 1815
1811 /* Make another copy, this one may be modified by nested NMIs */ 1816 /* Make another copy, this one may be modified by nested NMIs */
1812 addq $(10*8), %rsp 1817 addq $(10*8), %rsp
1813 CFI_ADJUST_CFA_OFFSET -10*8 1818 CFI_ADJUST_CFA_OFFSET -10*8
1814 .rept 5 1819 .rept 5
1815 pushq_cfi -6*8(%rsp) 1820 pushq_cfi -6*8(%rsp)
1816 .endr 1821 .endr
1817 subq $(5*8), %rsp 1822 subq $(5*8), %rsp
1818 CFI_DEF_CFA_OFFSET SS+8-RIP 1823 CFI_DEF_CFA_OFFSET SS+8-RIP
1819 end_repeat_nmi: 1824 end_repeat_nmi:
1820 1825
1821 /* 1826 /*
1822 * Everything below this point can be preempted by a nested 1827 * Everything below this point can be preempted by a nested
1823 * NMI if the first NMI took an exception and reset our iret stack 1828 * NMI if the first NMI took an exception and reset our iret stack
1824 * so that we repeat another NMI. 1829 * so that we repeat another NMI.
1825 */ 1830 */
1826 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1831 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1827 subq $ORIG_RAX-R15, %rsp 1832 subq $ORIG_RAX-R15, %rsp
1828 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1833 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1829 /* 1834 /*
1830 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit 1835 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
1831 * as we should not be calling schedule in NMI context, 1836 * as we should not be calling schedule in NMI context,
1832 * even with normal interrupts enabled. An NMI should not be 1837 * even with normal interrupts enabled. An NMI should not be
1833 * setting NEED_RESCHED or anything that normal interrupts and 1838 * setting NEED_RESCHED or anything that normal interrupts and
1834 * exceptions might do. 1839 * exceptions might do.
1835 */ 1840 */
1836 call save_paranoid 1841 call save_paranoid
1837 DEFAULT_FRAME 0 1842 DEFAULT_FRAME 0
1838 1843
1839 /* 1844 /*
1840 * Save off the CR2 register. If we take a page fault in the NMI then 1845 * Save off the CR2 register. If we take a page fault in the NMI then
1841 * it could corrupt the CR2 value. If the NMI preempts a page fault 1846 * it could corrupt the CR2 value. If the NMI preempts a page fault
1842 * handler before it was able to read the CR2 register, and then the 1847 * handler before it was able to read the CR2 register, and then the
1843 * NMI itself takes a page fault, the page fault that was preempted 1848 * NMI itself takes a page fault, the page fault that was preempted
1844 * will read the information from the NMI page fault and not the 1849 * will read the information from the NMI page fault and not the
1845 * original fault. Save it off and restore it if it changes. 1850 * original fault. Save it off and restore it if it changes.
1846 * Use the r12 callee-saved register. 1851 * Use the r12 callee-saved register.
1847 */ 1852 */
1848 movq %cr2, %r12 1853 movq %cr2, %r12
1849 1854
1850 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1855 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1851 movq %rsp,%rdi 1856 movq %rsp,%rdi
1852 movq $-1,%rsi 1857 movq $-1,%rsi
1853 call do_nmi 1858 call do_nmi
1854 1859
1855 /* Did the NMI take a page fault? Restore cr2 if it did */ 1860 /* Did the NMI take a page fault? Restore cr2 if it did */
1856 movq %cr2, %rcx 1861 movq %cr2, %rcx
1857 cmpq %rcx, %r12 1862 cmpq %rcx, %r12
1858 je 1f 1863 je 1f
1859 movq %r12, %cr2 1864 movq %r12, %cr2
1860 1: 1865 1:
1861 1866
1862 testl %ebx,%ebx /* swapgs needed? */ 1867 testl %ebx,%ebx /* swapgs needed? */
1863 jnz nmi_restore 1868 jnz nmi_restore
1864 nmi_swapgs: 1869 nmi_swapgs:
1865 SWAPGS_UNSAFE_STACK 1870 SWAPGS_UNSAFE_STACK
1866 nmi_restore: 1871 nmi_restore:
1867 /* Pop the extra iret frame at once */ 1872 /* Pop the extra iret frame at once */
1868 RESTORE_ALL 6*8 1873 RESTORE_ALL 6*8
1869 1874
1870 /* Clear the NMI executing stack variable */ 1875 /* Clear the NMI executing stack variable */
1871 movq $0, 5*8(%rsp) 1876 movq $0, 5*8(%rsp)
1872 jmp irq_return 1877 jmp irq_return
1873 CFI_ENDPROC 1878 CFI_ENDPROC
1874 END(nmi) 1879 END(nmi)
1875 1880
1876 ENTRY(ignore_sysret) 1881 ENTRY(ignore_sysret)
1877 CFI_STARTPROC 1882 CFI_STARTPROC
1878 mov $-ENOSYS,%eax 1883 mov $-ENOSYS,%eax
1879 sysret 1884 sysret
1880 CFI_ENDPROC 1885 CFI_ENDPROC
1881 END(ignore_sysret) 1886 END(ignore_sysret)
1882 1887
1883 /* 1888 /*
1884 * End of kprobes section 1889 * End of kprobes section
1885 */ 1890 */
1886 .popsection 1891 .popsection
1887 1892
drivers/xen/events.c
1 /* 1 /*
2 * Xen event channels 2 * Xen event channels
3 * 3 *
4 * Xen models interrupts with abstract event channels. Because each 4 * Xen models interrupts with abstract event channels. Because each
5 * domain gets 1024 event channels, but NR_IRQS is not that large, we 5 * domain gets 1024 event channels, but NR_IRQS is not that large, we
6 * must dynamically map irqs<->event channels. The event channels 6 * must dynamically map irqs<->event channels. The event channels
7 * interface with the rest of the kernel by defining a xen interrupt 7 * interface with the rest of the kernel by defining a xen interrupt
8 * chip. When an event is received, it is mapped to an irq and sent 8 * chip. When an event is received, it is mapped to an irq and sent
9 * through the normal interrupt processing path. 9 * through the normal interrupt processing path.
10 * 10 *
11 * There are four kinds of events which can be mapped to an event 11 * There are four kinds of events which can be mapped to an event
12 * channel: 12 * channel:
13 * 13 *
14 * 1. Inter-domain notifications. This includes all the virtual 14 * 1. Inter-domain notifications. This includes all the virtual
15 * device events, since they're driven by front-ends in another domain 15 * device events, since they're driven by front-ends in another domain
16 * (typically dom0). 16 * (typically dom0).
17 * 2. VIRQs, typically used for timers. These are per-cpu events. 17 * 2. VIRQs, typically used for timers. These are per-cpu events.
18 * 3. IPIs. 18 * 3. IPIs.
19 * 4. PIRQs - Hardware interrupts. 19 * 4. PIRQs - Hardware interrupts.
20 * 20 *
21 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 21 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
22 */ 22 */
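In practice drivers never touch raw event-channel ports directly; they bind a port to a Linux irq and attach a handler through helpers exported by this file. A hedged usage sketch built on bind_evtchn_to_irqhandler() (the handler and wrapper names are invented for illustration) might look like:

    #include <linux/interrupt.h>
    #include <xen/events.h>

    /* Illustrative handler: runs through the normal irq path once the port fires. */
    static irqreturn_t my_frontend_interrupt(int irq, void *dev_id)
    {
            /* The event channel is already mapped to "irq"; masking/acking is
             * handled by the event-channel irq chip defined in this file. */
            return IRQ_HANDLED;
    }

    /* Illustrative caller: bind an inter-domain event channel to an irq. */
    static int my_frontend_connect(unsigned int evtchn, void *dev)
    {
            int irq;

            /* Allocates a dynamic irq, maps the port to it and installs the handler. */
            irq = bind_evtchn_to_irqhandler(evtchn, my_frontend_interrupt,
                                            0, "my-frontend", dev);
            return irq < 0 ? irq : 0;
    }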
23 23
24 #include <linux/linkage.h> 24 #include <linux/linkage.h>
25 #include <linux/interrupt.h> 25 #include <linux/interrupt.h>
26 #include <linux/irq.h> 26 #include <linux/irq.h>
27 #include <linux/module.h> 27 #include <linux/module.h>
28 #include <linux/string.h> 28 #include <linux/string.h>
29 #include <linux/bootmem.h> 29 #include <linux/bootmem.h>
30 #include <linux/slab.h> 30 #include <linux/slab.h>
31 #include <linux/irqnr.h> 31 #include <linux/irqnr.h>
32 #include <linux/pci.h> 32 #include <linux/pci.h>
33 33
34 #ifdef CONFIG_X86 34 #ifdef CONFIG_X86
35 #include <asm/desc.h> 35 #include <asm/desc.h>
36 #include <asm/ptrace.h> 36 #include <asm/ptrace.h>
37 #include <asm/irq.h> 37 #include <asm/irq.h>
38 #include <asm/idle.h> 38 #include <asm/idle.h>
39 #include <asm/io_apic.h> 39 #include <asm/io_apic.h>
40 #include <asm/xen/page.h> 40 #include <asm/xen/page.h>
41 #include <asm/xen/pci.h> 41 #include <asm/xen/pci.h>
42 #endif 42 #endif
43 #include <asm/sync_bitops.h> 43 #include <asm/sync_bitops.h>
44 #include <asm/xen/hypercall.h> 44 #include <asm/xen/hypercall.h>
45 #include <asm/xen/hypervisor.h> 45 #include <asm/xen/hypervisor.h>
46 46
47 #include <xen/xen.h> 47 #include <xen/xen.h>
48 #include <xen/hvm.h> 48 #include <xen/hvm.h>
49 #include <xen/xen-ops.h> 49 #include <xen/xen-ops.h>
50 #include <xen/events.h> 50 #include <xen/events.h>
51 #include <xen/interface/xen.h> 51 #include <xen/interface/xen.h>
52 #include <xen/interface/event_channel.h> 52 #include <xen/interface/event_channel.h>
53 #include <xen/interface/hvm/hvm_op.h> 53 #include <xen/interface/hvm/hvm_op.h>
54 #include <xen/interface/hvm/params.h> 54 #include <xen/interface/hvm/params.h>
55 #include <xen/interface/physdev.h> 55 #include <xen/interface/physdev.h>
56 #include <xen/interface/sched.h> 56 #include <xen/interface/sched.h>
57 #include <asm/hw_irq.h> 57 #include <asm/hw_irq.h>
58 58
59 /* 59 /*
60 * This lock protects updates to the following mapping and reference-count 60 * This lock protects updates to the following mapping and reference-count
61 * arrays. The lock does not need to be acquired to read the mapping tables. 61 * arrays. The lock does not need to be acquired to read the mapping tables.
62 */ 62 */
63 static DEFINE_MUTEX(irq_mapping_update_lock); 63 static DEFINE_MUTEX(irq_mapping_update_lock);
64 64
65 static LIST_HEAD(xen_irq_list_head); 65 static LIST_HEAD(xen_irq_list_head);
66 66
67 /* IRQ <-> VIRQ mapping. */ 67 /* IRQ <-> VIRQ mapping. */
68 static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; 68 static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
69 69
70 /* IRQ <-> IPI mapping */ 70 /* IRQ <-> IPI mapping */
71 static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; 71 static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
72 72
73 /* Interrupt types. */ 73 /* Interrupt types. */
74 enum xen_irq_type { 74 enum xen_irq_type {
75 IRQT_UNBOUND = 0, 75 IRQT_UNBOUND = 0,
76 IRQT_PIRQ, 76 IRQT_PIRQ,
77 IRQT_VIRQ, 77 IRQT_VIRQ,
78 IRQT_IPI, 78 IRQT_IPI,
79 IRQT_EVTCHN 79 IRQT_EVTCHN
80 }; 80 };
81 81
82 /* 82 /*
83 * Packed IRQ information: 83 * Packed IRQ information:
84 * type - enum xen_irq_type 84 * type - enum xen_irq_type
85 * event channel - irq->event channel mapping 85 * event channel - irq->event channel mapping
86 * cpu - cpu this event channel is bound to 86 * cpu - cpu this event channel is bound to
87 * index - type-specific information: 87 * index - type-specific information:
88 * PIRQ - vector, with MSB being "needs EOI", or physical IRQ of the HVM 88 * PIRQ - vector, with MSB being "needs EOI", or physical IRQ of the HVM
89 * guest, or GSI (real passthrough IRQ) of the device. 89 * guest, or GSI (real passthrough IRQ) of the device.
90 * VIRQ - virq number 90 * VIRQ - virq number
91 * IPI - IPI vector 91 * IPI - IPI vector
92 * EVTCHN - 92 * EVTCHN -
93 */ 93 */
94 struct irq_info { 94 struct irq_info {
95 struct list_head list; 95 struct list_head list;
96 int refcnt; 96 int refcnt;
97 enum xen_irq_type type; /* type */ 97 enum xen_irq_type type; /* type */
98 unsigned irq; 98 unsigned irq;
99 unsigned short evtchn; /* event channel */ 99 unsigned short evtchn; /* event channel */
100 unsigned short cpu; /* cpu bound */ 100 unsigned short cpu; /* cpu bound */
101 101
102 union { 102 union {
103 unsigned short virq; 103 unsigned short virq;
104 enum ipi_vector ipi; 104 enum ipi_vector ipi;
105 struct { 105 struct {
106 unsigned short pirq; 106 unsigned short pirq;
107 unsigned short gsi; 107 unsigned short gsi;
108 unsigned char vector; 108 unsigned char vector;
109 unsigned char flags; 109 unsigned char flags;
110 uint16_t domid; 110 uint16_t domid;
111 } pirq; 111 } pirq;
112 } u; 112 } u;
113 }; 113 };
114 #define PIRQ_NEEDS_EOI (1 << 0) 114 #define PIRQ_NEEDS_EOI (1 << 0)
115 #define PIRQ_SHAREABLE (1 << 1) 115 #define PIRQ_SHAREABLE (1 << 1)
116 116
117 static int *evtchn_to_irq; 117 static int *evtchn_to_irq;
118 #ifdef CONFIG_X86 118 #ifdef CONFIG_X86
119 static unsigned long *pirq_eoi_map; 119 static unsigned long *pirq_eoi_map;
120 #endif 120 #endif
121 static bool (*pirq_needs_eoi)(unsigned irq); 121 static bool (*pirq_needs_eoi)(unsigned irq);
122 122
123 static DEFINE_PER_CPU(unsigned long [NR_EVENT_CHANNELS/BITS_PER_LONG], 123 static DEFINE_PER_CPU(unsigned long [NR_EVENT_CHANNELS/BITS_PER_LONG],
124 cpu_evtchn_mask); 124 cpu_evtchn_mask);
125 125
126 /* Xen will never allocate port zero for any purpose. */ 126 /* Xen will never allocate port zero for any purpose. */
127 #define VALID_EVTCHN(chn) ((chn) != 0) 127 #define VALID_EVTCHN(chn) ((chn) != 0)
128 128
129 static struct irq_chip xen_dynamic_chip; 129 static struct irq_chip xen_dynamic_chip;
130 static struct irq_chip xen_percpu_chip; 130 static struct irq_chip xen_percpu_chip;
131 static struct irq_chip xen_pirq_chip; 131 static struct irq_chip xen_pirq_chip;
132 static void enable_dynirq(struct irq_data *data); 132 static void enable_dynirq(struct irq_data *data);
133 static void disable_dynirq(struct irq_data *data); 133 static void disable_dynirq(struct irq_data *data);
134 134
135 /* Get info for IRQ */ 135 /* Get info for IRQ */
136 static struct irq_info *info_for_irq(unsigned irq) 136 static struct irq_info *info_for_irq(unsigned irq)
137 { 137 {
138 return irq_get_handler_data(irq); 138 return irq_get_handler_data(irq);
139 } 139 }
140 140
141 /* Constructors for packed IRQ information. */ 141 /* Constructors for packed IRQ information. */
142 static void xen_irq_info_common_init(struct irq_info *info, 142 static void xen_irq_info_common_init(struct irq_info *info,
143 unsigned irq, 143 unsigned irq,
144 enum xen_irq_type type, 144 enum xen_irq_type type,
145 unsigned short evtchn, 145 unsigned short evtchn,
146 unsigned short cpu) 146 unsigned short cpu)
147 { 147 {
148 148
149 BUG_ON(info->type != IRQT_UNBOUND && info->type != type); 149 BUG_ON(info->type != IRQT_UNBOUND && info->type != type);
150 150
151 info->type = type; 151 info->type = type;
152 info->irq = irq; 152 info->irq = irq;
153 info->evtchn = evtchn; 153 info->evtchn = evtchn;
154 info->cpu = cpu; 154 info->cpu = cpu;
155 155
156 evtchn_to_irq[evtchn] = irq; 156 evtchn_to_irq[evtchn] = irq;
157 } 157 }
158 158
159 static void xen_irq_info_evtchn_init(unsigned irq, 159 static void xen_irq_info_evtchn_init(unsigned irq,
160 unsigned short evtchn) 160 unsigned short evtchn)
161 { 161 {
162 struct irq_info *info = info_for_irq(irq); 162 struct irq_info *info = info_for_irq(irq);
163 163
164 xen_irq_info_common_init(info, irq, IRQT_EVTCHN, evtchn, 0); 164 xen_irq_info_common_init(info, irq, IRQT_EVTCHN, evtchn, 0);
165 } 165 }
166 166
167 static void xen_irq_info_ipi_init(unsigned cpu, 167 static void xen_irq_info_ipi_init(unsigned cpu,
168 unsigned irq, 168 unsigned irq,
169 unsigned short evtchn, 169 unsigned short evtchn,
170 enum ipi_vector ipi) 170 enum ipi_vector ipi)
171 { 171 {
172 struct irq_info *info = info_for_irq(irq); 172 struct irq_info *info = info_for_irq(irq);
173 173
174 xen_irq_info_common_init(info, irq, IRQT_IPI, evtchn, 0); 174 xen_irq_info_common_init(info, irq, IRQT_IPI, evtchn, 0);
175 175
176 info->u.ipi = ipi; 176 info->u.ipi = ipi;
177 177
178 per_cpu(ipi_to_irq, cpu)[ipi] = irq; 178 per_cpu(ipi_to_irq, cpu)[ipi] = irq;
179 } 179 }
180 180
181 static void xen_irq_info_virq_init(unsigned cpu, 181 static void xen_irq_info_virq_init(unsigned cpu,
182 unsigned irq, 182 unsigned irq,
183 unsigned short evtchn, 183 unsigned short evtchn,
184 unsigned short virq) 184 unsigned short virq)
185 { 185 {
186 struct irq_info *info = info_for_irq(irq); 186 struct irq_info *info = info_for_irq(irq);
187 187
188 xen_irq_info_common_init(info, irq, IRQT_VIRQ, evtchn, 0); 188 xen_irq_info_common_init(info, irq, IRQT_VIRQ, evtchn, 0);
189 189
190 info->u.virq = virq; 190 info->u.virq = virq;
191 191
192 per_cpu(virq_to_irq, cpu)[virq] = irq; 192 per_cpu(virq_to_irq, cpu)[virq] = irq;
193 } 193 }
194 194
195 static void xen_irq_info_pirq_init(unsigned irq, 195 static void xen_irq_info_pirq_init(unsigned irq,
196 unsigned short evtchn, 196 unsigned short evtchn,
197 unsigned short pirq, 197 unsigned short pirq,
198 unsigned short gsi, 198 unsigned short gsi,
199 unsigned short vector, 199 unsigned short vector,
200 uint16_t domid, 200 uint16_t domid,
201 unsigned char flags) 201 unsigned char flags)
202 { 202 {
203 struct irq_info *info = info_for_irq(irq); 203 struct irq_info *info = info_for_irq(irq);
204 204
205 xen_irq_info_common_init(info, irq, IRQT_PIRQ, evtchn, 0); 205 xen_irq_info_common_init(info, irq, IRQT_PIRQ, evtchn, 0);
206 206
207 info->u.pirq.pirq = pirq; 207 info->u.pirq.pirq = pirq;
208 info->u.pirq.gsi = gsi; 208 info->u.pirq.gsi = gsi;
209 info->u.pirq.vector = vector; 209 info->u.pirq.vector = vector;
210 info->u.pirq.domid = domid; 210 info->u.pirq.domid = domid;
211 info->u.pirq.flags = flags; 211 info->u.pirq.flags = flags;
212 } 212 }
213 213
214 /* 214 /*
215 * Accessors for packed IRQ information. 215 * Accessors for packed IRQ information.
216 */ 216 */
217 static unsigned int evtchn_from_irq(unsigned irq) 217 static unsigned int evtchn_from_irq(unsigned irq)
218 { 218 {
219 if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq))) 219 if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq)))
220 return 0; 220 return 0;
221 221
222 return info_for_irq(irq)->evtchn; 222 return info_for_irq(irq)->evtchn;
223 } 223 }
224 224
225 unsigned irq_from_evtchn(unsigned int evtchn) 225 unsigned irq_from_evtchn(unsigned int evtchn)
226 { 226 {
227 return evtchn_to_irq[evtchn]; 227 return evtchn_to_irq[evtchn];
228 } 228 }
229 EXPORT_SYMBOL_GPL(irq_from_evtchn); 229 EXPORT_SYMBOL_GPL(irq_from_evtchn);
230 230
231 static enum ipi_vector ipi_from_irq(unsigned irq) 231 static enum ipi_vector ipi_from_irq(unsigned irq)
232 { 232 {
233 struct irq_info *info = info_for_irq(irq); 233 struct irq_info *info = info_for_irq(irq);
234 234
235 BUG_ON(info == NULL); 235 BUG_ON(info == NULL);
236 BUG_ON(info->type != IRQT_IPI); 236 BUG_ON(info->type != IRQT_IPI);
237 237
238 return info->u.ipi; 238 return info->u.ipi;
239 } 239 }
240 240
241 static unsigned virq_from_irq(unsigned irq) 241 static unsigned virq_from_irq(unsigned irq)
242 { 242 {
243 struct irq_info *info = info_for_irq(irq); 243 struct irq_info *info = info_for_irq(irq);
244 244
245 BUG_ON(info == NULL); 245 BUG_ON(info == NULL);
246 BUG_ON(info->type != IRQT_VIRQ); 246 BUG_ON(info->type != IRQT_VIRQ);
247 247
248 return info->u.virq; 248 return info->u.virq;
249 } 249 }
250 250
251 static unsigned pirq_from_irq(unsigned irq) 251 static unsigned pirq_from_irq(unsigned irq)
252 { 252 {
253 struct irq_info *info = info_for_irq(irq); 253 struct irq_info *info = info_for_irq(irq);
254 254
255 BUG_ON(info == NULL); 255 BUG_ON(info == NULL);
256 BUG_ON(info->type != IRQT_PIRQ); 256 BUG_ON(info->type != IRQT_PIRQ);
257 257
258 return info->u.pirq.pirq; 258 return info->u.pirq.pirq;
259 } 259 }
260 260
261 static enum xen_irq_type type_from_irq(unsigned irq) 261 static enum xen_irq_type type_from_irq(unsigned irq)
262 { 262 {
263 return info_for_irq(irq)->type; 263 return info_for_irq(irq)->type;
264 } 264 }
265 265
266 static unsigned cpu_from_irq(unsigned irq) 266 static unsigned cpu_from_irq(unsigned irq)
267 { 267 {
268 return info_for_irq(irq)->cpu; 268 return info_for_irq(irq)->cpu;
269 } 269 }
270 270
271 static unsigned int cpu_from_evtchn(unsigned int evtchn) 271 static unsigned int cpu_from_evtchn(unsigned int evtchn)
272 { 272 {
273 int irq = evtchn_to_irq[evtchn]; 273 int irq = evtchn_to_irq[evtchn];
274 unsigned ret = 0; 274 unsigned ret = 0;
275 275
276 if (irq != -1) 276 if (irq != -1)
277 ret = cpu_from_irq(irq); 277 ret = cpu_from_irq(irq);
278 278
279 return ret; 279 return ret;
280 } 280 }
281 281
282 #ifdef CONFIG_X86 282 #ifdef CONFIG_X86
283 static bool pirq_check_eoi_map(unsigned irq) 283 static bool pirq_check_eoi_map(unsigned irq)
284 { 284 {
285 return test_bit(pirq_from_irq(irq), pirq_eoi_map); 285 return test_bit(pirq_from_irq(irq), pirq_eoi_map);
286 } 286 }
287 #endif 287 #endif
288 288
289 static bool pirq_needs_eoi_flag(unsigned irq) 289 static bool pirq_needs_eoi_flag(unsigned irq)
290 { 290 {
291 struct irq_info *info = info_for_irq(irq); 291 struct irq_info *info = info_for_irq(irq);
292 BUG_ON(info->type != IRQT_PIRQ); 292 BUG_ON(info->type != IRQT_PIRQ);
293 293
294 return info->u.pirq.flags & PIRQ_NEEDS_EOI; 294 return info->u.pirq.flags & PIRQ_NEEDS_EOI;
295 } 295 }
296 296
297 static inline unsigned long active_evtchns(unsigned int cpu, 297 static inline unsigned long active_evtchns(unsigned int cpu,
298 struct shared_info *sh, 298 struct shared_info *sh,
299 unsigned int idx) 299 unsigned int idx)
300 { 300 {
301 return sh->evtchn_pending[idx] & 301 return sh->evtchn_pending[idx] &
302 per_cpu(cpu_evtchn_mask, cpu)[idx] & 302 per_cpu(cpu_evtchn_mask, cpu)[idx] &
303 ~sh->evtchn_mask[idx]; 303 ~sh->evtchn_mask[idx];
304 } 304 }
305 305
306 static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) 306 static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
307 { 307 {
308 int irq = evtchn_to_irq[chn]; 308 int irq = evtchn_to_irq[chn];
309 309
310 BUG_ON(irq == -1); 310 BUG_ON(irq == -1);
311 #ifdef CONFIG_SMP 311 #ifdef CONFIG_SMP
312 cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); 312 cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu));
313 #endif 313 #endif
314 314
315 clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_from_irq(irq))); 315 clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_from_irq(irq)));
316 set_bit(chn, per_cpu(cpu_evtchn_mask, cpu)); 316 set_bit(chn, per_cpu(cpu_evtchn_mask, cpu));
317 317
318 info_for_irq(irq)->cpu = cpu; 318 info_for_irq(irq)->cpu = cpu;
319 } 319 }
320 320
321 static void init_evtchn_cpu_bindings(void) 321 static void init_evtchn_cpu_bindings(void)
322 { 322 {
323 int i; 323 int i;
324 #ifdef CONFIG_SMP 324 #ifdef CONFIG_SMP
325 struct irq_info *info; 325 struct irq_info *info;
326 326
327 /* By default all event channels notify CPU#0. */ 327 /* By default all event channels notify CPU#0. */
328 list_for_each_entry(info, &xen_irq_list_head, list) { 328 list_for_each_entry(info, &xen_irq_list_head, list) {
329 struct irq_desc *desc = irq_to_desc(info->irq); 329 struct irq_desc *desc = irq_to_desc(info->irq);
330 cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); 330 cpumask_copy(desc->irq_data.affinity, cpumask_of(0));
331 } 331 }
332 #endif 332 #endif
333 333
334 for_each_possible_cpu(i) 334 for_each_possible_cpu(i)
335 memset(per_cpu(cpu_evtchn_mask, i), 335 memset(per_cpu(cpu_evtchn_mask, i),
336 (i == 0) ? ~0 : 0, sizeof(*per_cpu(cpu_evtchn_mask, i))); 336 (i == 0) ? ~0 : 0, sizeof(*per_cpu(cpu_evtchn_mask, i)));
337 } 337 }
338 338
339 static inline void clear_evtchn(int port) 339 static inline void clear_evtchn(int port)
340 { 340 {
341 struct shared_info *s = HYPERVISOR_shared_info; 341 struct shared_info *s = HYPERVISOR_shared_info;
342 sync_clear_bit(port, &s->evtchn_pending[0]); 342 sync_clear_bit(port, &s->evtchn_pending[0]);
343 } 343 }
344 344
345 static inline void set_evtchn(int port) 345 static inline void set_evtchn(int port)
346 { 346 {
347 struct shared_info *s = HYPERVISOR_shared_info; 347 struct shared_info *s = HYPERVISOR_shared_info;
348 sync_set_bit(port, &s->evtchn_pending[0]); 348 sync_set_bit(port, &s->evtchn_pending[0]);
349 } 349 }
350 350
351 static inline int test_evtchn(int port) 351 static inline int test_evtchn(int port)
352 { 352 {
353 struct shared_info *s = HYPERVISOR_shared_info; 353 struct shared_info *s = HYPERVISOR_shared_info;
354 return sync_test_bit(port, &s->evtchn_pending[0]); 354 return sync_test_bit(port, &s->evtchn_pending[0]);
355 } 355 }
356 356
357 357
358 /** 358 /**
359 * notify_remote_via_irq - send event to remote end of event channel via irq 359 * notify_remote_via_irq - send event to remote end of event channel via irq
360 * @irq: irq of event channel to send event to 360 * @irq: irq of event channel to send event to
361 * 361 *
362 * Unlike notify_remote_via_evtchn(), this is safe to use across 362 * Unlike notify_remote_via_evtchn(), this is safe to use across
363 * save/restore. Notifications on a broken connection are silently 363 * save/restore. Notifications on a broken connection are silently
364 * dropped. 364 * dropped.
365 */ 365 */
366 void notify_remote_via_irq(int irq) 366 void notify_remote_via_irq(int irq)
367 { 367 {
368 int evtchn = evtchn_from_irq(irq); 368 int evtchn = evtchn_from_irq(irq);
369 369
370 if (VALID_EVTCHN(evtchn)) 370 if (VALID_EVTCHN(evtchn))
371 notify_remote_via_evtchn(evtchn); 371 notify_remote_via_evtchn(evtchn);
372 } 372 }
373 EXPORT_SYMBOL_GPL(notify_remote_via_irq); 373 EXPORT_SYMBOL_GPL(notify_remote_via_irq);
374 374
375 static void mask_evtchn(int port) 375 static void mask_evtchn(int port)
376 { 376 {
377 struct shared_info *s = HYPERVISOR_shared_info; 377 struct shared_info *s = HYPERVISOR_shared_info;
378 sync_set_bit(port, &s->evtchn_mask[0]); 378 sync_set_bit(port, &s->evtchn_mask[0]);
379 } 379 }
380 380
381 static void unmask_evtchn(int port) 381 static void unmask_evtchn(int port)
382 { 382 {
383 struct shared_info *s = HYPERVISOR_shared_info; 383 struct shared_info *s = HYPERVISOR_shared_info;
384 unsigned int cpu = get_cpu(); 384 unsigned int cpu = get_cpu();
385 int do_hypercall = 0, evtchn_pending = 0; 385 int do_hypercall = 0, evtchn_pending = 0;
386 386
387 BUG_ON(!irqs_disabled()); 387 BUG_ON(!irqs_disabled());
388 388
389 if (unlikely((cpu != cpu_from_evtchn(port)))) 389 if (unlikely((cpu != cpu_from_evtchn(port))))
390 do_hypercall = 1; 390 do_hypercall = 1;
391 else 391 else
392 evtchn_pending = sync_test_bit(port, &s->evtchn_pending[0]); 392 evtchn_pending = sync_test_bit(port, &s->evtchn_pending[0]);
393 393
394 if (unlikely(evtchn_pending && xen_hvm_domain())) 394 if (unlikely(evtchn_pending && xen_hvm_domain()))
395 do_hypercall = 1; 395 do_hypercall = 1;
396 396
397 /* Slow path (hypercall) if this is a non-local port or if this is 397 /* Slow path (hypercall) if this is a non-local port or if this is
398 * an hvm domain and an event is pending (hvm domains don't have 398 * an hvm domain and an event is pending (hvm domains don't have
399 * their own implementation of irq_enable). */ 399 * their own implementation of irq_enable). */
400 if (do_hypercall) { 400 if (do_hypercall) {
401 struct evtchn_unmask unmask = { .port = port }; 401 struct evtchn_unmask unmask = { .port = port };
402 (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); 402 (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
403 } else { 403 } else {
404 struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); 404 struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
405 405
406 sync_clear_bit(port, &s->evtchn_mask[0]); 406 sync_clear_bit(port, &s->evtchn_mask[0]);
407 407
408 /* 408 /*
409 * The following is basically the equivalent of 409 * The following is basically the equivalent of
410 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose 410 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
411 * the interrupt edge' if the channel is masked. 411 * the interrupt edge' if the channel is masked.
412 */ 412 */
413 if (evtchn_pending && 413 if (evtchn_pending &&
414 !sync_test_and_set_bit(port / BITS_PER_LONG, 414 !sync_test_and_set_bit(port / BITS_PER_LONG,
415 &vcpu_info->evtchn_pending_sel)) 415 &vcpu_info->evtchn_pending_sel))
416 vcpu_info->evtchn_upcall_pending = 1; 416 vcpu_info->evtchn_upcall_pending = 1;
417 } 417 }
418 418
419 put_cpu(); 419 put_cpu();
420 } 420 }
421 421
422 static void xen_irq_init(unsigned irq) 422 static void xen_irq_init(unsigned irq)
423 { 423 {
424 struct irq_info *info; 424 struct irq_info *info;
425 #ifdef CONFIG_SMP 425 #ifdef CONFIG_SMP
426 struct irq_desc *desc = irq_to_desc(irq); 426 struct irq_desc *desc = irq_to_desc(irq);
427 427
428 /* By default all event channels notify CPU#0. */ 428 /* By default all event channels notify CPU#0. */
429 cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); 429 cpumask_copy(desc->irq_data.affinity, cpumask_of(0));
430 #endif 430 #endif
431 431
432 info = kzalloc(sizeof(*info), GFP_KERNEL); 432 info = kzalloc(sizeof(*info), GFP_KERNEL);
433 if (info == NULL) 433 if (info == NULL)
434 panic("Unable to allocate metadata for IRQ%d\n", irq); 434 panic("Unable to allocate metadata for IRQ%d\n", irq);
435 435
436 info->type = IRQT_UNBOUND; 436 info->type = IRQT_UNBOUND;
437 info->refcnt = -1; 437 info->refcnt = -1;
438 438
439 irq_set_handler_data(irq, info); 439 irq_set_handler_data(irq, info);
440 440
441 list_add_tail(&info->list, &xen_irq_list_head); 441 list_add_tail(&info->list, &xen_irq_list_head);
442 } 442 }
443 443
444 static int __must_check xen_allocate_irq_dynamic(void) 444 static int __must_check xen_allocate_irq_dynamic(void)
445 { 445 {
446 int first = 0; 446 int first = 0;
447 int irq; 447 int irq;
448 448
449 #ifdef CONFIG_X86_IO_APIC 449 #ifdef CONFIG_X86_IO_APIC
450 /* 450 /*
451 * For an HVM guest or domain 0 which see "real" (emulated or 451 * For an HVM guest or domain 0 which see "real" (emulated or
452 * actual respectively) GSIs we allocate dynamic IRQs 452 * actual respectively) GSIs we allocate dynamic IRQs
453 * e.g. those corresponding to event channels or MSIs 453 * e.g. those corresponding to event channels or MSIs
454 * etc. from the range above those "real" GSIs to avoid 454 * etc. from the range above those "real" GSIs to avoid
455 * collisions. 455 * collisions.
456 */ 456 */
457 if (xen_initial_domain() || xen_hvm_domain()) 457 if (xen_initial_domain() || xen_hvm_domain())
458 first = get_nr_irqs_gsi(); 458 first = get_nr_irqs_gsi();
459 #endif 459 #endif
460 460
461 irq = irq_alloc_desc_from(first, -1); 461 irq = irq_alloc_desc_from(first, -1);
462 462
463 if (irq >= 0) 463 if (irq >= 0)
464 xen_irq_init(irq); 464 xen_irq_init(irq);
465 465
466 return irq; 466 return irq;
467 } 467 }
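
Annotation: xen_allocate_irq_dynamic() keeps event-channel and MSI IRQs out of the range occupied by "real" GSIs on HVM guests and dom0 by starting the descriptor search above get_nr_irqs_gsi(). A tiny sketch of that allocation policy, with made-up table sizes and a trivial allocator standing in for irq_alloc_desc_from():

        #include <stdio.h>

        #define NR_IRQS_SIM 64
        #define NR_GSIS_SIM 24              /* stand-in for get_nr_irqs_gsi() */

        static int used[NR_IRQS_SIM];

        static int alloc_desc_from(int first)   /* models irq_alloc_desc_from() */
        {
                int irq;

                for (irq = first; irq < NR_IRQS_SIM; irq++) {
                        if (!used[irq]) {
                                used[irq] = 1;
                                return irq;
                        }
                }
                return -1;
        }

        int main(void)
        {
                /* HVM/dom0 case: skip the GSI range entirely. */
                printf("first dynamic irq: %d\n", alloc_desc_from(NR_GSIS_SIM));
                printf("next  dynamic irq: %d\n", alloc_desc_from(NR_GSIS_SIM));
                return 0;
        }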
468 468
469 static int __must_check xen_allocate_irq_gsi(unsigned gsi) 469 static int __must_check xen_allocate_irq_gsi(unsigned gsi)
470 { 470 {
471 int irq; 471 int irq;
472 472
473 /* 473 /*
474 * A PV guest has no concept of a GSI (since it has no ACPI 474 * A PV guest has no concept of a GSI (since it has no ACPI
475 * nor access to/knowledge of the physical APICs). Therefore 475 * nor access to/knowledge of the physical APICs). Therefore
476 * all IRQs are dynamically allocated from the entire IRQ 476 * all IRQs are dynamically allocated from the entire IRQ
477 * space. 477 * space.
478 */ 478 */
479 if (xen_pv_domain() && !xen_initial_domain()) 479 if (xen_pv_domain() && !xen_initial_domain())
480 return xen_allocate_irq_dynamic(); 480 return xen_allocate_irq_dynamic();
481 481
482 /* Legacy IRQ descriptors are already allocated by the arch. */ 482 /* Legacy IRQ descriptors are already allocated by the arch. */
483 if (gsi < NR_IRQS_LEGACY) 483 if (gsi < NR_IRQS_LEGACY)
484 irq = gsi; 484 irq = gsi;
485 else 485 else
486 irq = irq_alloc_desc_at(gsi, -1); 486 irq = irq_alloc_desc_at(gsi, -1);
487 487
488 xen_irq_init(irq); 488 xen_irq_init(irq);
489 489
490 return irq; 490 return irq;
491 } 491 }
492 492
493 static void xen_free_irq(unsigned irq) 493 static void xen_free_irq(unsigned irq)
494 { 494 {
495 struct irq_info *info = irq_get_handler_data(irq); 495 struct irq_info *info = irq_get_handler_data(irq);
496 496
497 list_del(&info->list); 497 list_del(&info->list);
498 498
499 irq_set_handler_data(irq, NULL); 499 irq_set_handler_data(irq, NULL);
500 500
501 WARN_ON(info->refcnt > 0); 501 WARN_ON(info->refcnt > 0);
502 502
503 kfree(info); 503 kfree(info);
504 504
505 /* Legacy IRQ descriptors are managed by the arch. */ 505 /* Legacy IRQ descriptors are managed by the arch. */
506 if (irq < NR_IRQS_LEGACY) 506 if (irq < NR_IRQS_LEGACY)
507 return; 507 return;
508 508
509 irq_free_desc(irq); 509 irq_free_desc(irq);
510 } 510 }
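
Annotation: xen_free_irq() always releases the per-IRQ metadata but leaves legacy descriptors (irq < NR_IRQS_LEGACY) to the architecture code. A toy model of that rule; the counters and the legacy cut-off below are invented for illustration:

        #include <stdio.h>

        #define NR_IRQS_LEGACY_SIM 16

        static int metadata_freed, descs_freed;

        static void sim_free_irq(int irq)
        {
                metadata_freed++;                 /* kfree(info)               */

                if (irq < NR_IRQS_LEGACY_SIM)
                        return;                   /* arch keeps legacy descriptors */

                descs_freed++;                    /* irq_free_desc(irq)        */
        }

        int main(void)
        {
                sim_free_irq(4);                  /* legacy: metadata only          */
                sim_free_irq(40);                 /* dynamic: metadata + descriptor */
                printf("metadata=%d descriptors=%d\n", metadata_freed, descs_freed);
                return 0;
        }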
511 511
512 static void pirq_query_unmask(int irq) 512 static void pirq_query_unmask(int irq)
513 { 513 {
514 struct physdev_irq_status_query irq_status; 514 struct physdev_irq_status_query irq_status;
515 struct irq_info *info = info_for_irq(irq); 515 struct irq_info *info = info_for_irq(irq);
516 516
517 BUG_ON(info->type != IRQT_PIRQ); 517 BUG_ON(info->type != IRQT_PIRQ);
518 518
519 irq_status.irq = pirq_from_irq(irq); 519 irq_status.irq = pirq_from_irq(irq);
520 if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) 520 if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
521 irq_status.flags = 0; 521 irq_status.flags = 0;
522 522
523 info->u.pirq.flags &= ~PIRQ_NEEDS_EOI; 523 info->u.pirq.flags &= ~PIRQ_NEEDS_EOI;
524 if (irq_status.flags & XENIRQSTAT_needs_eoi) 524 if (irq_status.flags & XENIRQSTAT_needs_eoi)
525 info->u.pirq.flags |= PIRQ_NEEDS_EOI; 525 info->u.pirq.flags |= PIRQ_NEEDS_EOI;
526 } 526 }
527 527
528 static bool probing_irq(int irq) 528 static bool probing_irq(int irq)
529 { 529 {
530 struct irq_desc *desc = irq_to_desc(irq); 530 struct irq_desc *desc = irq_to_desc(irq);
531 531
532 return desc && desc->action == NULL; 532 return desc && desc->action == NULL;
533 } 533 }
534 534
535 static void eoi_pirq(struct irq_data *data) 535 static void eoi_pirq(struct irq_data *data)
536 { 536 {
537 int evtchn = evtchn_from_irq(data->irq); 537 int evtchn = evtchn_from_irq(data->irq);
538 struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) }; 538 struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) };
539 int rc = 0; 539 int rc = 0;
540 540
541 irq_move_irq(data); 541 irq_move_irq(data);
542 542
543 if (VALID_EVTCHN(evtchn)) 543 if (VALID_EVTCHN(evtchn))
544 clear_evtchn(evtchn); 544 clear_evtchn(evtchn);
545 545
546 if (pirq_needs_eoi(data->irq)) { 546 if (pirq_needs_eoi(data->irq)) {
547 rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); 547 rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
548 WARN_ON(rc); 548 WARN_ON(rc);
549 } 549 }
550 } 550 }
551 551
552 static void mask_ack_pirq(struct irq_data *data) 552 static void mask_ack_pirq(struct irq_data *data)
553 { 553 {
554 disable_dynirq(data); 554 disable_dynirq(data);
555 eoi_pirq(data); 555 eoi_pirq(data);
556 } 556 }
557 557
558 static unsigned int __startup_pirq(unsigned int irq) 558 static unsigned int __startup_pirq(unsigned int irq)
559 { 559 {
560 struct evtchn_bind_pirq bind_pirq; 560 struct evtchn_bind_pirq bind_pirq;
561 struct irq_info *info = info_for_irq(irq); 561 struct irq_info *info = info_for_irq(irq);
562 int evtchn = evtchn_from_irq(irq); 562 int evtchn = evtchn_from_irq(irq);
563 int rc; 563 int rc;
564 564
565 BUG_ON(info->type != IRQT_PIRQ); 565 BUG_ON(info->type != IRQT_PIRQ);
566 566
567 if (VALID_EVTCHN(evtchn)) 567 if (VALID_EVTCHN(evtchn))
568 goto out; 568 goto out;
569 569
570 bind_pirq.pirq = pirq_from_irq(irq); 570 bind_pirq.pirq = pirq_from_irq(irq);
571 /* NB. We are happy to share unless we are probing. */ 571 /* NB. We are happy to share unless we are probing. */
572 bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? 572 bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
573 BIND_PIRQ__WILL_SHARE : 0; 573 BIND_PIRQ__WILL_SHARE : 0;
574 rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); 574 rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
575 if (rc != 0) { 575 if (rc != 0) {
576 if (!probing_irq(irq)) 576 if (!probing_irq(irq))
577 printk(KERN_INFO "Failed to obtain physical IRQ %d\n", 577 printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
578 irq); 578 irq);
579 return 0; 579 return 0;
580 } 580 }
581 evtchn = bind_pirq.port; 581 evtchn = bind_pirq.port;
582 582
583 pirq_query_unmask(irq); 583 pirq_query_unmask(irq);
584 584
585 evtchn_to_irq[evtchn] = irq; 585 evtchn_to_irq[evtchn] = irq;
586 bind_evtchn_to_cpu(evtchn, 0); 586 bind_evtchn_to_cpu(evtchn, 0);
587 info->evtchn = evtchn; 587 info->evtchn = evtchn;
588 588
589 out: 589 out:
590 unmask_evtchn(evtchn); 590 unmask_evtchn(evtchn);
591 eoi_pirq(irq_get_irq_data(irq)); 591 eoi_pirq(irq_get_irq_data(irq));
592 592
593 return 0; 593 return 0;
594 } 594 }
595 595
596 static unsigned int startup_pirq(struct irq_data *data) 596 static unsigned int startup_pirq(struct irq_data *data)
597 { 597 {
598 return __startup_pirq(data->irq); 598 return __startup_pirq(data->irq);
599 } 599 }
600 600
601 static void shutdown_pirq(struct irq_data *data) 601 static void shutdown_pirq(struct irq_data *data)
602 { 602 {
603 struct evtchn_close close; 603 struct evtchn_close close;
604 unsigned int irq = data->irq; 604 unsigned int irq = data->irq;
605 struct irq_info *info = info_for_irq(irq); 605 struct irq_info *info = info_for_irq(irq);
606 int evtchn = evtchn_from_irq(irq); 606 int evtchn = evtchn_from_irq(irq);
607 607
608 BUG_ON(info->type != IRQT_PIRQ); 608 BUG_ON(info->type != IRQT_PIRQ);
609 609
610 if (!VALID_EVTCHN(evtchn)) 610 if (!VALID_EVTCHN(evtchn))
611 return; 611 return;
612 612
613 mask_evtchn(evtchn); 613 mask_evtchn(evtchn);
614 614
615 close.port = evtchn; 615 close.port = evtchn;
616 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) 616 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
617 BUG(); 617 BUG();
618 618
619 bind_evtchn_to_cpu(evtchn, 0); 619 bind_evtchn_to_cpu(evtchn, 0);
620 evtchn_to_irq[evtchn] = -1; 620 evtchn_to_irq[evtchn] = -1;
621 info->evtchn = 0; 621 info->evtchn = 0;
622 } 622 }
623 623
624 static void enable_pirq(struct irq_data *data) 624 static void enable_pirq(struct irq_data *data)
625 { 625 {
626 startup_pirq(data); 626 startup_pirq(data);
627 } 627 }
628 628
629 static void disable_pirq(struct irq_data *data) 629 static void disable_pirq(struct irq_data *data)
630 { 630 {
631 disable_dynirq(data); 631 disable_dynirq(data);
632 } 632 }
633 633
634 int xen_irq_from_gsi(unsigned gsi) 634 int xen_irq_from_gsi(unsigned gsi)
635 { 635 {
636 struct irq_info *info; 636 struct irq_info *info;
637 637
638 list_for_each_entry(info, &xen_irq_list_head, list) { 638 list_for_each_entry(info, &xen_irq_list_head, list) {
639 if (info->type != IRQT_PIRQ) 639 if (info->type != IRQT_PIRQ)
640 continue; 640 continue;
641 641
642 if (info->u.pirq.gsi == gsi) 642 if (info->u.pirq.gsi == gsi)
643 return info->irq; 643 return info->irq;
644 } 644 }
645 645
646 return -1; 646 return -1;
647 } 647 }
648 EXPORT_SYMBOL_GPL(xen_irq_from_gsi); 648 EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
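
Annotation: xen_irq_from_gsi() walks the irq_info records, skips anything that is not a PIRQ, and returns the Linux IRQ already bound to the GSI (or -1). A userspace sketch of that lookup; the record table is invented for the example and an array replaces the kernel's linked list:

        #include <stdio.h>

        enum sim_type { SIM_UNBOUND, SIM_EVTCHN, SIM_PIRQ };

        struct sim_info {
                enum sim_type type;
                int irq;
                int gsi;        /* only meaningful for SIM_PIRQ entries */
        };

        static int sim_irq_from_gsi(const struct sim_info *tbl, int n, int gsi)
        {
                for (int i = 0; i < n; i++) {
                        if (tbl[i].type != SIM_PIRQ)
                                continue;
                        if (tbl[i].gsi == gsi)
                                return tbl[i].irq;
                }
                return -1;
        }

        int main(void)
        {
                struct sim_info tbl[] = {
                        { SIM_EVTCHN, 32, 0  },
                        { SIM_PIRQ,   40, 9  },
                        { SIM_PIRQ,   41, 16 },
                };

                printf("gsi 16 -> irq %d\n", sim_irq_from_gsi(tbl, 3, 16));
                printf("gsi 5  -> irq %d\n", sim_irq_from_gsi(tbl, 3, 5));
                return 0;
        }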
649 649
650 /* 650 /*
651 * Do not make any assumptions regarding the relationship between the 651 * Do not make any assumptions regarding the relationship between the
652 * IRQ number returned here and the Xen pirq argument. 652 * IRQ number returned here and the Xen pirq argument.
653 * 653 *
654 * Note: We don't assign an event channel until the irq actually started 654 * Note: We don't assign an event channel until the irq actually started
655 * up. Return an existing irq if we've already got one for the gsi. 655 * up. Return an existing irq if we've already got one for the gsi.
656 * 656 *
657 * Shareable implies level triggered, not shareable implies edge 657 * Shareable implies level triggered, not shareable implies edge
658 * triggered here. 658 * triggered here.
659 */ 659 */
660 int xen_bind_pirq_gsi_to_irq(unsigned gsi, 660 int xen_bind_pirq_gsi_to_irq(unsigned gsi,
661 unsigned pirq, int shareable, char *name) 661 unsigned pirq, int shareable, char *name)
662 { 662 {
663 int irq = -1; 663 int irq = -1;
664 struct physdev_irq irq_op; 664 struct physdev_irq irq_op;
665 665
666 mutex_lock(&irq_mapping_update_lock); 666 mutex_lock(&irq_mapping_update_lock);
667 667
668 irq = xen_irq_from_gsi(gsi); 668 irq = xen_irq_from_gsi(gsi);
669 if (irq != -1) { 669 if (irq != -1) {
670 printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n", 670 printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n",
671 irq, gsi); 671 irq, gsi);
672 goto out; 672 goto out;
673 } 673 }
674 674
675 irq = xen_allocate_irq_gsi(gsi); 675 irq = xen_allocate_irq_gsi(gsi);
676 if (irq < 0) 676 if (irq < 0)
677 goto out; 677 goto out;
678 678
679 irq_op.irq = irq; 679 irq_op.irq = irq;
680 irq_op.vector = 0; 680 irq_op.vector = 0;
681 681
682 /* Only the privileged domain can do this. For non-priv, the pcifront 682 /* Only the privileged domain can do this. For non-priv, the pcifront
683 * driver provides a PCI bus that does the call to do exactly 683 * driver provides a PCI bus that does the call to do exactly
684 * this in the priv domain. */ 684 * this in the priv domain. */
685 if (xen_initial_domain() && 685 if (xen_initial_domain() &&
686 HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { 686 HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
687 xen_free_irq(irq); 687 xen_free_irq(irq);
688 irq = -ENOSPC; 688 irq = -ENOSPC;
689 goto out; 689 goto out;
690 } 690 }
691 691
692 xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, DOMID_SELF, 692 xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, DOMID_SELF,
693 shareable ? PIRQ_SHAREABLE : 0); 693 shareable ? PIRQ_SHAREABLE : 0);
694 694
695 pirq_query_unmask(irq); 695 pirq_query_unmask(irq);
696 /* We try to use the handler with the appropriate semantic for the 696 /* We try to use the handler with the appropriate semantic for the
697 * type of interrupt: if the interrupt is an edge triggered 697 * type of interrupt: if the interrupt is an edge triggered
698 * interrupt we use handle_edge_irq. 698 * interrupt we use handle_edge_irq.
699 * 699 *
700 * On the other hand if the interrupt is level triggered we use 700 * On the other hand if the interrupt is level triggered we use
701 * handle_fasteoi_irq like the native code does for this kind of 701 * handle_fasteoi_irq like the native code does for this kind of
702 * interrupts. 702 * interrupts.
703 * 703 *
704 * Depending on the Xen version, pirq_needs_eoi might return true 704 * Depending on the Xen version, pirq_needs_eoi might return true
705 * not only for level triggered interrupts but for edge triggered 705 * not only for level triggered interrupts but for edge triggered
706 * interrupts too. In any case Xen always honors the eoi mechanism, 706 * interrupts too. In any case Xen always honors the eoi mechanism,
707 * not injecting any more pirqs of the same kind if the first one 707 * not injecting any more pirqs of the same kind if the first one
708 * hasn't received an eoi yet. Therefore using the fasteoi handler 708 * hasn't received an eoi yet. Therefore using the fasteoi handler
709 * is the right choice either way. 709 * is the right choice either way.
710 */ 710 */
711 if (shareable) 711 if (shareable)
712 irq_set_chip_and_handler_name(irq, &xen_pirq_chip, 712 irq_set_chip_and_handler_name(irq, &xen_pirq_chip,
713 handle_fasteoi_irq, name); 713 handle_fasteoi_irq, name);
714 else 714 else
715 irq_set_chip_and_handler_name(irq, &xen_pirq_chip, 715 irq_set_chip_and_handler_name(irq, &xen_pirq_chip,
716 handle_edge_irq, name); 716 handle_edge_irq, name);
717 717
718 out: 718 out:
719 mutex_unlock(&irq_mapping_update_lock); 719 mutex_unlock(&irq_mapping_update_lock);
720 720
721 return irq; 721 return irq;
722 } 722 }
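
Annotation: the long comment above boils down to one rule: a shareable GSI is treated as level triggered and gets handle_fasteoi_irq, an exclusive one is treated as edge triggered and gets handle_edge_irq, and either way Xen withholds further pirqs until the EOI. A trivial sketch of just that selection rule; the strings stand in for the flow handlers and this is not the irq_set_chip_and_handler_name() API:

        #include <stdio.h>

        static const char *pirq_flow_handler(int shareable)
        {
                return shareable ? "handle_fasteoi_irq" : "handle_edge_irq";
        }

        int main(void)
        {
                printf("shareable GSI -> %s\n", pirq_flow_handler(1));
                printf("exclusive GSI -> %s\n", pirq_flow_handler(0));
                return 0;
        }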
723 723
724 #ifdef CONFIG_PCI_MSI 724 #ifdef CONFIG_PCI_MSI
725 int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) 725 int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc)
726 { 726 {
727 int rc; 727 int rc;
728 struct physdev_get_free_pirq op_get_free_pirq; 728 struct physdev_get_free_pirq op_get_free_pirq;
729 729
730 op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI; 730 op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI;
731 rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); 731 rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
732 732
733 WARN_ONCE(rc == -ENOSYS, 733 WARN_ONCE(rc == -ENOSYS,
734 "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n"); 734 "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n");
735 735
736 return rc ? -1 : op_get_free_pirq.pirq; 736 return rc ? -1 : op_get_free_pirq.pirq;
737 } 737 }
738 738
739 int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, 739 int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
740 int pirq, int vector, const char *name, 740 int pirq, int vector, const char *name,
741 domid_t domid) 741 domid_t domid)
742 { 742 {
743 int irq, ret; 743 int irq, ret;
744 744
745 mutex_lock(&irq_mapping_update_lock); 745 mutex_lock(&irq_mapping_update_lock);
746 746
747 irq = xen_allocate_irq_dynamic(); 747 irq = xen_allocate_irq_dynamic();
748 if (irq < 0) 748 if (irq < 0)
749 goto out; 749 goto out;
750 750
751 irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, 751 irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq,
752 name); 752 name);
753 753
754 xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, domid, 0); 754 xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, domid, 0);
755 ret = irq_set_msi_desc(irq, msidesc); 755 ret = irq_set_msi_desc(irq, msidesc);
756 if (ret < 0) 756 if (ret < 0)
757 goto error_irq; 757 goto error_irq;
758 out: 758 out:
759 mutex_unlock(&irq_mapping_update_lock); 759 mutex_unlock(&irq_mapping_update_lock);
760 return irq; 760 return irq;
761 error_irq: 761 error_irq:
762 mutex_unlock(&irq_mapping_update_lock); 762 mutex_unlock(&irq_mapping_update_lock);
763 xen_free_irq(irq); 763 xen_free_irq(irq);
764 return ret; 764 return ret;
765 } 765 }
766 #endif 766 #endif
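
Annotation: the error path in xen_bind_pirq_msi_to_irq() drops irq_mapping_update_lock before calling xen_free_irq(). A compact pthread sketch of that goto-based unwind and "unlock, then clean up" ordering; the mutex name, the malloc'ed stand-in for the IRQ and the fail flag are all invented for illustration:

        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>

        static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;

        static int attach_desc(int fail)   /* stands in for irq_set_msi_desc() */
        {
                return fail ? -1 : 0;
        }

        static int sim_bind_msi(int fail)
        {
                int *irq, ret;

                pthread_mutex_lock(&map_lock);

                irq = malloc(sizeof(*irq));        /* xen_allocate_irq_dynamic() */
                if (!irq) {
                        ret = -1;
                        goto out;
                }
                *irq = 72;

                ret = attach_desc(fail);
                if (ret < 0)
                        goto error_irq;

                ret = *irq;   /* on success the irq stays allocated, as in the kernel */
        out:
                pthread_mutex_unlock(&map_lock);
                return ret;
        error_irq:
                pthread_mutex_unlock(&map_lock);
                free(irq);                         /* xen_free_irq() after unlocking */
                return ret;
        }

        int main(void)
        {
                printf("bind ok:   irq %d\n", sim_bind_msi(0));
                printf("bind fail: %d\n", sim_bind_msi(1));
                return 0;
        }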
767 767
768 int xen_destroy_irq(int irq) 768 int xen_destroy_irq(int irq)
769 { 769 {
770 struct irq_desc *desc; 770 struct irq_desc *desc;
771 struct physdev_unmap_pirq unmap_irq; 771 struct physdev_unmap_pirq unmap_irq;
772 struct irq_info *info = info_for_irq(irq); 772 struct irq_info *info = info_for_irq(irq);
773 int rc = -ENOENT; 773 int rc = -ENOENT;
774 774
775 mutex_lock(&irq_mapping_update_lock); 775 mutex_lock(&irq_mapping_update_lock);
776 776
777 desc = irq_to_desc(irq); 777 desc = irq_to_desc(irq);
778 if (!desc) 778 if (!desc)
779 goto out; 779 goto out;
780 780
781 if (xen_initial_domain()) { 781 if (xen_initial_domain()) {
782 unmap_irq.pirq = info->u.pirq.pirq; 782 unmap_irq.pirq = info->u.pirq.pirq;
783 unmap_irq.domid = info->u.pirq.domid; 783 unmap_irq.domid = info->u.pirq.domid;
784 rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); 784 rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
785 /* If another domain quits without making the pci_disable_msix 785 /* If another domain quits without making the pci_disable_msix
786 * call, the Xen hypervisor takes care of freeing the PIRQs 786 * call, the Xen hypervisor takes care of freeing the PIRQs
787 * (free_domain_pirqs). 787 * (free_domain_pirqs).
788 */ 788 */
789 if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF)) 789 if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF))
790 printk(KERN_INFO "domain %d does not have %d anymore\n", 790 printk(KERN_INFO "domain %d does not have %d anymore\n",
791 info->u.pirq.domid, info->u.pirq.pirq); 791 info->u.pirq.domid, info->u.pirq.pirq);
792 else if (rc) { 792 else if (rc) {
793 printk(KERN_WARNING "unmap irq failed %d\n", rc); 793 printk(KERN_WARNING "unmap irq failed %d\n", rc);
794 goto out; 794 goto out;
795 } 795 }
796 } 796 }
797 797
798 xen_free_irq(irq); 798 xen_free_irq(irq);
799 799
800 out: 800 out:
801 mutex_unlock(&irq_mapping_update_lock); 801 mutex_unlock(&irq_mapping_update_lock);
802 return rc; 802 return rc;
803 } 803 }
804 804
805 int xen_irq_from_pirq(unsigned pirq) 805 int xen_irq_from_pirq(unsigned pirq)
806 { 806 {
807 int irq; 807 int irq;
808 808
809 struct irq_info *info; 809 struct irq_info *info;
810 810
811 mutex_lock(&irq_mapping_update_lock); 811 mutex_lock(&irq_mapping_update_lock);
812 812
813 list_for_each_entry(info, &xen_irq_list_head, list) { 813 list_for_each_entry(info, &xen_irq_list_head, list) {
814 if (info->type != IRQT_PIRQ) 814 if (info->type != IRQT_PIRQ)
815 continue; 815 continue;
816 irq = info->irq; 816 irq = info->irq;
817 if (info->u.pirq.pirq == pirq) 817 if (info->u.pirq.pirq == pirq)
818 goto out; 818 goto out;
819 } 819 }
820 irq = -1; 820 irq = -1;
821 out: 821 out:
822 mutex_unlock(&irq_mapping_update_lock); 822 mutex_unlock(&irq_mapping_update_lock);
823 823
824 return irq; 824 return irq;
825 } 825 }
826 826
827 827
828 int xen_pirq_from_irq(unsigned irq) 828 int xen_pirq_from_irq(unsigned irq)
829 { 829 {
830 return pirq_from_irq(irq); 830 return pirq_from_irq(irq);
831 } 831 }
832 EXPORT_SYMBOL_GPL(xen_pirq_from_irq); 832 EXPORT_SYMBOL_GPL(xen_pirq_from_irq);
833 int bind_evtchn_to_irq(unsigned int evtchn) 833 int bind_evtchn_to_irq(unsigned int evtchn)
834 { 834 {
835 int irq; 835 int irq;
836 836
837 mutex_lock(&irq_mapping_update_lock); 837 mutex_lock(&irq_mapping_update_lock);
838 838
839 irq = evtchn_to_irq[evtchn]; 839 irq = evtchn_to_irq[evtchn];
840 840
841 if (irq == -1) { 841 if (irq == -1) {
842 irq = xen_allocate_irq_dynamic(); 842 irq = xen_allocate_irq_dynamic();
843 if (irq < 0) 843 if (irq < 0)
844 goto out; 844 goto out;
845 845
846 irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, 846 irq_set_chip_and_handler_name(irq, &xen_dynamic_chip,
847 handle_edge_irq, "event"); 847 handle_edge_irq, "event");
848 848
849 xen_irq_info_evtchn_init(irq, evtchn); 849 xen_irq_info_evtchn_init(irq, evtchn);
850 } else { 850 } else {
851 struct irq_info *info = info_for_irq(irq); 851 struct irq_info *info = info_for_irq(irq);
852 WARN_ON(info == NULL || info->type != IRQT_EVTCHN); 852 WARN_ON(info == NULL || info->type != IRQT_EVTCHN);
853 } 853 }
854 irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); 854 irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN);
855 855
856 out: 856 out:
857 mutex_unlock(&irq_mapping_update_lock); 857 mutex_unlock(&irq_mapping_update_lock);
858 858
859 return irq; 859 return irq;
860 } 860 }
861 EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); 861 EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
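
Annotation: bind_evtchn_to_irq() is idempotent because of the evtchn_to_irq table: entries start at -1, the first bind allocates a fresh IRQ and records it, and a repeat bind of the same port simply returns the existing IRQ. A sketch of that bookkeeping; the table size and trivial allocator are invented:

        #include <stdio.h>

        #define NR_EVTCHN_SIM 128

        static int evtchn_to_irq_sim[NR_EVTCHN_SIM];
        static int next_irq = 32;

        static int sim_bind_evtchn(int evtchn)
        {
                if (evtchn_to_irq_sim[evtchn] == -1)
                        evtchn_to_irq_sim[evtchn] = next_irq++;
                return evtchn_to_irq_sim[evtchn];
        }

        int main(void)
        {
                for (int i = 0; i < NR_EVTCHN_SIM; i++)
                        evtchn_to_irq_sim[i] = -1;

                printf("first bind of port 7:  irq %d\n", sim_bind_evtchn(7));
                printf("second bind of port 7: irq %d\n", sim_bind_evtchn(7));
                printf("first bind of port 9:  irq %d\n", sim_bind_evtchn(9));
                return 0;
        }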
862 862
863 static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) 863 static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
864 { 864 {
865 struct evtchn_bind_ipi bind_ipi; 865 struct evtchn_bind_ipi bind_ipi;
866 int evtchn, irq; 866 int evtchn, irq;
867 867
868 mutex_lock(&irq_mapping_update_lock); 868 mutex_lock(&irq_mapping_update_lock);
869 869
870 irq = per_cpu(ipi_to_irq, cpu)[ipi]; 870 irq = per_cpu(ipi_to_irq, cpu)[ipi];
871 871
872 if (irq == -1) { 872 if (irq == -1) {
873 irq = xen_allocate_irq_dynamic(); 873 irq = xen_allocate_irq_dynamic();
874 if (irq < 0) 874 if (irq < 0)
875 goto out; 875 goto out;
876 876
877 irq_set_chip_and_handler_name(irq, &xen_percpu_chip, 877 irq_set_chip_and_handler_name(irq, &xen_percpu_chip,
878 handle_percpu_irq, "ipi"); 878 handle_percpu_irq, "ipi");
879 879
880 bind_ipi.vcpu = cpu; 880 bind_ipi.vcpu = cpu;
881 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, 881 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
882 &bind_ipi) != 0) 882 &bind_ipi) != 0)
883 BUG(); 883 BUG();
884 evtchn = bind_ipi.port; 884 evtchn = bind_ipi.port;
885 885
886 xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); 886 xen_irq_info_ipi_init(cpu, irq, evtchn, ipi);
887 887
888 bind_evtchn_to_cpu(evtchn, cpu); 888 bind_evtchn_to_cpu(evtchn, cpu);
889 } else { 889 } else {
890 struct irq_info *info = info_for_irq(irq); 890 struct irq_info *info = info_for_irq(irq);
891 WARN_ON(info == NULL || info->type != IRQT_IPI); 891 WARN_ON(info == NULL || info->type != IRQT_IPI);
892 } 892 }
893 893
894 out: 894 out:
895 mutex_unlock(&irq_mapping_update_lock); 895 mutex_unlock(&irq_mapping_update_lock);
896 return irq; 896 return irq;
897 } 897 }
898 898
899 static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, 899 static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
900 unsigned int remote_port) 900 unsigned int remote_port)
901 { 901 {
902 struct evtchn_bind_interdomain bind_interdomain; 902 struct evtchn_bind_interdomain bind_interdomain;
903 int err; 903 int err;
904 904
905 bind_interdomain.remote_dom = remote_domain; 905 bind_interdomain.remote_dom = remote_domain;
906 bind_interdomain.remote_port = remote_port; 906 bind_interdomain.remote_port = remote_port;
907 907
908 err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, 908 err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
909 &bind_interdomain); 909 &bind_interdomain);
910 910
911 return err ? : bind_evtchn_to_irq(bind_interdomain.local_port); 911 return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
912 } 912 }
913 913
914 static int find_virq(unsigned int virq, unsigned int cpu) 914 static int find_virq(unsigned int virq, unsigned int cpu)
915 { 915 {
916 struct evtchn_status status; 916 struct evtchn_status status;
917 int port, rc = -ENOENT; 917 int port, rc = -ENOENT;
918 918
919 memset(&status, 0, sizeof(status)); 919 memset(&status, 0, sizeof(status));
920 for (port = 0; port <= NR_EVENT_CHANNELS; port++) { 920 for (port = 0; port <= NR_EVENT_CHANNELS; port++) {
921 status.dom = DOMID_SELF; 921 status.dom = DOMID_SELF;
922 status.port = port; 922 status.port = port;
923 rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); 923 rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status);
924 if (rc < 0) 924 if (rc < 0)
925 continue; 925 continue;
926 if (status.status != EVTCHNSTAT_virq) 926 if (status.status != EVTCHNSTAT_virq)
927 continue; 927 continue;
928 if (status.u.virq == virq && status.vcpu == cpu) { 928 if (status.u.virq == virq && status.vcpu == cpu) {
929 rc = port; 929 rc = port;
930 break; 930 break;
931 } 931 }
932 } 932 }
933 return rc; 933 return rc;
934 } 934 }
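
Annotation: find_virq() is the recovery path for when EVTCHNOP_bind_virq reports -EEXIST, for example after kexec: it queries the status of every port and picks the one already bound to the requested VIRQ on the requested VCPU. A userspace sketch of that scan; the status table stands in for EVTCHNOP_status and is invented for the example:

        #include <stdio.h>

        #define NR_PORTS_SIM 32

        enum sim_stat { STAT_CLOSED, STAT_VIRQ, STAT_IPI };

        struct sim_status {
                enum sim_stat status;
                int virq;
                int vcpu;
        };

        static int sim_find_virq(const struct sim_status *tbl, int virq, int cpu)
        {
                for (int port = 0; port < NR_PORTS_SIM; port++) {
                        if (tbl[port].status != STAT_VIRQ)
                                continue;
                        if (tbl[port].virq == virq && tbl[port].vcpu == cpu)
                                return port;
                }
                return -1;              /* -ENOENT in the kernel version */
        }

        int main(void)
        {
                struct sim_status tbl[NR_PORTS_SIM] = { { STAT_CLOSED, 0, 0 } };

                tbl[5] = (struct sim_status){ STAT_VIRQ, 1, 0 };
                tbl[9] = (struct sim_status){ STAT_VIRQ, 1, 2 };

                printf("virq 1 on vcpu 2 -> port %d\n", sim_find_virq(tbl, 1, 2));
                printf("virq 3 on vcpu 0 -> port %d\n", sim_find_virq(tbl, 3, 0));
                return 0;
        }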
935 935
936 int bind_virq_to_irq(unsigned int virq, unsigned int cpu) 936 int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
937 { 937 {
938 struct evtchn_bind_virq bind_virq; 938 struct evtchn_bind_virq bind_virq;
939 int evtchn, irq, ret; 939 int evtchn, irq, ret;
940 940
941 mutex_lock(&irq_mapping_update_lock); 941 mutex_lock(&irq_mapping_update_lock);
942 942
943 irq = per_cpu(virq_to_irq, cpu)[virq]; 943 irq = per_cpu(virq_to_irq, cpu)[virq];
944 944
945 if (irq == -1) { 945 if (irq == -1) {
946 irq = xen_allocate_irq_dynamic(); 946 irq = xen_allocate_irq_dynamic();
947 if (irq < 0) 947 if (irq < 0)
948 goto out; 948 goto out;
949 949
950 irq_set_chip_and_handler_name(irq, &xen_percpu_chip, 950 irq_set_chip_and_handler_name(irq, &xen_percpu_chip,
951 handle_percpu_irq, "virq"); 951 handle_percpu_irq, "virq");
952 952
953 bind_virq.virq = virq; 953 bind_virq.virq = virq;
954 bind_virq.vcpu = cpu; 954 bind_virq.vcpu = cpu;
955 ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, 955 ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
956 &bind_virq); 956 &bind_virq);
957 if (ret == 0) 957 if (ret == 0)
958 evtchn = bind_virq.port; 958 evtchn = bind_virq.port;
959 else { 959 else {
960 if (ret == -EEXIST) 960 if (ret == -EEXIST)
961 ret = find_virq(virq, cpu); 961 ret = find_virq(virq, cpu);
962 BUG_ON(ret < 0); 962 BUG_ON(ret < 0);
963 evtchn = ret; 963 evtchn = ret;
964 } 964 }
965 965
966 xen_irq_info_virq_init(cpu, irq, evtchn, virq); 966 xen_irq_info_virq_init(cpu, irq, evtchn, virq);
967 967
968 bind_evtchn_to_cpu(evtchn, cpu); 968 bind_evtchn_to_cpu(evtchn, cpu);
969 } else { 969 } else {
970 struct irq_info *info = info_for_irq(irq); 970 struct irq_info *info = info_for_irq(irq);
971 WARN_ON(info == NULL || info->type != IRQT_VIRQ); 971 WARN_ON(info == NULL || info->type != IRQT_VIRQ);
972 } 972 }
973 973
974 out: 974 out:
975 mutex_unlock(&irq_mapping_update_lock); 975 mutex_unlock(&irq_mapping_update_lock);
976 976
977 return irq; 977 return irq;
978 } 978 }
979 979
980 static void unbind_from_irq(unsigned int irq) 980 static void unbind_from_irq(unsigned int irq)
981 { 981 {
982 struct evtchn_close close; 982 struct evtchn_close close;
983 int evtchn = evtchn_from_irq(irq); 983 int evtchn = evtchn_from_irq(irq);
984 struct irq_info *info = irq_get_handler_data(irq); 984 struct irq_info *info = irq_get_handler_data(irq);
985 985
986 mutex_lock(&irq_mapping_update_lock); 986 mutex_lock(&irq_mapping_update_lock);
987 987
988 if (info->refcnt > 0) { 988 if (info->refcnt > 0) {
989 info->refcnt--; 989 info->refcnt--;
990 if (info->refcnt != 0) 990 if (info->refcnt != 0)
991 goto done; 991 goto done;
992 } 992 }
993 993
994 if (VALID_EVTCHN(evtchn)) { 994 if (VALID_EVTCHN(evtchn)) {
995 close.port = evtchn; 995 close.port = evtchn;
996 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) 996 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
997 BUG(); 997 BUG();
998 998
999 switch (type_from_irq(irq)) { 999 switch (type_from_irq(irq)) {
1000 case IRQT_VIRQ: 1000 case IRQT_VIRQ:
1001 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) 1001 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
1002 [virq_from_irq(irq)] = -1; 1002 [virq_from_irq(irq)] = -1;
1003 break; 1003 break;
1004 case IRQT_IPI: 1004 case IRQT_IPI:
1005 per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) 1005 per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
1006 [ipi_from_irq(irq)] = -1; 1006 [ipi_from_irq(irq)] = -1;
1007 break; 1007 break;
1008 default: 1008 default:
1009 break; 1009 break;
1010 } 1010 }
1011 1011
1012 /* Closed ports are implicitly re-bound to VCPU0. */ 1012 /* Closed ports are implicitly re-bound to VCPU0. */
1013 bind_evtchn_to_cpu(evtchn, 0); 1013 bind_evtchn_to_cpu(evtchn, 0);
1014 1014
1015 evtchn_to_irq[evtchn] = -1; 1015 evtchn_to_irq[evtchn] = -1;
1016 } 1016 }
1017 1017
1018 BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); 1018 BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND);
1019 1019
1020 xen_free_irq(irq); 1020 xen_free_irq(irq);
1021 1021
1022 done: 1022 done:
1023 mutex_unlock(&irq_mapping_update_lock); 1023 mutex_unlock(&irq_mapping_update_lock);
1024 } 1024 }
1025 1025
1026 int bind_evtchn_to_irqhandler(unsigned int evtchn, 1026 int bind_evtchn_to_irqhandler(unsigned int evtchn,
1027 irq_handler_t handler, 1027 irq_handler_t handler,
1028 unsigned long irqflags, 1028 unsigned long irqflags,
1029 const char *devname, void *dev_id) 1029 const char *devname, void *dev_id)
1030 { 1030 {
1031 int irq, retval; 1031 int irq, retval;
1032 1032
1033 irq = bind_evtchn_to_irq(evtchn); 1033 irq = bind_evtchn_to_irq(evtchn);
1034 if (irq < 0) 1034 if (irq < 0)
1035 return irq; 1035 return irq;
1036 retval = request_irq(irq, handler, irqflags, devname, dev_id); 1036 retval = request_irq(irq, handler, irqflags, devname, dev_id);
1037 if (retval != 0) { 1037 if (retval != 0) {
1038 unbind_from_irq(irq); 1038 unbind_from_irq(irq);
1039 return retval; 1039 return retval;
1040 } 1040 }
1041 1041
1042 return irq; 1042 return irq;
1043 } 1043 }
1044 EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); 1044 EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
1045 1045
1046 int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, 1046 int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
1047 unsigned int remote_port, 1047 unsigned int remote_port,
1048 irq_handler_t handler, 1048 irq_handler_t handler,
1049 unsigned long irqflags, 1049 unsigned long irqflags,
1050 const char *devname, 1050 const char *devname,
1051 void *dev_id) 1051 void *dev_id)
1052 { 1052 {
1053 int irq, retval; 1053 int irq, retval;
1054 1054
1055 irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); 1055 irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
1056 if (irq < 0) 1056 if (irq < 0)
1057 return irq; 1057 return irq;
1058 1058
1059 retval = request_irq(irq, handler, irqflags, devname, dev_id); 1059 retval = request_irq(irq, handler, irqflags, devname, dev_id);
1060 if (retval != 0) { 1060 if (retval != 0) {
1061 unbind_from_irq(irq); 1061 unbind_from_irq(irq);
1062 return retval; 1062 return retval;
1063 } 1063 }
1064 1064
1065 return irq; 1065 return irq;
1066 } 1066 }
1067 EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler); 1067 EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
1068 1068
1069 int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, 1069 int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
1070 irq_handler_t handler, 1070 irq_handler_t handler,
1071 unsigned long irqflags, const char *devname, void *dev_id) 1071 unsigned long irqflags, const char *devname, void *dev_id)
1072 { 1072 {
1073 int irq, retval; 1073 int irq, retval;
1074 1074
1075 irq = bind_virq_to_irq(virq, cpu); 1075 irq = bind_virq_to_irq(virq, cpu);
1076 if (irq < 0) 1076 if (irq < 0)
1077 return irq; 1077 return irq;
1078 retval = request_irq(irq, handler, irqflags, devname, dev_id); 1078 retval = request_irq(irq, handler, irqflags, devname, dev_id);
1079 if (retval != 0) { 1079 if (retval != 0) {
1080 unbind_from_irq(irq); 1080 unbind_from_irq(irq);
1081 return retval; 1081 return retval;
1082 } 1082 }
1083 1083
1084 return irq; 1084 return irq;
1085 } 1085 }
1086 EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); 1086 EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
1087 1087
1088 int bind_ipi_to_irqhandler(enum ipi_vector ipi, 1088 int bind_ipi_to_irqhandler(enum ipi_vector ipi,
1089 unsigned int cpu, 1089 unsigned int cpu,
1090 irq_handler_t handler, 1090 irq_handler_t handler,
1091 unsigned long irqflags, 1091 unsigned long irqflags,
1092 const char *devname, 1092 const char *devname,
1093 void *dev_id) 1093 void *dev_id)
1094 { 1094 {
1095 int irq, retval; 1095 int irq, retval;
1096 1096
1097 irq = bind_ipi_to_irq(ipi, cpu); 1097 irq = bind_ipi_to_irq(ipi, cpu);
1098 if (irq < 0) 1098 if (irq < 0)
1099 return irq; 1099 return irq;
1100 1100
1101 irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME | IRQF_EARLY_RESUME; 1101 irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME | IRQF_EARLY_RESUME;
1102 retval = request_irq(irq, handler, irqflags, devname, dev_id); 1102 retval = request_irq(irq, handler, irqflags, devname, dev_id);
1103 if (retval != 0) { 1103 if (retval != 0) {
1104 unbind_from_irq(irq); 1104 unbind_from_irq(irq);
1105 return retval; 1105 return retval;
1106 } 1106 }
1107 1107
1108 return irq; 1108 return irq;
1109 } 1109 }
1110 1110
1111 void unbind_from_irqhandler(unsigned int irq, void *dev_id) 1111 void unbind_from_irqhandler(unsigned int irq, void *dev_id)
1112 { 1112 {
1113 free_irq(irq, dev_id); 1113 free_irq(irq, dev_id);
1114 unbind_from_irq(irq); 1114 unbind_from_irq(irq);
1115 } 1115 }
1116 EXPORT_SYMBOL_GPL(unbind_from_irqhandler); 1116 EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
1117 1117
1118 int evtchn_make_refcounted(unsigned int evtchn) 1118 int evtchn_make_refcounted(unsigned int evtchn)
1119 { 1119 {
1120 int irq = evtchn_to_irq[evtchn]; 1120 int irq = evtchn_to_irq[evtchn];
1121 struct irq_info *info; 1121 struct irq_info *info;
1122 1122
1123 if (irq == -1) 1123 if (irq == -1)
1124 return -ENOENT; 1124 return -ENOENT;
1125 1125
1126 info = irq_get_handler_data(irq); 1126 info = irq_get_handler_data(irq);
1127 1127
1128 if (!info) 1128 if (!info)
1129 return -ENOENT; 1129 return -ENOENT;
1130 1130
1131 WARN_ON(info->refcnt != -1); 1131 WARN_ON(info->refcnt != -1);
1132 1132
1133 info->refcnt = 1; 1133 info->refcnt = 1;
1134 1134
1135 return 0; 1135 return 0;
1136 } 1136 }
1137 EXPORT_SYMBOL_GPL(evtchn_make_refcounted); 1137 EXPORT_SYMBOL_GPL(evtchn_make_refcounted);
1138 1138
1139 int evtchn_get(unsigned int evtchn) 1139 int evtchn_get(unsigned int evtchn)
1140 { 1140 {
1141 int irq; 1141 int irq;
1142 struct irq_info *info; 1142 struct irq_info *info;
1143 int err = -ENOENT; 1143 int err = -ENOENT;
1144 1144
1145 if (evtchn >= NR_EVENT_CHANNELS) 1145 if (evtchn >= NR_EVENT_CHANNELS)
1146 return -EINVAL; 1146 return -EINVAL;
1147 1147
1148 mutex_lock(&irq_mapping_update_lock); 1148 mutex_lock(&irq_mapping_update_lock);
1149 1149
1150 irq = evtchn_to_irq[evtchn]; 1150 irq = evtchn_to_irq[evtchn];
1151 if (irq == -1) 1151 if (irq == -1)
1152 goto done; 1152 goto done;
1153 1153
1154 info = irq_get_handler_data(irq); 1154 info = irq_get_handler_data(irq);
1155 1155
1156 if (!info) 1156 if (!info)
1157 goto done; 1157 goto done;
1158 1158
1159 err = -EINVAL; 1159 err = -EINVAL;
1160 if (info->refcnt <= 0) 1160 if (info->refcnt <= 0)
1161 goto done; 1161 goto done;
1162 1162
1163 info->refcnt++; 1163 info->refcnt++;
1164 err = 0; 1164 err = 0;
1165 done: 1165 done:
1166 mutex_unlock(&irq_mapping_update_lock); 1166 mutex_unlock(&irq_mapping_update_lock);
1167 1167
1168 return err; 1168 return err;
1169 } 1169 }
1170 EXPORT_SYMBOL_GPL(evtchn_get); 1170 EXPORT_SYMBOL_GPL(evtchn_get);
1171 1171
1172 void evtchn_put(unsigned int evtchn) 1172 void evtchn_put(unsigned int evtchn)
1173 { 1173 {
1174 int irq = evtchn_to_irq[evtchn]; 1174 int irq = evtchn_to_irq[evtchn];
1175 if (WARN_ON(irq == -1)) 1175 if (WARN_ON(irq == -1))
1176 return; 1176 return;
1177 unbind_from_irq(irq); 1177 unbind_from_irq(irq);
1178 } 1178 }
1179 EXPORT_SYMBOL_GPL(evtchn_put); 1179 EXPORT_SYMBOL_GPL(evtchn_put);
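
Annotation: evtchn_make_refcounted(), evtchn_get() and evtchn_put() implement the refcnt protocol that unbind_from_irq() honours above: refcnt == -1 means "not refcounted, tear down on the next unbind", while a channel made refcounted starts at 1, get() only increments a live count, and the final put() is what actually closes the port. A compact sketch of that lifecycle; the struct and the teardown message are illustrative only:

        #include <stdio.h>

        struct sim_info {
                int refcnt;             /* -1: not refcounted */
                int bound;
        };

        static void sim_make_refcounted(struct sim_info *info)
        {
                info->refcnt = 1;       /* kernel also WARNs if refcnt != -1 */
        }

        static int sim_get(struct sim_info *info)
        {
                if (info->refcnt <= 0)
                        return -1;      /* -EINVAL in the kernel */
                info->refcnt++;
                return 0;
        }

        static void sim_put(struct sim_info *info)     /* unbind_from_irq() path */
        {
                if (info->refcnt > 0 && --info->refcnt != 0)
                        return;                         /* still referenced */
                info->bound = 0;
                printf("channel torn down\n");
        }

        int main(void)
        {
                struct sim_info info = { .refcnt = -1, .bound = 1 };

                sim_make_refcounted(&info);
                sim_get(&info);                /* refcnt 2 */
                sim_put(&info);                /* refcnt 1, still bound */
                printf("bound=%d refcnt=%d\n", info.bound, info.refcnt);
                sim_put(&info);                /* refcnt 0 -> teardown */
                return 0;
        }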
1180 1180
1181 void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) 1181 void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
1182 { 1182 {
1183 int irq = per_cpu(ipi_to_irq, cpu)[vector]; 1183 int irq = per_cpu(ipi_to_irq, cpu)[vector];
1184 BUG_ON(irq < 0); 1184 BUG_ON(irq < 0);
1185 notify_remote_via_irq(irq); 1185 notify_remote_via_irq(irq);
1186 } 1186 }
1187 1187
1188 irqreturn_t xen_debug_interrupt(int irq, void *dev_id) 1188 irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
1189 { 1189 {
1190 struct shared_info *sh = HYPERVISOR_shared_info; 1190 struct shared_info *sh = HYPERVISOR_shared_info;
1191 int cpu = smp_processor_id(); 1191 int cpu = smp_processor_id();
1192 unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); 1192 unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu);
1193 int i; 1193 int i;
1194 unsigned long flags; 1194 unsigned long flags;
1195 static DEFINE_SPINLOCK(debug_lock); 1195 static DEFINE_SPINLOCK(debug_lock);
1196 struct vcpu_info *v; 1196 struct vcpu_info *v;
1197 1197
1198 spin_lock_irqsave(&debug_lock, flags); 1198 spin_lock_irqsave(&debug_lock, flags);
1199 1199
1200 printk("\nvcpu %d\n ", cpu); 1200 printk("\nvcpu %d\n ", cpu);
1201 1201
1202 for_each_online_cpu(i) { 1202 for_each_online_cpu(i) {
1203 int pending; 1203 int pending;
1204 v = per_cpu(xen_vcpu, i); 1204 v = per_cpu(xen_vcpu, i);
1205 pending = (get_irq_regs() && i == cpu) 1205 pending = (get_irq_regs() && i == cpu)
1206 ? xen_irqs_disabled(get_irq_regs()) 1206 ? xen_irqs_disabled(get_irq_regs())
1207 : v->evtchn_upcall_mask; 1207 : v->evtchn_upcall_mask;
1208 printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i, 1208 printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i,
1209 pending, v->evtchn_upcall_pending, 1209 pending, v->evtchn_upcall_pending,
1210 (int)(sizeof(v->evtchn_pending_sel)*2), 1210 (int)(sizeof(v->evtchn_pending_sel)*2),
1211 v->evtchn_pending_sel); 1211 v->evtchn_pending_sel);
1212 } 1212 }
1213 v = per_cpu(xen_vcpu, cpu); 1213 v = per_cpu(xen_vcpu, cpu);
1214 1214
1215 printk("\npending:\n "); 1215 printk("\npending:\n ");
1216 for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) 1216 for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
1217 printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2, 1217 printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2,
1218 sh->evtchn_pending[i], 1218 sh->evtchn_pending[i],
1219 i % 8 == 0 ? "\n " : " "); 1219 i % 8 == 0 ? "\n " : " ");
1220 printk("\nglobal mask:\n "); 1220 printk("\nglobal mask:\n ");
1221 for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) 1221 for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
1222 printk("%0*lx%s", 1222 printk("%0*lx%s",
1223 (int)(sizeof(sh->evtchn_mask[0])*2), 1223 (int)(sizeof(sh->evtchn_mask[0])*2),
1224 sh->evtchn_mask[i], 1224 sh->evtchn_mask[i],
1225 i % 8 == 0 ? "\n " : " "); 1225 i % 8 == 0 ? "\n " : " ");
1226 1226
1227 printk("\nglobally unmasked:\n "); 1227 printk("\nglobally unmasked:\n ");
1228 for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) 1228 for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
1229 printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), 1229 printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
1230 sh->evtchn_pending[i] & ~sh->evtchn_mask[i], 1230 sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
1231 i % 8 == 0 ? "\n " : " "); 1231 i % 8 == 0 ? "\n " : " ");
1232 1232
1233 printk("\nlocal cpu%d mask:\n ", cpu); 1233 printk("\nlocal cpu%d mask:\n ", cpu);
1234 for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--) 1234 for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--)
1235 printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2), 1235 printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
1236 cpu_evtchn[i], 1236 cpu_evtchn[i],
1237 i % 8 == 0 ? "\n " : " "); 1237 i % 8 == 0 ? "\n " : " ");
1238 1238
1239 printk("\nlocally unmasked:\n "); 1239 printk("\nlocally unmasked:\n ");
1240 for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { 1240 for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) {
1241 unsigned long pending = sh->evtchn_pending[i] 1241 unsigned long pending = sh->evtchn_pending[i]
1242 & ~sh->evtchn_mask[i] 1242 & ~sh->evtchn_mask[i]
1243 & cpu_evtchn[i]; 1243 & cpu_evtchn[i];
1244 printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), 1244 printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
1245 pending, i % 8 == 0 ? "\n " : " "); 1245 pending, i % 8 == 0 ? "\n " : " ");
1246 } 1246 }
1247 1247
1248 printk("\npending list:\n"); 1248 printk("\npending list:\n");
1249 for (i = 0; i < NR_EVENT_CHANNELS; i++) { 1249 for (i = 0; i < NR_EVENT_CHANNELS; i++) {
1250 if (sync_test_bit(i, sh->evtchn_pending)) { 1250 if (sync_test_bit(i, sh->evtchn_pending)) {
1251 int word_idx = i / BITS_PER_LONG; 1251 int word_idx = i / BITS_PER_LONG;
1252 printk(" %d: event %d -> irq %d%s%s%s\n", 1252 printk(" %d: event %d -> irq %d%s%s%s\n",
1253 cpu_from_evtchn(i), i, 1253 cpu_from_evtchn(i), i,
1254 evtchn_to_irq[i], 1254 evtchn_to_irq[i],
1255 sync_test_bit(word_idx, &v->evtchn_pending_sel) 1255 sync_test_bit(word_idx, &v->evtchn_pending_sel)
1256 ? "" : " l2-clear", 1256 ? "" : " l2-clear",
1257 !sync_test_bit(i, sh->evtchn_mask) 1257 !sync_test_bit(i, sh->evtchn_mask)
1258 ? "" : " globally-masked", 1258 ? "" : " globally-masked",
1259 sync_test_bit(i, cpu_evtchn) 1259 sync_test_bit(i, cpu_evtchn)
1260 ? "" : " locally-masked"); 1260 ? "" : " locally-masked");
1261 } 1261 }
1262 } 1262 }
1263 1263
1264 spin_unlock_irqrestore(&debug_lock, flags); 1264 spin_unlock_irqrestore(&debug_lock, flags);
1265 1265
1266 return IRQ_HANDLED; 1266 return IRQ_HANDLED;
1267 } 1267 }
1268 1268
1269 static DEFINE_PER_CPU(unsigned, xed_nesting_count); 1269 static DEFINE_PER_CPU(unsigned, xed_nesting_count);
1270 static DEFINE_PER_CPU(unsigned int, current_word_idx); 1270 static DEFINE_PER_CPU(unsigned int, current_word_idx);
1271 static DEFINE_PER_CPU(unsigned int, current_bit_idx); 1271 static DEFINE_PER_CPU(unsigned int, current_bit_idx);
1272 1272
1273 /* 1273 /*
1274 * Mask out the i least significant bits of w 1274 * Mask out the i least significant bits of w
1275 */ 1275 */
1276 #define MASK_LSBS(w, i) (w & ((~0UL) << i)) 1276 #define MASK_LSBS(w, i) (w & ((~0UL) << i))
1277 1277
1278 /* 1278 /*
1279 * Search the CPUs pending events bitmasks. For each one found, map 1279 * Search the CPUs pending events bitmasks. For each one found, map
1280 * the event number to an irq, and feed it into do_IRQ() for 1280 * the event number to an irq, and feed it into do_IRQ() for
1281 * handling. 1281 * handling.
1282 * 1282 *
1283 * Xen uses a two-level bitmap to speed searching. The first level is 1283 * Xen uses a two-level bitmap to speed searching. The first level is
1284 * a bitset of words which contain pending event bits. The second 1284 * a bitset of words which contain pending event bits. The second
1285 * level is a bitset of pending events themselves. 1285 * level is a bitset of pending events themselves.
1286 */ 1286 */
1287 static void __xen_evtchn_do_upcall(void) 1287 static void __xen_evtchn_do_upcall(void)
1288 { 1288 {
1289 int start_word_idx, start_bit_idx; 1289 int start_word_idx, start_bit_idx;
1290 int word_idx, bit_idx; 1290 int word_idx, bit_idx;
1291 int i; 1291 int i;
1292 int cpu = get_cpu(); 1292 int cpu = get_cpu();
1293 struct shared_info *s = HYPERVISOR_shared_info; 1293 struct shared_info *s = HYPERVISOR_shared_info;
1294 struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); 1294 struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
1295 unsigned count; 1295 unsigned count;
1296 1296
1297 do { 1297 do {
1298 unsigned long pending_words; 1298 unsigned long pending_words;
1299 1299
1300 vcpu_info->evtchn_upcall_pending = 0; 1300 vcpu_info->evtchn_upcall_pending = 0;
1301 1301
1302 if (__this_cpu_inc_return(xed_nesting_count) - 1) 1302 if (__this_cpu_inc_return(xed_nesting_count) - 1)
1303 goto out; 1303 goto out;
1304 1304
1305 #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ 1305 #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
1306 /* Clear master flag /before/ clearing selector flag. */ 1306 /* Clear master flag /before/ clearing selector flag. */
1307 wmb(); 1307 wmb();
1308 #endif 1308 #endif
1309 pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); 1309 pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
1310 1310
1311 start_word_idx = __this_cpu_read(current_word_idx); 1311 start_word_idx = __this_cpu_read(current_word_idx);
1312 start_bit_idx = __this_cpu_read(current_bit_idx); 1312 start_bit_idx = __this_cpu_read(current_bit_idx);
1313 1313
1314 word_idx = start_word_idx; 1314 word_idx = start_word_idx;
1315 1315
1316 for (i = 0; pending_words != 0; i++) { 1316 for (i = 0; pending_words != 0; i++) {
1317 unsigned long pending_bits; 1317 unsigned long pending_bits;
1318 unsigned long words; 1318 unsigned long words;
1319 1319
1320 words = MASK_LSBS(pending_words, word_idx); 1320 words = MASK_LSBS(pending_words, word_idx);
1321 1321
1322 /* 1322 /*
1323 * If we masked out all events, wrap to beginning. 1323 * If we masked out all events, wrap to beginning.
1324 */ 1324 */
1325 if (words == 0) { 1325 if (words == 0) {
1326 word_idx = 0; 1326 word_idx = 0;
1327 bit_idx = 0; 1327 bit_idx = 0;
1328 continue; 1328 continue;
1329 } 1329 }
1330 word_idx = __ffs(words); 1330 word_idx = __ffs(words);
1331 1331
1332 pending_bits = active_evtchns(cpu, s, word_idx); 1332 pending_bits = active_evtchns(cpu, s, word_idx);
1333 bit_idx = 0; /* usually scan entire word from start */ 1333 bit_idx = 0; /* usually scan entire word from start */
1334 if (word_idx == start_word_idx) { 1334 if (word_idx == start_word_idx) {
1335 /* We scan the starting word in two parts */ 1335 /* We scan the starting word in two parts */
1336 if (i == 0) 1336 if (i == 0)
1337 /* 1st time: start in the middle */ 1337 /* 1st time: start in the middle */
1338 bit_idx = start_bit_idx; 1338 bit_idx = start_bit_idx;
1339 else 1339 else
1340 /* 2nd time: mask bits done already */ 1340 /* 2nd time: mask bits done already */
1341 bit_idx &= (1UL << start_bit_idx) - 1; 1341 bit_idx &= (1UL << start_bit_idx) - 1;
1342 } 1342 }
1343 1343
1344 do { 1344 do {
1345 unsigned long bits; 1345 unsigned long bits;
1346 int port, irq; 1346 int port, irq;
1347 struct irq_desc *desc; 1347 struct irq_desc *desc;
1348 1348
1349 bits = MASK_LSBS(pending_bits, bit_idx); 1349 bits = MASK_LSBS(pending_bits, bit_idx);
1350 1350
1351 /* If we masked out all events, move on. */ 1351 /* If we masked out all events, move on. */
1352 if (bits == 0) 1352 if (bits == 0)
1353 break; 1353 break;
1354 1354
1355 bit_idx = __ffs(bits); 1355 bit_idx = __ffs(bits);
1356 1356
1357 /* Process port. */ 1357 /* Process port. */
1358 port = (word_idx * BITS_PER_LONG) + bit_idx; 1358 port = (word_idx * BITS_PER_LONG) + bit_idx;
1359 irq = evtchn_to_irq[port]; 1359 irq = evtchn_to_irq[port];
1360 1360
1361 if (irq != -1) { 1361 if (irq != -1) {
1362 desc = irq_to_desc(irq); 1362 desc = irq_to_desc(irq);
1363 if (desc) 1363 if (desc)
1364 generic_handle_irq_desc(irq, desc); 1364 generic_handle_irq_desc(irq, desc);
1365 } 1365 }
1366 1366
1367 bit_idx = (bit_idx + 1) % BITS_PER_LONG; 1367 bit_idx = (bit_idx + 1) % BITS_PER_LONG;
1368 1368
1369 /* Next caller starts at last processed + 1 */ 1369 /* Next caller starts at last processed + 1 */
1370 __this_cpu_write(current_word_idx, 1370 __this_cpu_write(current_word_idx,
1371 bit_idx ? word_idx : 1371 bit_idx ? word_idx :
1372 (word_idx+1) % BITS_PER_LONG); 1372 (word_idx+1) % BITS_PER_LONG);
1373 __this_cpu_write(current_bit_idx, bit_idx); 1373 __this_cpu_write(current_bit_idx, bit_idx);
1374 } while (bit_idx != 0); 1374 } while (bit_idx != 0);
1375 1375
1376 /* Scan start_l1i twice; all others once. */ 1376 /* Scan start_l1i twice; all others once. */
1377 if ((word_idx != start_word_idx) || (i != 0)) 1377 if ((word_idx != start_word_idx) || (i != 0))
1378 pending_words &= ~(1UL << word_idx); 1378 pending_words &= ~(1UL << word_idx);
1379 1379
1380 word_idx = (word_idx + 1) % BITS_PER_LONG; 1380 word_idx = (word_idx + 1) % BITS_PER_LONG;
1381 } 1381 }
1382 1382
1383 BUG_ON(!irqs_disabled()); 1383 BUG_ON(!irqs_disabled());
1384 1384
1385 count = __this_cpu_read(xed_nesting_count); 1385 count = __this_cpu_read(xed_nesting_count);
1386 __this_cpu_write(xed_nesting_count, 0); 1386 __this_cpu_write(xed_nesting_count, 0);
1387 } while (count != 1 || vcpu_info->evtchn_upcall_pending); 1387 } while (count != 1 || vcpu_info->evtchn_upcall_pending);
1388 1388
1389 out: 1389 out:
1390 1390
1391 put_cpu(); 1391 put_cpu();
1392 } 1392 }
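
Annotation: the core of __xen_evtchn_do_upcall() is the two-level search described in the comment before it: the level-1 selector word marks which level-2 words contain pending ports, and each set bit in a level-2 word is a pending event channel. The sketch below reproduces just that scan in userspace; it omits the per-cpu fairness state (current_word_idx/current_bit_idx) and the MASK_LSBS-based resume logic, __builtin_ctzl stands in for __ffs(), and all names are local to the example:

        #include <stdio.h>
        #include <limits.h>

        #define BPL (sizeof(unsigned long) * CHAR_BIT)

        static void scan_pending(unsigned long sel, const unsigned long *pending)
        {
                /* Level 1: each set bit in 'sel' marks a word with pending ports. */
                while (sel != 0) {
                        int word_idx = __builtin_ctzl(sel);
                        unsigned long bits = pending[word_idx];

                        /* Level 2: walk the pending ports inside that word. */
                        while (bits != 0) {
                                int bit_idx = __builtin_ctzl(bits);
                                int port = word_idx * (int)BPL + bit_idx;

                                printf("deliver port %d\n", port);
                                bits &= bits - 1;     /* clear lowest set bit */
                        }
                        sel &= sel - 1;               /* done with this word  */
                }
        }

        int main(void)
        {
                unsigned long pending[4] = { 0 };
                unsigned long sel = 0;

                pending[0] |= 1UL << 3;                 /* port 3                    */
                pending[2] |= (1UL << 0) | (1UL << 7);  /* ports 2*BPL and 2*BPL + 7 */
                sel |= (1UL << 0) | (1UL << 2);

                scan_pending(sel, pending);
                return 0;
        }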
1393 1393
1394 void xen_evtchn_do_upcall(struct pt_regs *regs) 1394 void xen_evtchn_do_upcall(struct pt_regs *regs)
1395 { 1395 {
1396 struct pt_regs *old_regs = set_irq_regs(regs); 1396 struct pt_regs *old_regs = set_irq_regs(regs);
1397 1397
1398 irq_enter(); 1398 irq_enter();
1399 #ifdef CONFIG_X86 1399 #ifdef CONFIG_X86
1400 exit_idle(); 1400 exit_idle();
1401 #endif 1401 #endif
1402 1402
1403 __xen_evtchn_do_upcall(); 1403 __xen_evtchn_do_upcall();
1404 1404
1405 irq_exit(); 1405 irq_exit();
1406 set_irq_regs(old_regs); 1406 set_irq_regs(old_regs);
1407 } 1407 }
1408 1408
1409 void xen_hvm_evtchn_do_upcall(void) 1409 void xen_hvm_evtchn_do_upcall(void)
1410 { 1410 {
1411 __xen_evtchn_do_upcall(); 1411 __xen_evtchn_do_upcall();
1412 } 1412 }
1413 EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall); 1413 EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
1414 1414
1415 /* Rebind a new event channel to an existing irq. */ 1415 /* Rebind a new event channel to an existing irq. */
1416 void rebind_evtchn_irq(int evtchn, int irq) 1416 void rebind_evtchn_irq(int evtchn, int irq)
1417 { 1417 {
1418 struct irq_info *info = info_for_irq(irq); 1418 struct irq_info *info = info_for_irq(irq);
1419 1419
1420 /* Make sure the irq is masked, since the new event channel 1420 /* Make sure the irq is masked, since the new event channel
1421 will also be masked. */ 1421 will also be masked. */
1422 disable_irq(irq); 1422 disable_irq(irq);
1423 1423
1424 mutex_lock(&irq_mapping_update_lock); 1424 mutex_lock(&irq_mapping_update_lock);
1425 1425
1426 /* After resume the irq<->evtchn mappings are all cleared out */ 1426 /* After resume the irq<->evtchn mappings are all cleared out */
1427 BUG_ON(evtchn_to_irq[evtchn] != -1); 1427 BUG_ON(evtchn_to_irq[evtchn] != -1);
1428 /* Expect irq to have been bound before, 1428 /* Expect irq to have been bound before,
1429 so there should be a proper type */ 1429 so there should be a proper type */
1430 BUG_ON(info->type == IRQT_UNBOUND); 1430 BUG_ON(info->type == IRQT_UNBOUND);
1431 1431
1432 xen_irq_info_evtchn_init(irq, evtchn); 1432 xen_irq_info_evtchn_init(irq, evtchn);
1433 1433
1434 mutex_unlock(&irq_mapping_update_lock); 1434 mutex_unlock(&irq_mapping_update_lock);
1435 1435
1436 /* new event channels are always bound to cpu 0 */ 1436 /* new event channels are always bound to cpu 0 */
1437 irq_set_affinity(irq, cpumask_of(0)); 1437 irq_set_affinity(irq, cpumask_of(0));
1438 1438
1439 /* Unmask the event channel. */ 1439 /* Unmask the event channel. */
1440 enable_irq(irq); 1440 enable_irq(irq);
1441 } 1441 }
1442 1442
1443 /* Rebind an evtchn so that it gets delivered to a specific cpu */ 1443 /* Rebind an evtchn so that it gets delivered to a specific cpu */
1444 static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) 1444 static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
1445 { 1445 {
1446 struct evtchn_bind_vcpu bind_vcpu; 1446 struct evtchn_bind_vcpu bind_vcpu;
1447 int evtchn = evtchn_from_irq(irq); 1447 int evtchn = evtchn_from_irq(irq);
1448 1448
1449 if (!VALID_EVTCHN(evtchn)) 1449 if (!VALID_EVTCHN(evtchn))
1450 return -1; 1450 return -1;
1451 1451
1452 /* 1452 /*
1453 * Events delivered via platform PCI interrupts are always 1453 * Events delivered via platform PCI interrupts are always
1454 * routed to vcpu 0 and hence cannot be rebound. 1454 * routed to vcpu 0 and hence cannot be rebound.
1455 */ 1455 */
1456 if (xen_hvm_domain() && !xen_have_vector_callback) 1456 if (xen_hvm_domain() && !xen_have_vector_callback)
1457 return -1; 1457 return -1;
1458 1458
1459 /* Send future instances of this interrupt to other vcpu. */ 1459 /* Send future instances of this interrupt to other vcpu. */
1460 bind_vcpu.port = evtchn; 1460 bind_vcpu.port = evtchn;
1461 bind_vcpu.vcpu = tcpu; 1461 bind_vcpu.vcpu = tcpu;
1462 1462
1463 /* 1463 /*
1464 * If this fails, it usually just indicates that we're dealing with a 1464 * If this fails, it usually just indicates that we're dealing with a
1465 * virq or IPI channel, which don't actually need to be rebound. Ignore 1465 * virq or IPI channel, which don't actually need to be rebound. Ignore
1466 * it, but don't do the xenlinux-level rebind in that case. 1466 * it, but don't do the xenlinux-level rebind in that case.
1467 */ 1467 */
1468 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) 1468 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
1469 bind_evtchn_to_cpu(evtchn, tcpu); 1469 bind_evtchn_to_cpu(evtchn, tcpu);
1470 1470
1471 return 0; 1471 return 0;
1472 } 1472 }
1473 1473
1474 static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, 1474 static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
1475 bool force) 1475 bool force)
1476 { 1476 {
1477 unsigned tcpu = cpumask_first(dest); 1477 unsigned tcpu = cpumask_first(dest);
1478 1478
1479 return rebind_irq_to_cpu(data->irq, tcpu); 1479 return rebind_irq_to_cpu(data->irq, tcpu);
1480 } 1480 }
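
Annotation: set_affinity_irq() collapses the requested affinity mask to a single target VCPU via cpumask_first(), since an event channel can only be bound to one VCPU at a time. A tiny sketch with the mask modelled as one unsigned long, which is enough CPUs for the example:

        #include <stdio.h>

        static int first_cpu(unsigned long mask)   /* cpumask_first() stand-in */
        {
                return mask ? __builtin_ctzl(mask) : -1;
        }

        int main(void)
        {
                unsigned long dest = (1UL << 2) | (1UL << 5);   /* CPUs 2 and 5 */

                printf("event channel rebinds to vcpu %d\n", first_cpu(dest));
                return 0;
        }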
1481 1481
1482 int resend_irq_on_evtchn(unsigned int irq) 1482 int resend_irq_on_evtchn(unsigned int irq)
1483 { 1483 {
1484 int masked, evtchn = evtchn_from_irq(irq); 1484 int masked, evtchn = evtchn_from_irq(irq);
1485 struct shared_info *s = HYPERVISOR_shared_info; 1485 struct shared_info *s = HYPERVISOR_shared_info;
1486 1486
1487 if (!VALID_EVTCHN(evtchn)) 1487 if (!VALID_EVTCHN(evtchn))
1488 return 1; 1488 return 1;
1489 1489
1490 masked = sync_test_and_set_bit(evtchn, s->evtchn_mask); 1490 masked = sync_test_and_set_bit(evtchn, s->evtchn_mask);
1491 sync_set_bit(evtchn, s->evtchn_pending); 1491 sync_set_bit(evtchn, s->evtchn_pending);
1492 if (!masked) 1492 if (!masked)
1493 unmask_evtchn(evtchn); 1493 unmask_evtchn(evtchn);
1494 1494
1495 return 1; 1495 return 1;
1496 } 1496 }
1497 1497
1498 static void enable_dynirq(struct irq_data *data) 1498 static void enable_dynirq(struct irq_data *data)
1499 { 1499 {
1500 int evtchn = evtchn_from_irq(data->irq); 1500 int evtchn = evtchn_from_irq(data->irq);
1501 1501
1502 if (VALID_EVTCHN(evtchn)) 1502 if (VALID_EVTCHN(evtchn))
1503 unmask_evtchn(evtchn); 1503 unmask_evtchn(evtchn);
1504 } 1504 }
1505 1505
1506 static void disable_dynirq(struct irq_data *data) 1506 static void disable_dynirq(struct irq_data *data)
1507 { 1507 {
1508 int evtchn = evtchn_from_irq(data->irq); 1508 int evtchn = evtchn_from_irq(data->irq);
1509 1509
1510 if (VALID_EVTCHN(evtchn)) 1510 if (VALID_EVTCHN(evtchn))
1511 mask_evtchn(evtchn); 1511 mask_evtchn(evtchn);
1512 } 1512 }
1513 1513
1514 static void ack_dynirq(struct irq_data *data) 1514 static void ack_dynirq(struct irq_data *data)
1515 { 1515 {
1516 int evtchn = evtchn_from_irq(data->irq); 1516 int evtchn = evtchn_from_irq(data->irq);
1517 1517
1518 irq_move_irq(data); 1518 irq_move_irq(data);
1519 1519
1520 if (VALID_EVTCHN(evtchn)) 1520 if (VALID_EVTCHN(evtchn))
1521 clear_evtchn(evtchn); 1521 clear_evtchn(evtchn);
1522 } 1522 }
1523 1523
1524 static void mask_ack_dynirq(struct irq_data *data) 1524 static void mask_ack_dynirq(struct irq_data *data)
1525 { 1525 {
1526 disable_dynirq(data); 1526 disable_dynirq(data);
1527 ack_dynirq(data); 1527 ack_dynirq(data);
1528 } 1528 }
1529 1529
1530 static int retrigger_dynirq(struct irq_data *data) 1530 static int retrigger_dynirq(struct irq_data *data)
1531 { 1531 {
1532 int evtchn = evtchn_from_irq(data->irq); 1532 int evtchn = evtchn_from_irq(data->irq);
1533 struct shared_info *sh = HYPERVISOR_shared_info; 1533 struct shared_info *sh = HYPERVISOR_shared_info;
1534 int ret = 0; 1534 int ret = 0;
1535 1535
1536 if (VALID_EVTCHN(evtchn)) { 1536 if (VALID_EVTCHN(evtchn)) {
1537 int masked; 1537 int masked;
1538 1538
1539 masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask); 1539 masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask);
1540 sync_set_bit(evtchn, sh->evtchn_pending); 1540 sync_set_bit(evtchn, sh->evtchn_pending);
1541 if (!masked) 1541 if (!masked)
1542 unmask_evtchn(evtchn); 1542 unmask_evtchn(evtchn);
1543 ret = 1; 1543 ret = 1;
1544 } 1544 }
1545 1545
1546 return ret; 1546 return ret;
1547 } 1547 }
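
Annotation: resend_irq_on_evtchn() and retrigger_dynirq() share a software-retrigger idiom: temporarily mask the channel while remembering whether it was already masked, mark it pending, and unmask only if the mask bit was ours, so a deliberately masked channel stays masked. A single-threaded sketch of that pattern; plain fields replace the sync_* atomics and the helpers are local to the example:

        #include <stdio.h>

        struct sim_chan {
                int masked;
                int pending;
        };

        static int test_and_set_masked(struct sim_chan *c)
        {
                int old = c->masked;

                c->masked = 1;
                return old;
        }

        static void sim_retrigger(struct sim_chan *c)
        {
                int was_masked = test_and_set_masked(c);

                c->pending = 1;
                if (!was_masked)
                        c->masked = 0;          /* unmask_evtchn() in the kernel */
        }

        int main(void)
        {
                struct sim_chan c = { .masked = 0, .pending = 0 };

                sim_retrigger(&c);
                printf("pending=%d masked=%d\n", c.pending, c.masked);
                return 0;
        }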
1548 1548
1549 static void restore_pirqs(void) 1549 static void restore_pirqs(void)
1550 { 1550 {
1551 int pirq, rc, irq, gsi; 1551 int pirq, rc, irq, gsi;
1552 struct physdev_map_pirq map_irq; 1552 struct physdev_map_pirq map_irq;
1553 struct irq_info *info; 1553 struct irq_info *info;
1554 1554
1555 list_for_each_entry(info, &xen_irq_list_head, list) { 1555 list_for_each_entry(info, &xen_irq_list_head, list) {
1556 if (info->type != IRQT_PIRQ) 1556 if (info->type != IRQT_PIRQ)
1557 continue; 1557 continue;
1558 1558
1559 pirq = info->u.pirq.pirq; 1559 pirq = info->u.pirq.pirq;
1560 gsi = info->u.pirq.gsi; 1560 gsi = info->u.pirq.gsi;
1561 irq = info->irq; 1561 irq = info->irq;
1562 1562
1563 /* save/restore of PT devices doesn't work, so at this point the 1563 /* save/restore of PT devices doesn't work, so at this point the
1564 * only devices present are GSI based emulated devices */ 1564 * only devices present are GSI based emulated devices */
1565 if (!gsi) 1565 if (!gsi)
1566 continue; 1566 continue;
1567 1567
1568 map_irq.domid = DOMID_SELF; 1568 map_irq.domid = DOMID_SELF;
1569 map_irq.type = MAP_PIRQ_TYPE_GSI; 1569 map_irq.type = MAP_PIRQ_TYPE_GSI;
1570 map_irq.index = gsi; 1570 map_irq.index = gsi;
1571 map_irq.pirq = pirq; 1571 map_irq.pirq = pirq;
1572 1572
1573 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); 1573 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
1574 if (rc) { 1574 if (rc) {
1575 printk(KERN_WARNING "xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", 1575 printk(KERN_WARNING "xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n",
1576 gsi, irq, pirq, rc); 1576 gsi, irq, pirq, rc);
1577 xen_free_irq(irq); 1577 xen_free_irq(irq);
1578 continue; 1578 continue;
1579 } 1579 }
1580 1580
1581 printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); 1581 printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
1582 1582
1583 __startup_pirq(irq); 1583 __startup_pirq(irq);
1584 } 1584 }
1585 } 1585 }
1586 1586
1587 static void restore_cpu_virqs(unsigned int cpu) 1587 static void restore_cpu_virqs(unsigned int cpu)
1588 { 1588 {
1589 struct evtchn_bind_virq bind_virq; 1589 struct evtchn_bind_virq bind_virq;
1590 int virq, irq, evtchn; 1590 int virq, irq, evtchn;
1591 1591
1592 for (virq = 0; virq < NR_VIRQS; virq++) { 1592 for (virq = 0; virq < NR_VIRQS; virq++) {
1593 if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) 1593 if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1)
1594 continue; 1594 continue;
1595 1595
1596 BUG_ON(virq_from_irq(irq) != virq); 1596 BUG_ON(virq_from_irq(irq) != virq);
1597 1597
1598 /* Get a new binding from Xen. */ 1598 /* Get a new binding from Xen. */
1599 bind_virq.virq = virq; 1599 bind_virq.virq = virq;
1600 bind_virq.vcpu = cpu; 1600 bind_virq.vcpu = cpu;
1601 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, 1601 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
1602 &bind_virq) != 0) 1602 &bind_virq) != 0)
1603 BUG(); 1603 BUG();
1604 evtchn = bind_virq.port; 1604 evtchn = bind_virq.port;
1605 1605
1606 /* Record the new mapping. */ 1606 /* Record the new mapping. */
1607 xen_irq_info_virq_init(cpu, irq, evtchn, virq); 1607 xen_irq_info_virq_init(cpu, irq, evtchn, virq);
1608 bind_evtchn_to_cpu(evtchn, cpu); 1608 bind_evtchn_to_cpu(evtchn, cpu);
1609 } 1609 }
1610 } 1610 }
1611 1611
1612 static void restore_cpu_ipis(unsigned int cpu) 1612 static void restore_cpu_ipis(unsigned int cpu)
1613 { 1613 {
1614 struct evtchn_bind_ipi bind_ipi; 1614 struct evtchn_bind_ipi bind_ipi;
1615 int ipi, irq, evtchn; 1615 int ipi, irq, evtchn;
1616 1616
1617 for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) { 1617 for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) {
1618 if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) 1618 if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
1619 continue; 1619 continue;
1620 1620
1621 BUG_ON(ipi_from_irq(irq) != ipi); 1621 BUG_ON(ipi_from_irq(irq) != ipi);
1622 1622
1623 /* Get a new binding from Xen. */ 1623 /* Get a new binding from Xen. */
1624 bind_ipi.vcpu = cpu; 1624 bind_ipi.vcpu = cpu;
1625 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, 1625 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
1626 &bind_ipi) != 0) 1626 &bind_ipi) != 0)
1627 BUG(); 1627 BUG();
1628 evtchn = bind_ipi.port; 1628 evtchn = bind_ipi.port;
1629 1629
1630 /* Record the new mapping. */ 1630 /* Record the new mapping. */
1631 xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); 1631 xen_irq_info_ipi_init(cpu, irq, evtchn, ipi);
1632 bind_evtchn_to_cpu(evtchn, cpu); 1632 bind_evtchn_to_cpu(evtchn, cpu);
1633 } 1633 }
1634 } 1634 }
1635 1635
1636 /* Clear an irq's pending state, in preparation for polling on it */ 1636 /* Clear an irq's pending state, in preparation for polling on it */
1637 void xen_clear_irq_pending(int irq) 1637 void xen_clear_irq_pending(int irq)
1638 { 1638 {
1639 int evtchn = evtchn_from_irq(irq); 1639 int evtchn = evtchn_from_irq(irq);
1640 1640
1641 if (VALID_EVTCHN(evtchn)) 1641 if (VALID_EVTCHN(evtchn))
1642 clear_evtchn(evtchn); 1642 clear_evtchn(evtchn);
1643 } 1643 }
1644 EXPORT_SYMBOL(xen_clear_irq_pending); 1644 EXPORT_SYMBOL(xen_clear_irq_pending);
1645 void xen_set_irq_pending(int irq) 1645 void xen_set_irq_pending(int irq)
1646 { 1646 {
1647 int evtchn = evtchn_from_irq(irq); 1647 int evtchn = evtchn_from_irq(irq);
1648 1648
1649 if (VALID_EVTCHN(evtchn)) 1649 if (VALID_EVTCHN(evtchn))
1650 set_evtchn(evtchn); 1650 set_evtchn(evtchn);
1651 } 1651 }
1652 1652
1653 bool xen_test_irq_pending(int irq) 1653 bool xen_test_irq_pending(int irq)
1654 { 1654 {
1655 int evtchn = evtchn_from_irq(irq); 1655 int evtchn = evtchn_from_irq(irq);
1656 bool ret = false; 1656 bool ret = false;
1657 1657
1658 if (VALID_EVTCHN(evtchn)) 1658 if (VALID_EVTCHN(evtchn))
1659 ret = test_evtchn(evtchn); 1659 ret = test_evtchn(evtchn);
1660 1660
1661 return ret; 1661 return ret;
1662 } 1662 }
1663 1663
1664 /* Poll waiting for an irq to become pending with timeout. In the usual case, 1664 /* Poll waiting for an irq to become pending with timeout. In the usual case,
1665 * the irq will be disabled so it won't deliver an interrupt. */ 1665 * the irq will be disabled so it won't deliver an interrupt. */
1666 void xen_poll_irq_timeout(int irq, u64 timeout) 1666 void xen_poll_irq_timeout(int irq, u64 timeout)
1667 { 1667 {
1668 evtchn_port_t evtchn = evtchn_from_irq(irq); 1668 evtchn_port_t evtchn = evtchn_from_irq(irq);
1669 1669
1670 if (VALID_EVTCHN(evtchn)) { 1670 if (VALID_EVTCHN(evtchn)) {
1671 struct sched_poll poll; 1671 struct sched_poll poll;
1672 1672
1673 poll.nr_ports = 1; 1673 poll.nr_ports = 1;
1674 poll.timeout = timeout; 1674 poll.timeout = timeout;
1675 set_xen_guest_handle(poll.ports, &evtchn); 1675 set_xen_guest_handle(poll.ports, &evtchn);
1676 1676
1677 if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0) 1677 if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
1678 BUG(); 1678 BUG();
1679 } 1679 }
1680 } 1680 }
1681 EXPORT_SYMBOL(xen_poll_irq_timeout); 1681 EXPORT_SYMBOL(xen_poll_irq_timeout);
1682 /* Poll waiting for an irq to become pending. In the usual case, the 1682 /* Poll waiting for an irq to become pending. In the usual case, the
1683 * irq will be disabled so it won't deliver an interrupt. */ 1683 * irq will be disabled so it won't deliver an interrupt. */
1684 void xen_poll_irq(int irq) 1684 void xen_poll_irq(int irq)
1685 { 1685 {
1686 xen_poll_irq_timeout(irq, 0 /* no timeout */); 1686 xen_poll_irq_timeout(irq, 0 /* no timeout */);
1687 } 1687 }
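[Editor's note] xen_clear_irq_pending(), xen_poll_irq()/xen_poll_irq_timeout() and xen_test_irq_pending() are intended to be used together as a poll-based wait primitive on an IRQ that the caller keeps disabled, as the comments above say. A hedged usage sketch, loosely modelled on a pv-spinlock-style slow path; wait_on_evtchn_irq() and wait_condition() are hypothetical names, and "irq" is assumed to be a per-cpu event-channel IRQ owned by the caller.

/* Editor's sketch: poll-wait on a disabled event-channel IRQ.
 * wait_condition() is a hypothetical predicate supplied by the caller. */
static void wait_on_evtchn_irq(int irq)
{
	for (;;) {
		/* Clear any stale pending state first so a notification
		 * arriving after the check below cannot be lost. */
		xen_clear_irq_pending(irq);
		barrier();

		if (wait_condition())
			return;

		/* Block in the hypervisor until the channel is kicked;
		 * the IRQ stays disabled, so no handler runs. */
		xen_poll_irq(irq);
	}
}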
1688 1688
1689 /* Check whether the IRQ line is shared with other guests. */ 1689 /* Check whether the IRQ line is shared with other guests. */
1690 int xen_test_irq_shared(int irq) 1690 int xen_test_irq_shared(int irq)
1691 { 1691 {
1692 struct irq_info *info = info_for_irq(irq); 1692 struct irq_info *info = info_for_irq(irq);
1693 struct physdev_irq_status_query irq_status = { .irq = info->u.pirq.pirq }; 1693 struct physdev_irq_status_query irq_status = { .irq = info->u.pirq.pirq };
1694 1694
1695 if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) 1695 if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
1696 return 0; 1696 return 0;
1697 return !(irq_status.flags & XENIRQSTAT_shared); 1697 return !(irq_status.flags & XENIRQSTAT_shared);
1698 } 1698 }
1699 EXPORT_SYMBOL_GPL(xen_test_irq_shared); 1699 EXPORT_SYMBOL_GPL(xen_test_irq_shared);
1700 1700
1701 void xen_irq_resume(void) 1701 void xen_irq_resume(void)
1702 { 1702 {
1703 unsigned int cpu, evtchn; 1703 unsigned int cpu, evtchn;
1704 struct irq_info *info; 1704 struct irq_info *info;
1705 1705
1706 init_evtchn_cpu_bindings(); 1706 init_evtchn_cpu_bindings();
1707 1707
1708 /* New event-channel space is not 'live' yet. */ 1708 /* New event-channel space is not 'live' yet. */
1709 for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) 1709 for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
1710 mask_evtchn(evtchn); 1710 mask_evtchn(evtchn);
1711 1711
1712 /* No IRQ <-> event-channel mappings. */ 1712 /* No IRQ <-> event-channel mappings. */
1713 list_for_each_entry(info, &xen_irq_list_head, list) 1713 list_for_each_entry(info, &xen_irq_list_head, list)
1714 info->evtchn = 0; /* zap event-channel binding */ 1714 info->evtchn = 0; /* zap event-channel binding */
1715 1715
1716 for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) 1716 for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
1717 evtchn_to_irq[evtchn] = -1; 1717 evtchn_to_irq[evtchn] = -1;
1718 1718
1719 for_each_possible_cpu(cpu) { 1719 for_each_possible_cpu(cpu) {
1720 restore_cpu_virqs(cpu); 1720 restore_cpu_virqs(cpu);
1721 restore_cpu_ipis(cpu); 1721 restore_cpu_ipis(cpu);
1722 } 1722 }
1723 1723
1724 restore_pirqs(); 1724 restore_pirqs();
1725 } 1725 }
1726 1726
1727 static struct irq_chip xen_dynamic_chip __read_mostly = { 1727 static struct irq_chip xen_dynamic_chip __read_mostly = {
1728 .name = "xen-dyn", 1728 .name = "xen-dyn",
1729 1729
1730 .irq_disable = disable_dynirq, 1730 .irq_disable = disable_dynirq,
1731 .irq_mask = disable_dynirq, 1731 .irq_mask = disable_dynirq,
1732 .irq_unmask = enable_dynirq, 1732 .irq_unmask = enable_dynirq,
1733 1733
1734 .irq_ack = ack_dynirq, 1734 .irq_ack = ack_dynirq,
1735 .irq_mask_ack = mask_ack_dynirq, 1735 .irq_mask_ack = mask_ack_dynirq,
1736 1736
1737 .irq_set_affinity = set_affinity_irq, 1737 .irq_set_affinity = set_affinity_irq,
1738 .irq_retrigger = retrigger_dynirq, 1738 .irq_retrigger = retrigger_dynirq,
1739 }; 1739 };
1740 1740
1741 static struct irq_chip xen_pirq_chip __read_mostly = { 1741 static struct irq_chip xen_pirq_chip __read_mostly = {
1742 .name = "xen-pirq", 1742 .name = "xen-pirq",
1743 1743
1744 .irq_startup = startup_pirq, 1744 .irq_startup = startup_pirq,
1745 .irq_shutdown = shutdown_pirq, 1745 .irq_shutdown = shutdown_pirq,
1746 .irq_enable = enable_pirq, 1746 .irq_enable = enable_pirq,
1747 .irq_disable = disable_pirq, 1747 .irq_disable = disable_pirq,
1748 1748
1749 .irq_mask = disable_dynirq, 1749 .irq_mask = disable_dynirq,
1750 .irq_unmask = enable_dynirq, 1750 .irq_unmask = enable_dynirq,
1751 1751
1752 .irq_ack = eoi_pirq, 1752 .irq_ack = eoi_pirq,
1753 .irq_eoi = eoi_pirq, 1753 .irq_eoi = eoi_pirq,
1754 .irq_mask_ack = mask_ack_pirq, 1754 .irq_mask_ack = mask_ack_pirq,
1755 1755
1756 .irq_set_affinity = set_affinity_irq, 1756 .irq_set_affinity = set_affinity_irq,
1757 1757
1758 .irq_retrigger = retrigger_dynirq, 1758 .irq_retrigger = retrigger_dynirq,
1759 }; 1759 };
1760 1760
1761 static struct irq_chip xen_percpu_chip __read_mostly = { 1761 static struct irq_chip xen_percpu_chip __read_mostly = {
1762 .name = "xen-percpu", 1762 .name = "xen-percpu",
1763 1763
1764 .irq_disable = disable_dynirq, 1764 .irq_disable = disable_dynirq,
1765 .irq_mask = disable_dynirq, 1765 .irq_mask = disable_dynirq,
1766 .irq_unmask = enable_dynirq, 1766 .irq_unmask = enable_dynirq,
1767 1767
1768 .irq_ack = ack_dynirq, 1768 .irq_ack = ack_dynirq,
1769 }; 1769 };
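[Editor's note] These three irq_chip instances are what the binding paths elsewhere in this file attach to freshly allocated IRQs. Roughly, a dynamic event-channel IRQ gets xen_dynamic_chip with an edge-type flow handler; the sketch below shows that attach step using the standard genirq helper. example_attach_dynirq() is a hypothetical wrapper for illustration only, and the exact flow handler and name differ per IRQ type.

/* Editor's sketch: wiring a dynamic event-channel IRQ to xen_dynamic_chip.
 * "irq" is assumed to have been allocated by the xen_irq_* helpers. */
static void example_attach_dynirq(int irq)
{
	/* Edge-style handling: the chip's irq_ack clears the evtchn pending
	 * bit, while mask/unmask map onto the evtchn mask bit. */
	irq_set_chip_and_handler_name(irq, &xen_dynamic_chip,
				      handle_edge_irq, "event");
}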
1770 1770
1771 int xen_set_callback_via(uint64_t via) 1771 int xen_set_callback_via(uint64_t via)
1772 { 1772 {
1773 struct xen_hvm_param a; 1773 struct xen_hvm_param a;
1774 a.domid = DOMID_SELF; 1774 a.domid = DOMID_SELF;
1775 a.index = HVM_PARAM_CALLBACK_IRQ; 1775 a.index = HVM_PARAM_CALLBACK_IRQ;
1776 a.value = via; 1776 a.value = via;
1777 return HYPERVISOR_hvm_op(HVMOP_set_param, &a); 1777 return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
1778 } 1778 }
1779 EXPORT_SYMBOL_GPL(xen_set_callback_via); 1779 EXPORT_SYMBOL_GPL(xen_set_callback_via);
1780 1780
1781 #ifdef CONFIG_XEN_PVHVM 1781 #ifdef CONFIG_XEN_PVHVM
1782 /* Vector callbacks are better than PCI interrupts to receive event 1782 /* Vector callbacks are better than PCI interrupts to receive event
1783 * channel notifications because we can receive vector callbacks on any 1783 * channel notifications because we can receive vector callbacks on any
1784 * vcpu and we don't need PCI support or APIC interactions. */ 1784 * vcpu and we don't need PCI support or APIC interactions. */
1785 void xen_callback_vector(void) 1785 void xen_callback_vector(void)
1786 { 1786 {
1787 int rc; 1787 int rc;
1788 uint64_t callback_via; 1788 uint64_t callback_via;
1789 if (xen_have_vector_callback) { 1789 if (xen_have_vector_callback) {
1790 callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK); 1790 callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR);
1791 rc = xen_set_callback_via(callback_via); 1791 rc = xen_set_callback_via(callback_via);
1792 if (rc) { 1792 if (rc) {
1793 printk(KERN_ERR "Request for Xen HVM callback vector" 1793 printk(KERN_ERR "Request for Xen HVM callback vector"
1794 " failed.\n"); 1794 " failed.\n");
1795 xen_have_vector_callback = 0; 1795 xen_have_vector_callback = 0;
1796 return; 1796 return;
1797 } 1797 }
1798 printk(KERN_INFO "Xen HVM callback vector for event delivery is " 1798 printk(KERN_INFO "Xen HVM callback vector for event delivery is "
1799 "enabled\n"); 1799 "enabled\n");
1800 /* in the restore case the vector has already been allocated */ 1800 /* in the restore case the vector has already been allocated */
1801 if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors)) 1801 if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
1802 alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector); 1802 alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
1803 xen_hvm_callback_vector);
1803 } 1804 }
1804 } 1805 }
1805 #else 1806 #else
1806 void xen_callback_vector(void) {} 1807 void xen_callback_vector(void) {}
1807 #endif 1808 #endif
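[Editor's note] The HVM_CALLBACK_VECTOR() value handed to xen_set_callback_via() above packs a via-type tag together with the vector number into the 64-bit HVM_PARAM_CALLBACK_IRQ parameter. The sketch below mirrors what is assumed to be the usual encoding (via type 2, "vector", in the top byte and the IDT vector in the low bits); the real macro in the Xen headers should always be preferred.

/* Editor's sketch of the vector-callback encoding, under the assumption
 * stated above; example_callback_vector_via() is illustrative only. */
static inline uint64_t example_callback_vector_via(uint8_t vector)
{
	const uint64_t via_type_vector = 2;     /* assumed HVM_CALLBACK_VIA_TYPE_VECTOR */
	const unsigned int via_type_shift = 56; /* assumed HVM_CALLBACK_VIA_TYPE_SHIFT  */

	return (via_type_vector << via_type_shift) | vector;
}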
1808 1809
1809 void __init xen_init_IRQ(void) 1810 void __init xen_init_IRQ(void)
1810 { 1811 {
1811 int i; 1812 int i;
1812 1813
1813 evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), 1814 evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
1814 GFP_KERNEL); 1815 GFP_KERNEL);
1815 BUG_ON(!evtchn_to_irq); 1816 BUG_ON(!evtchn_to_irq);
1816 for (i = 0; i < NR_EVENT_CHANNELS; i++) 1817 for (i = 0; i < NR_EVENT_CHANNELS; i++)
1817 evtchn_to_irq[i] = -1; 1818 evtchn_to_irq[i] = -1;
1818 1819
1819 init_evtchn_cpu_bindings(); 1820 init_evtchn_cpu_bindings();
1820 1821
1821 /* No event channels are 'live' right now. */ 1822 /* No event channels are 'live' right now. */
1822 for (i = 0; i < NR_EVENT_CHANNELS; i++) 1823 for (i = 0; i < NR_EVENT_CHANNELS; i++)
1823 mask_evtchn(i); 1824 mask_evtchn(i);
1824 1825
1825 pirq_needs_eoi = pirq_needs_eoi_flag; 1826 pirq_needs_eoi = pirq_needs_eoi_flag;
1826 1827
1827 #ifdef CONFIG_X86 1828 #ifdef CONFIG_X86
1828 if (xen_hvm_domain()) { 1829 if (xen_hvm_domain()) {
1829 xen_callback_vector(); 1830 xen_callback_vector();
1830 native_init_IRQ(); 1831 native_init_IRQ();
1831 /* pci_xen_hvm_init must be called after native_init_IRQ so that 1832 /* pci_xen_hvm_init must be called after native_init_IRQ so that
1832 * __acpi_register_gsi can point at the right function */ 1833 * __acpi_register_gsi can point at the right function */
1833 pci_xen_hvm_init(); 1834 pci_xen_hvm_init();
1834 } else { 1835 } else {
1835 int rc; 1836 int rc;
1836 struct physdev_pirq_eoi_gmfn eoi_gmfn; 1837 struct physdev_pirq_eoi_gmfn eoi_gmfn;
1837 1838
1838 irq_ctx_init(smp_processor_id()); 1839 irq_ctx_init(smp_processor_id());
1839 if (xen_initial_domain()) 1840 if (xen_initial_domain())
1840 pci_xen_initial_domain(); 1841 pci_xen_initial_domain();
1841 1842
1842 pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); 1843 pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
1843 eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map); 1844 eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map);
1844 rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); 1845 rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn);
1845 if (rc != 0) { 1846 if (rc != 0) {
1846 free_page((unsigned long) pirq_eoi_map); 1847 free_page((unsigned long) pirq_eoi_map);
1847 pirq_eoi_map = NULL; 1848 pirq_eoi_map = NULL;
1848 } else 1849 } else
1849 pirq_needs_eoi = pirq_check_eoi_map; 1850 pirq_needs_eoi = pirq_check_eoi_map;
1850 } 1851 }
1851 #endif 1852 #endif
1852 } 1853 }
1853 1854
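[Editor's note] The PHYSDEVOP_pirq_eoi_gmfn_v2 call at the end of xen_init_IRQ() registers pirq_eoi_map with Xen, which then keeps that page up to date with the set of pirqs currently needing an EOI. Once that succeeds, pirq_needs_eoi can be answered with a bit test in the shared page instead of a hypercall per interrupt. A hedged sketch of what the pirq_check_eoi_map callback amounts to, using the pirq_from_irq() helper defined elsewhere in this file:

/* Editor's sketch of the fast pirq-needs-EOI check enabled above;
 * the real callback in this file may differ in detail. */
static bool example_pirq_check_eoi_map(unsigned int irq)
{
	return test_bit(pirq_from_irq(irq), pirq_eoi_map);
}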