Commit 56dd9470d7c8734f055da2a6bac553caf4a468eb
Parent: 6dbe51c251. Exists in smarc-l5.0.0_1.0.0-ga and 5 other branches.
context_tracking: Move exception handling to generic code
Exceptions handling on context tracking should share common treatment: on entry we exit user mode if the exception triggered in that context. Then on exception exit we return to that previous context. Generalize this to avoid duplication across archs.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Li Zhong <zhong@linux.vnet.ibm.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Mats Liljegren <mats.liljegren@enea.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
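The change is easiest to read against the x86 inlines deleted in the first hunk below. As a rough sketch, the generalized helpers presumably look much like those inlines; the header they land in (include/linux/context_tracking.h is among the five changed files but is not reproduced in this excerpt) and its exact guards are assumptions here, only the logic is taken from the code being removed:

    /*
     * Sketch of the generalized helpers, modeled on the x86 inlines this
     * commit deletes from arch/x86/include/asm/context_tracking.h.
     * Placement and guards are assumptions; user_exit()/user_enter() and
     * user_mode() are the existing context-tracking and ptrace helpers.
     */
    #include <asm/ptrace.h>

    static inline void exception_enter(struct pt_regs *regs)
    {
    	/* The exception may have interrupted user code: leave user context. */
    	user_exit();
    }

    static inline void exception_exit(struct pt_regs *regs)
    {
    #ifdef CONFIG_CONTEXT_TRACKING
    	/*
    	 * Return to the interrupted context: re-enter user mode if the
    	 * saved registers show the exception fired there.
    	 */
    	if (user_mode(regs))
    		user_enter();
    #endif
    }

With the helpers made generic, the arch callers only need to switch their include from <asm/context_tracking.h> to <linux/context_tracking.h>; the kvm.c and traps.c hunks below do exactly that, while the x86 header keeps only the SCHEDULE_USER assembly macro.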
Showing 5 changed files with 19 additions and 26 deletions
arch/x86/include/asm/context_tracking.h
1 | #ifndef _ASM_X86_CONTEXT_TRACKING_H | 1 | #ifndef _ASM_X86_CONTEXT_TRACKING_H |
2 | #define _ASM_X86_CONTEXT_TRACKING_H | 2 | #define _ASM_X86_CONTEXT_TRACKING_H |
3 | 3 | ||
4 | #ifndef __ASSEMBLY__ | ||
5 | #include <linux/context_tracking.h> | ||
6 | #include <asm/ptrace.h> | ||
7 | |||
8 | static inline void exception_enter(struct pt_regs *regs) | ||
9 | { | ||
10 | user_exit(); | ||
11 | } | ||
12 | |||
13 | static inline void exception_exit(struct pt_regs *regs) | ||
14 | { | ||
15 | #ifdef CONFIG_CONTEXT_TRACKING | 4 | #ifdef CONFIG_CONTEXT_TRACKING |
16 | if (user_mode(regs)) | ||
17 | user_enter(); | ||
18 | #endif | ||
19 | } | ||
20 | |||
21 | #else /* __ASSEMBLY__ */ | ||
22 | |||
23 | #ifdef CONFIG_CONTEXT_TRACKING | ||
24 | # define SCHEDULE_USER call schedule_user | 5 | # define SCHEDULE_USER call schedule_user |
25 | #else | 6 | #else |
26 | # define SCHEDULE_USER call schedule | 7 | # define SCHEDULE_USER call schedule |
27 | #endif | 8 | #endif |
28 | |||
29 | #endif /* !__ASSEMBLY__ */ | ||
30 | 9 | ||
31 | #endif | 10 | #endif |
32 | 11 |
arch/x86/kernel/kvm.c
1 | /* | 1 | /* |
2 | * KVM paravirt_ops implementation | 2 | * KVM paravirt_ops implementation |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify | 4 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License as published by | 5 | * it under the terms of the GNU General Public License as published by |
6 | * the Free Software Foundation; either version 2 of the License, or | 6 | * the Free Software Foundation; either version 2 of the License, or |
7 | * (at your option) any later version. | 7 | * (at your option) any later version. |
8 | * | 8 | * |
9 | * This program is distributed in the hope that it will be useful, | 9 | * This program is distributed in the hope that it will be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | * GNU General Public License for more details. | 12 | * GNU General Public License for more details. |
13 | * | 13 | * |
14 | * You should have received a copy of the GNU General Public License | 14 | * You should have received a copy of the GNU General Public License |
15 | * along with this program; if not, write to the Free Software | 15 | * along with this program; if not, write to the Free Software |
16 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | 16 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
17 | * | 17 | * |
18 | * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | 18 | * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com> |
19 | * Copyright IBM Corporation, 2007 | 19 | * Copyright IBM Corporation, 2007 |
20 | * Authors: Anthony Liguori <aliguori@us.ibm.com> | 20 | * Authors: Anthony Liguori <aliguori@us.ibm.com> |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #include <linux/context_tracking.h> | ||
23 | #include <linux/module.h> | 24 | #include <linux/module.h> |
24 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
25 | #include <linux/kvm_para.h> | 26 | #include <linux/kvm_para.h> |
26 | #include <linux/cpu.h> | 27 | #include <linux/cpu.h> |
27 | #include <linux/mm.h> | 28 | #include <linux/mm.h> |
28 | #include <linux/highmem.h> | 29 | #include <linux/highmem.h> |
29 | #include <linux/hardirq.h> | 30 | #include <linux/hardirq.h> |
30 | #include <linux/notifier.h> | 31 | #include <linux/notifier.h> |
31 | #include <linux/reboot.h> | 32 | #include <linux/reboot.h> |
32 | #include <linux/hash.h> | 33 | #include <linux/hash.h> |
33 | #include <linux/sched.h> | 34 | #include <linux/sched.h> |
34 | #include <linux/slab.h> | 35 | #include <linux/slab.h> |
35 | #include <linux/kprobes.h> | 36 | #include <linux/kprobes.h> |
36 | #include <asm/timer.h> | 37 | #include <asm/timer.h> |
37 | #include <asm/cpu.h> | 38 | #include <asm/cpu.h> |
38 | #include <asm/traps.h> | 39 | #include <asm/traps.h> |
39 | #include <asm/desc.h> | 40 | #include <asm/desc.h> |
40 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
41 | #include <asm/idle.h> | 42 | #include <asm/idle.h> |
42 | #include <asm/apic.h> | 43 | #include <asm/apic.h> |
43 | #include <asm/apicdef.h> | 44 | #include <asm/apicdef.h> |
44 | #include <asm/hypervisor.h> | 45 | #include <asm/hypervisor.h> |
45 | #include <asm/kvm_guest.h> | 46 | #include <asm/kvm_guest.h> |
46 | #include <asm/context_tracking.h> | ||
47 | 47 | ||
48 | static int kvmapf = 1; | 48 | static int kvmapf = 1; |
49 | 49 | ||
50 | static int parse_no_kvmapf(char *arg) | 50 | static int parse_no_kvmapf(char *arg) |
51 | { | 51 | { |
52 | kvmapf = 0; | 52 | kvmapf = 0; |
53 | return 0; | 53 | return 0; |
54 | } | 54 | } |
55 | 55 | ||
56 | early_param("no-kvmapf", parse_no_kvmapf); | 56 | early_param("no-kvmapf", parse_no_kvmapf); |
57 | 57 | ||
58 | static int steal_acc = 1; | 58 | static int steal_acc = 1; |
59 | static int parse_no_stealacc(char *arg) | 59 | static int parse_no_stealacc(char *arg) |
60 | { | 60 | { |
61 | steal_acc = 0; | 61 | steal_acc = 0; |
62 | return 0; | 62 | return 0; |
63 | } | 63 | } |
64 | 64 | ||
65 | early_param("no-steal-acc", parse_no_stealacc); | 65 | early_param("no-steal-acc", parse_no_stealacc); |
66 | 66 | ||
67 | static int kvmclock_vsyscall = 1; | 67 | static int kvmclock_vsyscall = 1; |
68 | static int parse_no_kvmclock_vsyscall(char *arg) | 68 | static int parse_no_kvmclock_vsyscall(char *arg) |
69 | { | 69 | { |
70 | kvmclock_vsyscall = 0; | 70 | kvmclock_vsyscall = 0; |
71 | return 0; | 71 | return 0; |
72 | } | 72 | } |
73 | 73 | ||
74 | early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); | 74 | early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); |
75 | 75 | ||
76 | static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); | 76 | static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); |
77 | static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); | 77 | static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); |
78 | static int has_steal_clock = 0; | 78 | static int has_steal_clock = 0; |
79 | 79 | ||
80 | /* | 80 | /* |
81 | * No need for any "IO delay" on KVM | 81 | * No need for any "IO delay" on KVM |
82 | */ | 82 | */ |
83 | static void kvm_io_delay(void) | 83 | static void kvm_io_delay(void) |
84 | { | 84 | { |
85 | } | 85 | } |
86 | 86 | ||
87 | #define KVM_TASK_SLEEP_HASHBITS 8 | 87 | #define KVM_TASK_SLEEP_HASHBITS 8 |
88 | #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS) | 88 | #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS) |
89 | 89 | ||
90 | struct kvm_task_sleep_node { | 90 | struct kvm_task_sleep_node { |
91 | struct hlist_node link; | 91 | struct hlist_node link; |
92 | wait_queue_head_t wq; | 92 | wait_queue_head_t wq; |
93 | u32 token; | 93 | u32 token; |
94 | int cpu; | 94 | int cpu; |
95 | bool halted; | 95 | bool halted; |
96 | }; | 96 | }; |
97 | 97 | ||
98 | static struct kvm_task_sleep_head { | 98 | static struct kvm_task_sleep_head { |
99 | spinlock_t lock; | 99 | spinlock_t lock; |
100 | struct hlist_head list; | 100 | struct hlist_head list; |
101 | } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; | 101 | } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; |
102 | 102 | ||
103 | static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, | 103 | static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, |
104 | u32 token) | 104 | u32 token) |
105 | { | 105 | { |
106 | struct hlist_node *p; | 106 | struct hlist_node *p; |
107 | 107 | ||
108 | hlist_for_each(p, &b->list) { | 108 | hlist_for_each(p, &b->list) { |
109 | struct kvm_task_sleep_node *n = | 109 | struct kvm_task_sleep_node *n = |
110 | hlist_entry(p, typeof(*n), link); | 110 | hlist_entry(p, typeof(*n), link); |
111 | if (n->token == token) | 111 | if (n->token == token) |
112 | return n; | 112 | return n; |
113 | } | 113 | } |
114 | 114 | ||
115 | return NULL; | 115 | return NULL; |
116 | } | 116 | } |
117 | 117 | ||
118 | void kvm_async_pf_task_wait(u32 token) | 118 | void kvm_async_pf_task_wait(u32 token) |
119 | { | 119 | { |
120 | u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); | 120 | u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); |
121 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; | 121 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; |
122 | struct kvm_task_sleep_node n, *e; | 122 | struct kvm_task_sleep_node n, *e; |
123 | DEFINE_WAIT(wait); | 123 | DEFINE_WAIT(wait); |
124 | 124 | ||
125 | rcu_irq_enter(); | 125 | rcu_irq_enter(); |
126 | 126 | ||
127 | spin_lock(&b->lock); | 127 | spin_lock(&b->lock); |
128 | e = _find_apf_task(b, token); | 128 | e = _find_apf_task(b, token); |
129 | if (e) { | 129 | if (e) { |
130 | /* dummy entry exist -> wake up was delivered ahead of PF */ | 130 | /* dummy entry exist -> wake up was delivered ahead of PF */ |
131 | hlist_del(&e->link); | 131 | hlist_del(&e->link); |
132 | kfree(e); | 132 | kfree(e); |
133 | spin_unlock(&b->lock); | 133 | spin_unlock(&b->lock); |
134 | 134 | ||
135 | rcu_irq_exit(); | 135 | rcu_irq_exit(); |
136 | return; | 136 | return; |
137 | } | 137 | } |
138 | 138 | ||
139 | n.token = token; | 139 | n.token = token; |
140 | n.cpu = smp_processor_id(); | 140 | n.cpu = smp_processor_id(); |
141 | n.halted = is_idle_task(current) || preempt_count() > 1; | 141 | n.halted = is_idle_task(current) || preempt_count() > 1; |
142 | init_waitqueue_head(&n.wq); | 142 | init_waitqueue_head(&n.wq); |
143 | hlist_add_head(&n.link, &b->list); | 143 | hlist_add_head(&n.link, &b->list); |
144 | spin_unlock(&b->lock); | 144 | spin_unlock(&b->lock); |
145 | 145 | ||
146 | for (;;) { | 146 | for (;;) { |
147 | if (!n.halted) | 147 | if (!n.halted) |
148 | prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); | 148 | prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); |
149 | if (hlist_unhashed(&n.link)) | 149 | if (hlist_unhashed(&n.link)) |
150 | break; | 150 | break; |
151 | 151 | ||
152 | if (!n.halted) { | 152 | if (!n.halted) { |
153 | local_irq_enable(); | 153 | local_irq_enable(); |
154 | schedule(); | 154 | schedule(); |
155 | local_irq_disable(); | 155 | local_irq_disable(); |
156 | } else { | 156 | } else { |
157 | /* | 157 | /* |
158 | * We cannot reschedule. So halt. | 158 | * We cannot reschedule. So halt. |
159 | */ | 159 | */ |
160 | rcu_irq_exit(); | 160 | rcu_irq_exit(); |
161 | native_safe_halt(); | 161 | native_safe_halt(); |
162 | rcu_irq_enter(); | 162 | rcu_irq_enter(); |
163 | local_irq_disable(); | 163 | local_irq_disable(); |
164 | } | 164 | } |
165 | } | 165 | } |
166 | if (!n.halted) | 166 | if (!n.halted) |
167 | finish_wait(&n.wq, &wait); | 167 | finish_wait(&n.wq, &wait); |
168 | 168 | ||
169 | rcu_irq_exit(); | 169 | rcu_irq_exit(); |
170 | return; | 170 | return; |
171 | } | 171 | } |
172 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); | 172 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); |
173 | 173 | ||
174 | static void apf_task_wake_one(struct kvm_task_sleep_node *n) | 174 | static void apf_task_wake_one(struct kvm_task_sleep_node *n) |
175 | { | 175 | { |
176 | hlist_del_init(&n->link); | 176 | hlist_del_init(&n->link); |
177 | if (n->halted) | 177 | if (n->halted) |
178 | smp_send_reschedule(n->cpu); | 178 | smp_send_reschedule(n->cpu); |
179 | else if (waitqueue_active(&n->wq)) | 179 | else if (waitqueue_active(&n->wq)) |
180 | wake_up(&n->wq); | 180 | wake_up(&n->wq); |
181 | } | 181 | } |
182 | 182 | ||
183 | static void apf_task_wake_all(void) | 183 | static void apf_task_wake_all(void) |
184 | { | 184 | { |
185 | int i; | 185 | int i; |
186 | 186 | ||
187 | for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { | 187 | for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { |
188 | struct hlist_node *p, *next; | 188 | struct hlist_node *p, *next; |
189 | struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; | 189 | struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; |
190 | spin_lock(&b->lock); | 190 | spin_lock(&b->lock); |
191 | hlist_for_each_safe(p, next, &b->list) { | 191 | hlist_for_each_safe(p, next, &b->list) { |
192 | struct kvm_task_sleep_node *n = | 192 | struct kvm_task_sleep_node *n = |
193 | hlist_entry(p, typeof(*n), link); | 193 | hlist_entry(p, typeof(*n), link); |
194 | if (n->cpu == smp_processor_id()) | 194 | if (n->cpu == smp_processor_id()) |
195 | apf_task_wake_one(n); | 195 | apf_task_wake_one(n); |
196 | } | 196 | } |
197 | spin_unlock(&b->lock); | 197 | spin_unlock(&b->lock); |
198 | } | 198 | } |
199 | } | 199 | } |
200 | 200 | ||
201 | void kvm_async_pf_task_wake(u32 token) | 201 | void kvm_async_pf_task_wake(u32 token) |
202 | { | 202 | { |
203 | u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); | 203 | u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); |
204 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; | 204 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; |
205 | struct kvm_task_sleep_node *n; | 205 | struct kvm_task_sleep_node *n; |
206 | 206 | ||
207 | if (token == ~0) { | 207 | if (token == ~0) { |
208 | apf_task_wake_all(); | 208 | apf_task_wake_all(); |
209 | return; | 209 | return; |
210 | } | 210 | } |
211 | 211 | ||
212 | again: | 212 | again: |
213 | spin_lock(&b->lock); | 213 | spin_lock(&b->lock); |
214 | n = _find_apf_task(b, token); | 214 | n = _find_apf_task(b, token); |
215 | if (!n) { | 215 | if (!n) { |
216 | /* | 216 | /* |
217 | * async PF was not yet handled. | 217 | * async PF was not yet handled. |
218 | * Add dummy entry for the token. | 218 | * Add dummy entry for the token. |
219 | */ | 219 | */ |
220 | n = kzalloc(sizeof(*n), GFP_ATOMIC); | 220 | n = kzalloc(sizeof(*n), GFP_ATOMIC); |
221 | if (!n) { | 221 | if (!n) { |
222 | /* | 222 | /* |
223 | * Allocation failed! Busy wait while other cpu | 223 | * Allocation failed! Busy wait while other cpu |
224 | * handles async PF. | 224 | * handles async PF. |
225 | */ | 225 | */ |
226 | spin_unlock(&b->lock); | 226 | spin_unlock(&b->lock); |
227 | cpu_relax(); | 227 | cpu_relax(); |
228 | goto again; | 228 | goto again; |
229 | } | 229 | } |
230 | n->token = token; | 230 | n->token = token; |
231 | n->cpu = smp_processor_id(); | 231 | n->cpu = smp_processor_id(); |
232 | init_waitqueue_head(&n->wq); | 232 | init_waitqueue_head(&n->wq); |
233 | hlist_add_head(&n->link, &b->list); | 233 | hlist_add_head(&n->link, &b->list); |
234 | } else | 234 | } else |
235 | apf_task_wake_one(n); | 235 | apf_task_wake_one(n); |
236 | spin_unlock(&b->lock); | 236 | spin_unlock(&b->lock); |
237 | return; | 237 | return; |
238 | } | 238 | } |
239 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); | 239 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); |
240 | 240 | ||
241 | u32 kvm_read_and_reset_pf_reason(void) | 241 | u32 kvm_read_and_reset_pf_reason(void) |
242 | { | 242 | { |
243 | u32 reason = 0; | 243 | u32 reason = 0; |
244 | 244 | ||
245 | if (__get_cpu_var(apf_reason).enabled) { | 245 | if (__get_cpu_var(apf_reason).enabled) { |
246 | reason = __get_cpu_var(apf_reason).reason; | 246 | reason = __get_cpu_var(apf_reason).reason; |
247 | __get_cpu_var(apf_reason).reason = 0; | 247 | __get_cpu_var(apf_reason).reason = 0; |
248 | } | 248 | } |
249 | 249 | ||
250 | return reason; | 250 | return reason; |
251 | } | 251 | } |
252 | EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); | 252 | EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); |
253 | 253 | ||
254 | dotraplinkage void __kprobes | 254 | dotraplinkage void __kprobes |
255 | do_async_page_fault(struct pt_regs *regs, unsigned long error_code) | 255 | do_async_page_fault(struct pt_regs *regs, unsigned long error_code) |
256 | { | 256 | { |
257 | switch (kvm_read_and_reset_pf_reason()) { | 257 | switch (kvm_read_and_reset_pf_reason()) { |
258 | default: | 258 | default: |
259 | do_page_fault(regs, error_code); | 259 | do_page_fault(regs, error_code); |
260 | break; | 260 | break; |
261 | case KVM_PV_REASON_PAGE_NOT_PRESENT: | 261 | case KVM_PV_REASON_PAGE_NOT_PRESENT: |
262 | /* page is swapped out by the host. */ | 262 | /* page is swapped out by the host. */ |
263 | exception_enter(regs); | 263 | exception_enter(regs); |
264 | exit_idle(); | 264 | exit_idle(); |
265 | kvm_async_pf_task_wait((u32)read_cr2()); | 265 | kvm_async_pf_task_wait((u32)read_cr2()); |
266 | exception_exit(regs); | 266 | exception_exit(regs); |
267 | break; | 267 | break; |
268 | case KVM_PV_REASON_PAGE_READY: | 268 | case KVM_PV_REASON_PAGE_READY: |
269 | rcu_irq_enter(); | 269 | rcu_irq_enter(); |
270 | exit_idle(); | 270 | exit_idle(); |
271 | kvm_async_pf_task_wake((u32)read_cr2()); | 271 | kvm_async_pf_task_wake((u32)read_cr2()); |
272 | rcu_irq_exit(); | 272 | rcu_irq_exit(); |
273 | break; | 273 | break; |
274 | } | 274 | } |
275 | } | 275 | } |
276 | 276 | ||
277 | static void __init paravirt_ops_setup(void) | 277 | static void __init paravirt_ops_setup(void) |
278 | { | 278 | { |
279 | pv_info.name = "KVM"; | 279 | pv_info.name = "KVM"; |
280 | pv_info.paravirt_enabled = 1; | 280 | pv_info.paravirt_enabled = 1; |
281 | 281 | ||
282 | if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) | 282 | if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) |
283 | pv_cpu_ops.io_delay = kvm_io_delay; | 283 | pv_cpu_ops.io_delay = kvm_io_delay; |
284 | 284 | ||
285 | #ifdef CONFIG_X86_IO_APIC | 285 | #ifdef CONFIG_X86_IO_APIC |
286 | no_timer_check = 1; | 286 | no_timer_check = 1; |
287 | #endif | 287 | #endif |
288 | } | 288 | } |
289 | 289 | ||
290 | static void kvm_register_steal_time(void) | 290 | static void kvm_register_steal_time(void) |
291 | { | 291 | { |
292 | int cpu = smp_processor_id(); | 292 | int cpu = smp_processor_id(); |
293 | struct kvm_steal_time *st = &per_cpu(steal_time, cpu); | 293 | struct kvm_steal_time *st = &per_cpu(steal_time, cpu); |
294 | 294 | ||
295 | if (!has_steal_clock) | 295 | if (!has_steal_clock) |
296 | return; | 296 | return; |
297 | 297 | ||
298 | memset(st, 0, sizeof(*st)); | 298 | memset(st, 0, sizeof(*st)); |
299 | 299 | ||
300 | wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); | 300 | wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); |
301 | pr_info("kvm-stealtime: cpu %d, msr %llx\n", | 301 | pr_info("kvm-stealtime: cpu %d, msr %llx\n", |
302 | cpu, (unsigned long long) slow_virt_to_phys(st)); | 302 | cpu, (unsigned long long) slow_virt_to_phys(st)); |
303 | } | 303 | } |
304 | 304 | ||
305 | static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; | 305 | static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; |
306 | 306 | ||
307 | static void kvm_guest_apic_eoi_write(u32 reg, u32 val) | 307 | static void kvm_guest_apic_eoi_write(u32 reg, u32 val) |
308 | { | 308 | { |
309 | /** | 309 | /** |
310 | * This relies on __test_and_clear_bit to modify the memory | 310 | * This relies on __test_and_clear_bit to modify the memory |
311 | * in a way that is atomic with respect to the local CPU. | 311 | * in a way that is atomic with respect to the local CPU. |
312 | * The hypervisor only accesses this memory from the local CPU so | 312 | * The hypervisor only accesses this memory from the local CPU so |
313 | * there's no need for lock or memory barriers. | 313 | * there's no need for lock or memory barriers. |
314 | * An optimization barrier is implied in apic write. | 314 | * An optimization barrier is implied in apic write. |
315 | */ | 315 | */ |
316 | if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) | 316 | if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) |
317 | return; | 317 | return; |
318 | apic_write(APIC_EOI, APIC_EOI_ACK); | 318 | apic_write(APIC_EOI, APIC_EOI_ACK); |
319 | } | 319 | } |
320 | 320 | ||
321 | void __cpuinit kvm_guest_cpu_init(void) | 321 | void __cpuinit kvm_guest_cpu_init(void) |
322 | { | 322 | { |
323 | if (!kvm_para_available()) | 323 | if (!kvm_para_available()) |
324 | return; | 324 | return; |
325 | 325 | ||
326 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { | 326 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { |
327 | u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason)); | 327 | u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason)); |
328 | 328 | ||
329 | #ifdef CONFIG_PREEMPT | 329 | #ifdef CONFIG_PREEMPT |
330 | pa |= KVM_ASYNC_PF_SEND_ALWAYS; | 330 | pa |= KVM_ASYNC_PF_SEND_ALWAYS; |
331 | #endif | 331 | #endif |
332 | wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); | 332 | wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); |
333 | __get_cpu_var(apf_reason).enabled = 1; | 333 | __get_cpu_var(apf_reason).enabled = 1; |
334 | printk(KERN_INFO"KVM setup async PF for cpu %d\n", | 334 | printk(KERN_INFO"KVM setup async PF for cpu %d\n", |
335 | smp_processor_id()); | 335 | smp_processor_id()); |
336 | } | 336 | } |
337 | 337 | ||
338 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { | 338 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { |
339 | unsigned long pa; | 339 | unsigned long pa; |
340 | /* Size alignment is implied but just to make it explicit. */ | 340 | /* Size alignment is implied but just to make it explicit. */ |
341 | BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); | 341 | BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); |
342 | __get_cpu_var(kvm_apic_eoi) = 0; | 342 | __get_cpu_var(kvm_apic_eoi) = 0; |
343 | pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi)) | 343 | pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi)) |
344 | | KVM_MSR_ENABLED; | 344 | | KVM_MSR_ENABLED; |
345 | wrmsrl(MSR_KVM_PV_EOI_EN, pa); | 345 | wrmsrl(MSR_KVM_PV_EOI_EN, pa); |
346 | } | 346 | } |
347 | 347 | ||
348 | if (has_steal_clock) | 348 | if (has_steal_clock) |
349 | kvm_register_steal_time(); | 349 | kvm_register_steal_time(); |
350 | } | 350 | } |
351 | 351 | ||
352 | static void kvm_pv_disable_apf(void) | 352 | static void kvm_pv_disable_apf(void) |
353 | { | 353 | { |
354 | if (!__get_cpu_var(apf_reason).enabled) | 354 | if (!__get_cpu_var(apf_reason).enabled) |
355 | return; | 355 | return; |
356 | 356 | ||
357 | wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); | 357 | wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); |
358 | __get_cpu_var(apf_reason).enabled = 0; | 358 | __get_cpu_var(apf_reason).enabled = 0; |
359 | 359 | ||
360 | printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", | 360 | printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", |
361 | smp_processor_id()); | 361 | smp_processor_id()); |
362 | } | 362 | } |
363 | 363 | ||
364 | static void kvm_pv_guest_cpu_reboot(void *unused) | 364 | static void kvm_pv_guest_cpu_reboot(void *unused) |
365 | { | 365 | { |
366 | /* | 366 | /* |
367 | * We disable PV EOI before we load a new kernel by kexec, | 367 | * We disable PV EOI before we load a new kernel by kexec, |
368 | * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. | 368 | * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. |
369 | * New kernel can re-enable when it boots. | 369 | * New kernel can re-enable when it boots. |
370 | */ | 370 | */ |
371 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) | 371 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) |
372 | wrmsrl(MSR_KVM_PV_EOI_EN, 0); | 372 | wrmsrl(MSR_KVM_PV_EOI_EN, 0); |
373 | kvm_pv_disable_apf(); | 373 | kvm_pv_disable_apf(); |
374 | kvm_disable_steal_time(); | 374 | kvm_disable_steal_time(); |
375 | } | 375 | } |
376 | 376 | ||
377 | static int kvm_pv_reboot_notify(struct notifier_block *nb, | 377 | static int kvm_pv_reboot_notify(struct notifier_block *nb, |
378 | unsigned long code, void *unused) | 378 | unsigned long code, void *unused) |
379 | { | 379 | { |
380 | if (code == SYS_RESTART) | 380 | if (code == SYS_RESTART) |
381 | on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); | 381 | on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); |
382 | return NOTIFY_DONE; | 382 | return NOTIFY_DONE; |
383 | } | 383 | } |
384 | 384 | ||
385 | static struct notifier_block kvm_pv_reboot_nb = { | 385 | static struct notifier_block kvm_pv_reboot_nb = { |
386 | .notifier_call = kvm_pv_reboot_notify, | 386 | .notifier_call = kvm_pv_reboot_notify, |
387 | }; | 387 | }; |
388 | 388 | ||
389 | static u64 kvm_steal_clock(int cpu) | 389 | static u64 kvm_steal_clock(int cpu) |
390 | { | 390 | { |
391 | u64 steal; | 391 | u64 steal; |
392 | struct kvm_steal_time *src; | 392 | struct kvm_steal_time *src; |
393 | int version; | 393 | int version; |
394 | 394 | ||
395 | src = &per_cpu(steal_time, cpu); | 395 | src = &per_cpu(steal_time, cpu); |
396 | do { | 396 | do { |
397 | version = src->version; | 397 | version = src->version; |
398 | rmb(); | 398 | rmb(); |
399 | steal = src->steal; | 399 | steal = src->steal; |
400 | rmb(); | 400 | rmb(); |
401 | } while ((version & 1) || (version != src->version)); | 401 | } while ((version & 1) || (version != src->version)); |
402 | 402 | ||
403 | return steal; | 403 | return steal; |
404 | } | 404 | } |
405 | 405 | ||
406 | void kvm_disable_steal_time(void) | 406 | void kvm_disable_steal_time(void) |
407 | { | 407 | { |
408 | if (!has_steal_clock) | 408 | if (!has_steal_clock) |
409 | return; | 409 | return; |
410 | 410 | ||
411 | wrmsr(MSR_KVM_STEAL_TIME, 0, 0); | 411 | wrmsr(MSR_KVM_STEAL_TIME, 0, 0); |
412 | } | 412 | } |
413 | 413 | ||
414 | #ifdef CONFIG_SMP | 414 | #ifdef CONFIG_SMP |
415 | static void __init kvm_smp_prepare_boot_cpu(void) | 415 | static void __init kvm_smp_prepare_boot_cpu(void) |
416 | { | 416 | { |
417 | WARN_ON(kvm_register_clock("primary cpu clock")); | 417 | WARN_ON(kvm_register_clock("primary cpu clock")); |
418 | kvm_guest_cpu_init(); | 418 | kvm_guest_cpu_init(); |
419 | native_smp_prepare_boot_cpu(); | 419 | native_smp_prepare_boot_cpu(); |
420 | } | 420 | } |
421 | 421 | ||
422 | static void __cpuinit kvm_guest_cpu_online(void *dummy) | 422 | static void __cpuinit kvm_guest_cpu_online(void *dummy) |
423 | { | 423 | { |
424 | kvm_guest_cpu_init(); | 424 | kvm_guest_cpu_init(); |
425 | } | 425 | } |
426 | 426 | ||
427 | static void kvm_guest_cpu_offline(void *dummy) | 427 | static void kvm_guest_cpu_offline(void *dummy) |
428 | { | 428 | { |
429 | kvm_disable_steal_time(); | 429 | kvm_disable_steal_time(); |
430 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) | 430 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) |
431 | wrmsrl(MSR_KVM_PV_EOI_EN, 0); | 431 | wrmsrl(MSR_KVM_PV_EOI_EN, 0); |
432 | kvm_pv_disable_apf(); | 432 | kvm_pv_disable_apf(); |
433 | apf_task_wake_all(); | 433 | apf_task_wake_all(); |
434 | } | 434 | } |
435 | 435 | ||
436 | static int __cpuinit kvm_cpu_notify(struct notifier_block *self, | 436 | static int __cpuinit kvm_cpu_notify(struct notifier_block *self, |
437 | unsigned long action, void *hcpu) | 437 | unsigned long action, void *hcpu) |
438 | { | 438 | { |
439 | int cpu = (unsigned long)hcpu; | 439 | int cpu = (unsigned long)hcpu; |
440 | switch (action) { | 440 | switch (action) { |
441 | case CPU_ONLINE: | 441 | case CPU_ONLINE: |
442 | case CPU_DOWN_FAILED: | 442 | case CPU_DOWN_FAILED: |
443 | case CPU_ONLINE_FROZEN: | 443 | case CPU_ONLINE_FROZEN: |
444 | smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0); | 444 | smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0); |
445 | break; | 445 | break; |
446 | case CPU_DOWN_PREPARE: | 446 | case CPU_DOWN_PREPARE: |
447 | case CPU_DOWN_PREPARE_FROZEN: | 447 | case CPU_DOWN_PREPARE_FROZEN: |
448 | smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1); | 448 | smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1); |
449 | break; | 449 | break; |
450 | default: | 450 | default: |
451 | break; | 451 | break; |
452 | } | 452 | } |
453 | return NOTIFY_OK; | 453 | return NOTIFY_OK; |
454 | } | 454 | } |
455 | 455 | ||
456 | static struct notifier_block __cpuinitdata kvm_cpu_notifier = { | 456 | static struct notifier_block __cpuinitdata kvm_cpu_notifier = { |
457 | .notifier_call = kvm_cpu_notify, | 457 | .notifier_call = kvm_cpu_notify, |
458 | }; | 458 | }; |
459 | #endif | 459 | #endif |
460 | 460 | ||
461 | static void __init kvm_apf_trap_init(void) | 461 | static void __init kvm_apf_trap_init(void) |
462 | { | 462 | { |
463 | set_intr_gate(14, &async_page_fault); | 463 | set_intr_gate(14, &async_page_fault); |
464 | } | 464 | } |
465 | 465 | ||
466 | void __init kvm_guest_init(void) | 466 | void __init kvm_guest_init(void) |
467 | { | 467 | { |
468 | int i; | 468 | int i; |
469 | 469 | ||
470 | if (!kvm_para_available()) | 470 | if (!kvm_para_available()) |
471 | return; | 471 | return; |
472 | 472 | ||
473 | paravirt_ops_setup(); | 473 | paravirt_ops_setup(); |
474 | register_reboot_notifier(&kvm_pv_reboot_nb); | 474 | register_reboot_notifier(&kvm_pv_reboot_nb); |
475 | for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) | 475 | for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) |
476 | spin_lock_init(&async_pf_sleepers[i].lock); | 476 | spin_lock_init(&async_pf_sleepers[i].lock); |
477 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) | 477 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) |
478 | x86_init.irqs.trap_init = kvm_apf_trap_init; | 478 | x86_init.irqs.trap_init = kvm_apf_trap_init; |
479 | 479 | ||
480 | if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { | 480 | if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { |
481 | has_steal_clock = 1; | 481 | has_steal_clock = 1; |
482 | pv_time_ops.steal_clock = kvm_steal_clock; | 482 | pv_time_ops.steal_clock = kvm_steal_clock; |
483 | } | 483 | } |
484 | 484 | ||
485 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) | 485 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) |
486 | apic_set_eoi_write(kvm_guest_apic_eoi_write); | 486 | apic_set_eoi_write(kvm_guest_apic_eoi_write); |
487 | 487 | ||
488 | if (kvmclock_vsyscall) | 488 | if (kvmclock_vsyscall) |
489 | kvm_setup_vsyscall_timeinfo(); | 489 | kvm_setup_vsyscall_timeinfo(); |
490 | 490 | ||
491 | #ifdef CONFIG_SMP | 491 | #ifdef CONFIG_SMP |
492 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | 492 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; |
493 | register_cpu_notifier(&kvm_cpu_notifier); | 493 | register_cpu_notifier(&kvm_cpu_notifier); |
494 | #else | 494 | #else |
495 | kvm_guest_cpu_init(); | 495 | kvm_guest_cpu_init(); |
496 | #endif | 496 | #endif |
497 | } | 497 | } |
498 | 498 | ||
499 | static bool __init kvm_detect(void) | 499 | static bool __init kvm_detect(void) |
500 | { | 500 | { |
501 | if (!kvm_para_available()) | 501 | if (!kvm_para_available()) |
502 | return false; | 502 | return false; |
503 | return true; | 503 | return true; |
504 | } | 504 | } |
505 | 505 | ||
506 | const struct hypervisor_x86 x86_hyper_kvm __refconst = { | 506 | const struct hypervisor_x86 x86_hyper_kvm __refconst = { |
507 | .name = "KVM", | 507 | .name = "KVM", |
508 | .detect = kvm_detect, | 508 | .detect = kvm_detect, |
509 | .x2apic_available = kvm_para_available, | 509 | .x2apic_available = kvm_para_available, |
510 | }; | 510 | }; |
511 | EXPORT_SYMBOL_GPL(x86_hyper_kvm); | 511 | EXPORT_SYMBOL_GPL(x86_hyper_kvm); |
512 | 512 | ||
513 | static __init int activate_jump_labels(void) | 513 | static __init int activate_jump_labels(void) |
514 | { | 514 | { |
515 | if (has_steal_clock) { | 515 | if (has_steal_clock) { |
516 | static_key_slow_inc(¶virt_steal_enabled); | 516 | static_key_slow_inc(¶virt_steal_enabled); |
517 | if (steal_acc) | 517 | if (steal_acc) |
518 | static_key_slow_inc(¶virt_steal_rq_enabled); | 518 | static_key_slow_inc(¶virt_steal_rq_enabled); |
519 | } | 519 | } |
520 | 520 | ||
521 | return 0; | 521 | return 0; |
522 | } | 522 | } |
523 | arch_initcall(activate_jump_labels); | 523 | arch_initcall(activate_jump_labels); |
arch/x86/kernel/traps.c
1 | /* | 1 | /* |
2 | * Copyright (C) 1991, 1992 Linus Torvalds | 2 | * Copyright (C) 1991, 1992 Linus Torvalds |
3 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | 3 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs |
4 | * | 4 | * |
5 | * Pentium III FXSR, SSE support | 5 | * Pentium III FXSR, SSE support |
6 | * Gareth Hughes <gareth@valinux.com>, May 2000 | 6 | * Gareth Hughes <gareth@valinux.com>, May 2000 |
7 | */ | 7 | */ |
8 | 8 | ||
9 | /* | 9 | /* |
10 | * Handle hardware traps and faults. | 10 | * Handle hardware traps and faults. |
11 | */ | 11 | */ |
12 | 12 | ||
13 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 13 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
14 | 14 | ||
15 | #include <linux/context_tracking.h> | ||
15 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
16 | #include <linux/kallsyms.h> | 17 | #include <linux/kallsyms.h> |
17 | #include <linux/spinlock.h> | 18 | #include <linux/spinlock.h> |
18 | #include <linux/kprobes.h> | 19 | #include <linux/kprobes.h> |
19 | #include <linux/uaccess.h> | 20 | #include <linux/uaccess.h> |
20 | #include <linux/kdebug.h> | 21 | #include <linux/kdebug.h> |
21 | #include <linux/kgdb.h> | 22 | #include <linux/kgdb.h> |
22 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
23 | #include <linux/module.h> | 24 | #include <linux/module.h> |
24 | #include <linux/ptrace.h> | 25 | #include <linux/ptrace.h> |
25 | #include <linux/string.h> | 26 | #include <linux/string.h> |
26 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
27 | #include <linux/errno.h> | 28 | #include <linux/errno.h> |
28 | #include <linux/kexec.h> | 29 | #include <linux/kexec.h> |
29 | #include <linux/sched.h> | 30 | #include <linux/sched.h> |
30 | #include <linux/timer.h> | 31 | #include <linux/timer.h> |
31 | #include <linux/init.h> | 32 | #include <linux/init.h> |
32 | #include <linux/bug.h> | 33 | #include <linux/bug.h> |
33 | #include <linux/nmi.h> | 34 | #include <linux/nmi.h> |
34 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
35 | #include <linux/smp.h> | 36 | #include <linux/smp.h> |
36 | #include <linux/io.h> | 37 | #include <linux/io.h> |
37 | 38 | ||
38 | #ifdef CONFIG_EISA | 39 | #ifdef CONFIG_EISA |
39 | #include <linux/ioport.h> | 40 | #include <linux/ioport.h> |
40 | #include <linux/eisa.h> | 41 | #include <linux/eisa.h> |
41 | #endif | 42 | #endif |
42 | 43 | ||
43 | #if defined(CONFIG_EDAC) | 44 | #if defined(CONFIG_EDAC) |
44 | #include <linux/edac.h> | 45 | #include <linux/edac.h> |
45 | #endif | 46 | #endif |
46 | 47 | ||
47 | #include <asm/kmemcheck.h> | 48 | #include <asm/kmemcheck.h> |
48 | #include <asm/stacktrace.h> | 49 | #include <asm/stacktrace.h> |
49 | #include <asm/processor.h> | 50 | #include <asm/processor.h> |
50 | #include <asm/debugreg.h> | 51 | #include <asm/debugreg.h> |
51 | #include <linux/atomic.h> | 52 | #include <linux/atomic.h> |
52 | #include <asm/ftrace.h> | 53 | #include <asm/ftrace.h> |
53 | #include <asm/traps.h> | 54 | #include <asm/traps.h> |
54 | #include <asm/desc.h> | 55 | #include <asm/desc.h> |
55 | #include <asm/i387.h> | 56 | #include <asm/i387.h> |
56 | #include <asm/fpu-internal.h> | 57 | #include <asm/fpu-internal.h> |
57 | #include <asm/mce.h> | 58 | #include <asm/mce.h> |
58 | #include <asm/context_tracking.h> | ||
59 | |||
60 | #include <asm/mach_traps.h> | 59 | #include <asm/mach_traps.h> |
61 | 60 | ||
62 | #ifdef CONFIG_X86_64 | 61 | #ifdef CONFIG_X86_64 |
63 | #include <asm/x86_init.h> | 62 | #include <asm/x86_init.h> |
64 | #include <asm/pgalloc.h> | 63 | #include <asm/pgalloc.h> |
65 | #include <asm/proto.h> | 64 | #include <asm/proto.h> |
66 | #else | 65 | #else |
67 | #include <asm/processor-flags.h> | 66 | #include <asm/processor-flags.h> |
68 | #include <asm/setup.h> | 67 | #include <asm/setup.h> |
69 | 68 | ||
70 | asmlinkage int system_call(void); | 69 | asmlinkage int system_call(void); |
71 | 70 | ||
72 | /* | 71 | /* |
73 | * The IDT has to be page-aligned to simplify the Pentium | 72 | * The IDT has to be page-aligned to simplify the Pentium |
74 | * F0 0F bug workaround. | 73 | * F0 0F bug workaround. |
75 | */ | 74 | */ |
76 | gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; | 75 | gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; |
77 | #endif | 76 | #endif |
78 | 77 | ||
79 | DECLARE_BITMAP(used_vectors, NR_VECTORS); | 78 | DECLARE_BITMAP(used_vectors, NR_VECTORS); |
80 | EXPORT_SYMBOL_GPL(used_vectors); | 79 | EXPORT_SYMBOL_GPL(used_vectors); |
81 | 80 | ||
82 | static inline void conditional_sti(struct pt_regs *regs) | 81 | static inline void conditional_sti(struct pt_regs *regs) |
83 | { | 82 | { |
84 | if (regs->flags & X86_EFLAGS_IF) | 83 | if (regs->flags & X86_EFLAGS_IF) |
85 | local_irq_enable(); | 84 | local_irq_enable(); |
86 | } | 85 | } |
87 | 86 | ||
88 | static inline void preempt_conditional_sti(struct pt_regs *regs) | 87 | static inline void preempt_conditional_sti(struct pt_regs *regs) |
89 | { | 88 | { |
90 | inc_preempt_count(); | 89 | inc_preempt_count(); |
91 | if (regs->flags & X86_EFLAGS_IF) | 90 | if (regs->flags & X86_EFLAGS_IF) |
92 | local_irq_enable(); | 91 | local_irq_enable(); |
93 | } | 92 | } |
94 | 93 | ||
95 | static inline void conditional_cli(struct pt_regs *regs) | 94 | static inline void conditional_cli(struct pt_regs *regs) |
96 | { | 95 | { |
97 | if (regs->flags & X86_EFLAGS_IF) | 96 | if (regs->flags & X86_EFLAGS_IF) |
98 | local_irq_disable(); | 97 | local_irq_disable(); |
99 | } | 98 | } |
100 | 99 | ||
101 | static inline void preempt_conditional_cli(struct pt_regs *regs) | 100 | static inline void preempt_conditional_cli(struct pt_regs *regs) |
102 | { | 101 | { |
103 | if (regs->flags & X86_EFLAGS_IF) | 102 | if (regs->flags & X86_EFLAGS_IF) |
104 | local_irq_disable(); | 103 | local_irq_disable(); |
105 | dec_preempt_count(); | 104 | dec_preempt_count(); |
106 | } | 105 | } |
107 | 106 | ||
108 | static int __kprobes | 107 | static int __kprobes |
109 | do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, | 108 | do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, |
110 | struct pt_regs *regs, long error_code) | 109 | struct pt_regs *regs, long error_code) |
111 | { | 110 | { |
112 | #ifdef CONFIG_X86_32 | 111 | #ifdef CONFIG_X86_32 |
113 | if (regs->flags & X86_VM_MASK) { | 112 | if (regs->flags & X86_VM_MASK) { |
114 | /* | 113 | /* |
115 | * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. | 114 | * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. |
116 | * On nmi (interrupt 2), do_trap should not be called. | 115 | * On nmi (interrupt 2), do_trap should not be called. |
117 | */ | 116 | */ |
118 | if (trapnr < X86_TRAP_UD) { | 117 | if (trapnr < X86_TRAP_UD) { |
119 | if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, | 118 | if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, |
120 | error_code, trapnr)) | 119 | error_code, trapnr)) |
121 | return 0; | 120 | return 0; |
122 | } | 121 | } |
123 | return -1; | 122 | return -1; |
124 | } | 123 | } |
125 | #endif | 124 | #endif |
126 | if (!user_mode(regs)) { | 125 | if (!user_mode(regs)) { |
127 | if (!fixup_exception(regs)) { | 126 | if (!fixup_exception(regs)) { |
128 | tsk->thread.error_code = error_code; | 127 | tsk->thread.error_code = error_code; |
129 | tsk->thread.trap_nr = trapnr; | 128 | tsk->thread.trap_nr = trapnr; |
130 | die(str, regs, error_code); | 129 | die(str, regs, error_code); |
131 | } | 130 | } |
132 | return 0; | 131 | return 0; |
133 | } | 132 | } |
134 | 133 | ||
135 | return -1; | 134 | return -1; |
136 | } | 135 | } |
137 | 136 | ||
138 | static void __kprobes | 137 | static void __kprobes |
139 | do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, | 138 | do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, |
140 | long error_code, siginfo_t *info) | 139 | long error_code, siginfo_t *info) |
141 | { | 140 | { |
142 | struct task_struct *tsk = current; | 141 | struct task_struct *tsk = current; |
143 | 142 | ||
144 | 143 | ||
145 | if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) | 144 | if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) |
146 | return; | 145 | return; |
147 | /* | 146 | /* |
148 | * We want error_code and trap_nr set for userspace faults and | 147 | * We want error_code and trap_nr set for userspace faults and |
149 | * kernelspace faults which result in die(), but not | 148 | * kernelspace faults which result in die(), but not |
150 | * kernelspace faults which are fixed up. die() gives the | 149 | * kernelspace faults which are fixed up. die() gives the |
151 | * process no chance to handle the signal and notice the | 150 | * process no chance to handle the signal and notice the |
152 | * kernel fault information, so that won't result in polluting | 151 | * kernel fault information, so that won't result in polluting |
153 | * the information about previously queued, but not yet | 152 | * the information about previously queued, but not yet |
154 | * delivered, faults. See also do_general_protection below. | 153 | * delivered, faults. See also do_general_protection below. |
155 | */ | 154 | */ |
156 | tsk->thread.error_code = error_code; | 155 | tsk->thread.error_code = error_code; |
157 | tsk->thread.trap_nr = trapnr; | 156 | tsk->thread.trap_nr = trapnr; |
158 | 157 | ||
159 | #ifdef CONFIG_X86_64 | 158 | #ifdef CONFIG_X86_64 |
160 | if (show_unhandled_signals && unhandled_signal(tsk, signr) && | 159 | if (show_unhandled_signals && unhandled_signal(tsk, signr) && |
161 | printk_ratelimit()) { | 160 | printk_ratelimit()) { |
162 | pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", | 161 | pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", |
163 | tsk->comm, tsk->pid, str, | 162 | tsk->comm, tsk->pid, str, |
164 | regs->ip, regs->sp, error_code); | 163 | regs->ip, regs->sp, error_code); |
165 | print_vma_addr(" in ", regs->ip); | 164 | print_vma_addr(" in ", regs->ip); |
166 | pr_cont("\n"); | 165 | pr_cont("\n"); |
167 | } | 166 | } |
168 | #endif | 167 | #endif |
169 | 168 | ||
170 | if (info) | 169 | if (info) |
171 | force_sig_info(signr, info, tsk); | 170 | force_sig_info(signr, info, tsk); |
172 | else | 171 | else |
173 | force_sig(signr, tsk); | 172 | force_sig(signr, tsk); |
174 | } | 173 | } |
175 | 174 | ||
176 | #define DO_ERROR(trapnr, signr, str, name) \ | 175 | #define DO_ERROR(trapnr, signr, str, name) \ |
177 | dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ | 176 | dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ |
178 | { \ | 177 | { \ |
179 | exception_enter(regs); \ | 178 | exception_enter(regs); \ |
180 | if (notify_die(DIE_TRAP, str, regs, error_code, \ | 179 | if (notify_die(DIE_TRAP, str, regs, error_code, \ |
181 | trapnr, signr) == NOTIFY_STOP) { \ | 180 | trapnr, signr) == NOTIFY_STOP) { \ |
182 | exception_exit(regs); \ | 181 | exception_exit(regs); \ |
183 | return; \ | 182 | return; \ |
184 | } \ | 183 | } \ |
185 | conditional_sti(regs); \ | 184 | conditional_sti(regs); \ |
186 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ | 185 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ |
187 | exception_exit(regs); \ | 186 | exception_exit(regs); \ |
188 | } | 187 | } |
189 | 188 | ||
190 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | 189 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ |
191 | dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ | 190 | dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ |
192 | { \ | 191 | { \ |
193 | siginfo_t info; \ | 192 | siginfo_t info; \ |
194 | info.si_signo = signr; \ | 193 | info.si_signo = signr; \ |
195 | info.si_errno = 0; \ | 194 | info.si_errno = 0; \ |
196 | info.si_code = sicode; \ | 195 | info.si_code = sicode; \ |
197 | info.si_addr = (void __user *)siaddr; \ | 196 | info.si_addr = (void __user *)siaddr; \ |
198 | exception_enter(regs); \ | 197 | exception_enter(regs); \ |
199 | if (notify_die(DIE_TRAP, str, regs, error_code, \ | 198 | if (notify_die(DIE_TRAP, str, regs, error_code, \ |
200 | trapnr, signr) == NOTIFY_STOP) { \ | 199 | trapnr, signr) == NOTIFY_STOP) { \ |
201 | exception_exit(regs); \ | 200 | exception_exit(regs); \ |
202 | return; \ | 201 | return; \ |
203 | } \ | 202 | } \ |
204 | conditional_sti(regs); \ | 203 | conditional_sti(regs); \ |
205 | do_trap(trapnr, signr, str, regs, error_code, &info); \ | 204 | do_trap(trapnr, signr, str, regs, error_code, &info); \ |
206 | exception_exit(regs); \ | 205 | exception_exit(regs); \ |
207 | } | 206 | } |
208 | 207 | ||
209 | DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, | 208 | DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, |
210 | regs->ip) | 209 | regs->ip) |
211 | DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) | 210 | DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) |
212 | DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) | 211 | DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) |
213 | DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, | 212 | DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, |
214 | regs->ip) | 213 | regs->ip) |
215 | DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", | 214 | DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", |
216 | coprocessor_segment_overrun) | 215 | coprocessor_segment_overrun) |
217 | DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) | 216 | DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) |
218 | DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) | 217 | DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) |
219 | #ifdef CONFIG_X86_32 | 218 | #ifdef CONFIG_X86_32 |
220 | DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) | 219 | DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) |
221 | #endif | 220 | #endif |
222 | DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, | 221 | DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, |
223 | BUS_ADRALN, 0) | 222 | BUS_ADRALN, 0) |
224 | 223 | ||
225 | #ifdef CONFIG_X86_64 | 224 | #ifdef CONFIG_X86_64 |
226 | /* Runs on IST stack */ | 225 | /* Runs on IST stack */ |
227 | dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) | 226 | dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) |
228 | { | 227 | { |
229 | exception_enter(regs); | 228 | exception_enter(regs); |
230 | if (notify_die(DIE_TRAP, "stack segment", regs, error_code, | 229 | if (notify_die(DIE_TRAP, "stack segment", regs, error_code, |
231 | X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { | 230 | X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { |
232 | preempt_conditional_sti(regs); | 231 | preempt_conditional_sti(regs); |
233 | do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); | 232 | do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); |
234 | preempt_conditional_cli(regs); | 233 | preempt_conditional_cli(regs); |
235 | } | 234 | } |
236 | exception_exit(regs); | 235 | exception_exit(regs); |
237 | } | 236 | } |
238 | 237 | ||
239 | dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) | 238 | dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) |
240 | { | 239 | { |
241 | static const char str[] = "double fault"; | 240 | static const char str[] = "double fault"; |
242 | struct task_struct *tsk = current; | 241 | struct task_struct *tsk = current; |
243 | 242 | ||
244 | exception_enter(regs); | 243 | exception_enter(regs); |
245 | /* Return not checked because double check cannot be ignored */ | 244 | /* Return not checked because double check cannot be ignored */ |
246 | notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); | 245 | notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); |
247 | 246 | ||
248 | tsk->thread.error_code = error_code; | 247 | tsk->thread.error_code = error_code; |
249 | tsk->thread.trap_nr = X86_TRAP_DF; | 248 | tsk->thread.trap_nr = X86_TRAP_DF; |
250 | 249 | ||
251 | /* | 250 | /* |
252 | * This is always a kernel trap and never fixable (and thus must | 251 | * This is always a kernel trap and never fixable (and thus must |
253 | * never return). | 252 | * never return). |
254 | */ | 253 | */ |
255 | for (;;) | 254 | for (;;) |
256 | die(str, regs, error_code); | 255 | die(str, regs, error_code); |
257 | } | 256 | } |
258 | #endif | 257 | #endif |
259 | 258 | ||
260 | dotraplinkage void __kprobes | 259 | dotraplinkage void __kprobes |
261 | do_general_protection(struct pt_regs *regs, long error_code) | 260 | do_general_protection(struct pt_regs *regs, long error_code) |
262 | { | 261 | { |
263 | struct task_struct *tsk; | 262 | struct task_struct *tsk; |
264 | 263 | ||
265 | exception_enter(regs); | 264 | exception_enter(regs); |
266 | conditional_sti(regs); | 265 | conditional_sti(regs); |
267 | 266 | ||
268 | #ifdef CONFIG_X86_32 | 267 | #ifdef CONFIG_X86_32 |
269 | if (regs->flags & X86_VM_MASK) { | 268 | if (regs->flags & X86_VM_MASK) { |
270 | local_irq_enable(); | 269 | local_irq_enable(); |
271 | handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); | 270 | handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); |
272 | goto exit; | 271 | goto exit; |
273 | } | 272 | } |
274 | #endif | 273 | #endif |
275 | 274 | ||
276 | tsk = current; | 275 | tsk = current; |
277 | if (!user_mode(regs)) { | 276 | if (!user_mode(regs)) { |
278 | if (fixup_exception(regs)) | 277 | if (fixup_exception(regs)) |
279 | goto exit; | 278 | goto exit; |
280 | 279 | ||
281 | tsk->thread.error_code = error_code; | 280 | tsk->thread.error_code = error_code; |
282 | tsk->thread.trap_nr = X86_TRAP_GP; | 281 | tsk->thread.trap_nr = X86_TRAP_GP; |
283 | if (notify_die(DIE_GPF, "general protection fault", regs, error_code, | 282 | if (notify_die(DIE_GPF, "general protection fault", regs, error_code, |
284 | X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) | 283 | X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) |
285 | die("general protection fault", regs, error_code); | 284 | die("general protection fault", regs, error_code); |
286 | goto exit; | 285 | goto exit; |
287 | } | 286 | } |
288 | 287 | ||
289 | tsk->thread.error_code = error_code; | 288 | tsk->thread.error_code = error_code; |
290 | tsk->thread.trap_nr = X86_TRAP_GP; | 289 | tsk->thread.trap_nr = X86_TRAP_GP; |
291 | 290 | ||
292 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | 291 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && |
293 | printk_ratelimit()) { | 292 | printk_ratelimit()) { |
294 | pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", | 293 | pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", |
295 | tsk->comm, task_pid_nr(tsk), | 294 | tsk->comm, task_pid_nr(tsk), |
296 | regs->ip, regs->sp, error_code); | 295 | regs->ip, regs->sp, error_code); |
297 | print_vma_addr(" in ", regs->ip); | 296 | print_vma_addr(" in ", regs->ip); |
298 | pr_cont("\n"); | 297 | pr_cont("\n"); |
299 | } | 298 | } |
300 | 299 | ||
301 | force_sig(SIGSEGV, tsk); | 300 | force_sig(SIGSEGV, tsk); |
302 | exit: | 301 | exit: |
303 | exception_exit(regs); | 302 | exception_exit(regs); |
304 | } | 303 | } |
305 | 304 | ||
306 | /* May run on IST stack. */ | 305 | /* May run on IST stack. */ |
307 | dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) | 306 | dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) |
308 | { | 307 | { |
309 | #ifdef CONFIG_DYNAMIC_FTRACE | 308 | #ifdef CONFIG_DYNAMIC_FTRACE |
310 | /* | 309 | /* |
311 | * ftrace must be first, everything else may cause a recursive crash. | 310 | * ftrace must be first, everything else may cause a recursive crash. |
312 | * See note by declaration of modifying_ftrace_code in ftrace.c | 311 | * See note by declaration of modifying_ftrace_code in ftrace.c |
313 | */ | 312 | */ |
314 | if (unlikely(atomic_read(&modifying_ftrace_code)) && | 313 | if (unlikely(atomic_read(&modifying_ftrace_code)) && |
315 | ftrace_int3_handler(regs)) | 314 | ftrace_int3_handler(regs)) |
316 | return; | 315 | return; |
317 | #endif | 316 | #endif |
318 | exception_enter(regs); | 317 | exception_enter(regs); |
319 | #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP | 318 | #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP |
320 | if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, | 319 | if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, |
321 | SIGTRAP) == NOTIFY_STOP) | 320 | SIGTRAP) == NOTIFY_STOP) |
322 | goto exit; | 321 | goto exit; |
323 | #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ | 322 | #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ |
324 | 323 | ||
325 | if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, | 324 | if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, |
326 | SIGTRAP) == NOTIFY_STOP) | 325 | SIGTRAP) == NOTIFY_STOP) |
327 | goto exit; | 326 | goto exit; |
328 | 327 | ||
329 | /* | 328 | /* |
330 | * Let others (NMI) know that the debug stack is in use | 329 | * Let others (NMI) know that the debug stack is in use |
331 | * as we may switch to the interrupt stack. | 330 | * as we may switch to the interrupt stack. |
332 | */ | 331 | */ |
333 | debug_stack_usage_inc(); | 332 | debug_stack_usage_inc(); |
334 | preempt_conditional_sti(regs); | 333 | preempt_conditional_sti(regs); |
335 | do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); | 334 | do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); |
336 | preempt_conditional_cli(regs); | 335 | preempt_conditional_cli(regs); |
337 | debug_stack_usage_dec(); | 336 | debug_stack_usage_dec(); |
338 | exit: | 337 | exit: |
339 | exception_exit(regs); | 338 | exception_exit(regs); |
340 | } | 339 | } |
341 | 340 | ||
342 | #ifdef CONFIG_X86_64 | 341 | #ifdef CONFIG_X86_64 |
343 | /* | 342 | /* |
344 | * Help handler running on IST stack to switch back to user stack | 343 | * Help handler running on IST stack to switch back to user stack |
345 | * for scheduling or signal handling. The actual stack switch is done in | 344 | * for scheduling or signal handling. The actual stack switch is done in |
346 | * entry.S | 345 | * entry.S |
347 | */ | 346 | */ |
348 | asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) | 347 | asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) |
349 | { | 348 | { |
350 | struct pt_regs *regs = eregs; | 349 | struct pt_regs *regs = eregs; |
351 | /* Did already sync */ | 350 | /* Did already sync */ |
352 | if (eregs == (struct pt_regs *)eregs->sp) | 351 | if (eregs == (struct pt_regs *)eregs->sp) |
353 | ; | 352 | ; |
354 | /* Exception from user space */ | 353 | /* Exception from user space */ |
355 | else if (user_mode(eregs)) | 354 | else if (user_mode(eregs)) |
356 | regs = task_pt_regs(current); | 355 | regs = task_pt_regs(current); |
357 | /* | 356 | /* |
358 | * Exception from kernel and interrupts are enabled. Move to | 357 | * Exception from kernel and interrupts are enabled. Move to |
359 | * kernel process stack. | 358 | * kernel process stack. |
360 | */ | 359 | */ |
361 | else if (eregs->flags & X86_EFLAGS_IF) | 360 | else if (eregs->flags & X86_EFLAGS_IF) |
362 | regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); | 361 | regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); |
363 | if (eregs != regs) | 362 | if (eregs != regs) |
364 | *regs = *eregs; | 363 | *regs = *eregs; |
365 | return regs; | 364 | return regs; |
366 | } | 365 | } |
367 | #endif | 366 | #endif |
368 | 367 | ||
369 | /* | 368 | /* |
370 | * Our handling of the processor debug registers is non-trivial. | 369 | * Our handling of the processor debug registers is non-trivial. |
371 | * We do not clear them on entry and exit from the kernel. Therefore | 370 | * We do not clear them on entry and exit from the kernel. Therefore |
372 | * it is possible to get a watchpoint trap here from inside the kernel. | 371 | * it is possible to get a watchpoint trap here from inside the kernel. |
373 | * However, the code in ./ptrace.c has ensured that the user can | 372 | * However, the code in ./ptrace.c has ensured that the user can |
374 | * only set watchpoints on userspace addresses. Therefore the in-kernel | 373 | * only set watchpoints on userspace addresses. Therefore the in-kernel |
375 | * watchpoint trap can only occur in code which is reading/writing | 374 | * watchpoint trap can only occur in code which is reading/writing |
376 | * from user space. Such code must not hold kernel locks (since it | 375 | * from user space. Such code must not hold kernel locks (since it |
377 | * can equally take a page fault), therefore it is safe to call | 376 | * can equally take a page fault), therefore it is safe to call |
378 | * force_sig_info even though that claims and releases locks. | 377 | * force_sig_info even though that claims and releases locks. |
379 | * | 378 | * |
380 | * Code in ./signal.c ensures that the debug control register | 379 | * Code in ./signal.c ensures that the debug control register |
381 | * is restored before we deliver any signal, and therefore that | 380 | * is restored before we deliver any signal, and therefore that |
382 | * user code runs with the correct debug control register even though | 381 | * user code runs with the correct debug control register even though |
383 | * we clear it here. | 382 | * we clear it here. |
384 | * | 383 | * |
385 | * Being careful here means that we don't have to be as careful in a | 384 | * Being careful here means that we don't have to be as careful in a |
386 | * lot of more complicated places (task switching can be a bit lazy | 385 | * lot of more complicated places (task switching can be a bit lazy |
387 | * about restoring all the debug state, and ptrace doesn't have to | 386 | * about restoring all the debug state, and ptrace doesn't have to |
388 | * find every occurrence of the TF bit that could be saved away even | 387 | * find every occurrence of the TF bit that could be saved away even |
389 | * by user code) | 388 | * by user code) |
390 | * | 389 | * |
391 | * May run on IST stack. | 390 | * May run on IST stack. |
392 | */ | 391 | */ |
393 | dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) | 392 | dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) |
394 | { | 393 | { |
395 | struct task_struct *tsk = current; | 394 | struct task_struct *tsk = current; |
396 | int user_icebp = 0; | 395 | int user_icebp = 0; |
397 | unsigned long dr6; | 396 | unsigned long dr6; |
398 | int si_code; | 397 | int si_code; |
399 | 398 | ||
400 | exception_enter(regs); | 399 | exception_enter(regs); |
401 | 400 | ||
402 | get_debugreg(dr6, 6); | 401 | get_debugreg(dr6, 6); |
403 | 402 | ||
404 | /* Filter out all the reserved bits which are preset to 1 */ | 403 | /* Filter out all the reserved bits which are preset to 1 */ |
405 | dr6 &= ~DR6_RESERVED; | 404 | dr6 &= ~DR6_RESERVED; |
406 | 405 | ||
407 | /* | 406 | /* |
408 | * If dr6 has no reason to give us about the origin of this trap, | 407 | * If dr6 has no reason to give us about the origin of this trap, |
409 | * then it's very likely the result of an icebp/int01 trap. | 408 | * then it's very likely the result of an icebp/int01 trap. |
410 | * User wants a sigtrap for that. | 409 | * User wants a sigtrap for that. |
411 | */ | 410 | */ |
412 | if (!dr6 && user_mode(regs)) | 411 | if (!dr6 && user_mode(regs)) |
413 | user_icebp = 1; | 412 | user_icebp = 1; |
414 | 413 | ||
415 | /* Catch kmemcheck conditions first of all! */ | 414 | /* Catch kmemcheck conditions first of all! */ |
416 | if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) | 415 | if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) |
417 | goto exit; | 416 | goto exit; |
418 | 417 | ||
419 | /* DR6 may or may not be cleared by the CPU */ | 418 | /* DR6 may or may not be cleared by the CPU */ |
420 | set_debugreg(0, 6); | 419 | set_debugreg(0, 6); |
421 | 420 | ||
422 | /* | 421 | /* |
423 | * The processor cleared BTF, so don't mark that we need it set. | 422 | * The processor cleared BTF, so don't mark that we need it set. |
424 | */ | 423 | */ |
425 | clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); | 424 | clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); |
426 | 425 | ||
427 | /* Store the virtualized DR6 value */ | 426 | /* Store the virtualized DR6 value */ |
428 | tsk->thread.debugreg6 = dr6; | 427 | tsk->thread.debugreg6 = dr6; |
429 | 428 | ||
430 | if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, | 429 | if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, |
431 | SIGTRAP) == NOTIFY_STOP) | 430 | SIGTRAP) == NOTIFY_STOP) |
432 | goto exit; | 431 | goto exit; |
433 | 432 | ||
434 | /* | 433 | /* |
435 | * Let others (NMI) know that the debug stack is in use | 434 | * Let others (NMI) know that the debug stack is in use |
436 | * as we may switch to the interrupt stack. | 435 | * as we may switch to the interrupt stack. |
437 | */ | 436 | */ |
438 | debug_stack_usage_inc(); | 437 | debug_stack_usage_inc(); |
439 | 438 | ||
440 | /* It's safe to allow irq's after DR6 has been saved */ | 439 | /* It's safe to allow irq's after DR6 has been saved */ |
441 | preempt_conditional_sti(regs); | 440 | preempt_conditional_sti(regs); |
442 | 441 | ||
443 | if (regs->flags & X86_VM_MASK) { | 442 | if (regs->flags & X86_VM_MASK) { |
444 | handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, | 443 | handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, |
445 | X86_TRAP_DB); | 444 | X86_TRAP_DB); |
446 | preempt_conditional_cli(regs); | 445 | preempt_conditional_cli(regs); |
447 | debug_stack_usage_dec(); | 446 | debug_stack_usage_dec(); |
448 | goto exit; | 447 | goto exit; |
449 | } | 448 | } |
450 | 449 | ||
451 | /* | 450 | /* |
452 | * Single-stepping through system calls: ignore any exceptions in | 451 | * Single-stepping through system calls: ignore any exceptions in |
453 | * kernel space, but re-enable TF when returning to user mode. | 452 | * kernel space, but re-enable TF when returning to user mode. |
454 | * | 453 | * |
455 | * We already checked v86 mode above, so we can check for kernel mode | 454 | * We already checked v86 mode above, so we can check for kernel mode |
456 | * by just checking the CPL of CS. | 455 | * by just checking the CPL of CS. |
457 | */ | 456 | */ |
458 | if ((dr6 & DR_STEP) && !user_mode(regs)) { | 457 | if ((dr6 & DR_STEP) && !user_mode(regs)) { |
459 | tsk->thread.debugreg6 &= ~DR_STEP; | 458 | tsk->thread.debugreg6 &= ~DR_STEP; |
460 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | 459 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); |
461 | regs->flags &= ~X86_EFLAGS_TF; | 460 | regs->flags &= ~X86_EFLAGS_TF; |
462 | } | 461 | } |
463 | si_code = get_si_code(tsk->thread.debugreg6); | 462 | si_code = get_si_code(tsk->thread.debugreg6); |
464 | if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) | 463 | if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) |
465 | send_sigtrap(tsk, regs, error_code, si_code); | 464 | send_sigtrap(tsk, regs, error_code, si_code); |
466 | preempt_conditional_cli(regs); | 465 | preempt_conditional_cli(regs); |
467 | debug_stack_usage_dec(); | 466 | debug_stack_usage_dec(); |
468 | 467 | ||
469 | exit: | 468 | exit: |
470 | exception_exit(regs); | 469 | exception_exit(regs); |
471 | } | 470 | } |
472 | 471 | ||
473 | /* | 472 | /* |
474 | * Note that we play around with the 'TS' bit in an attempt to get | 473 | * Note that we play around with the 'TS' bit in an attempt to get |
475 | * the correct behaviour even in the presence of the asynchronous | 474 | * the correct behaviour even in the presence of the asynchronous |
476 | * IRQ13 behaviour | 475 | * IRQ13 behaviour |
477 | */ | 476 | */ |
478 | void math_error(struct pt_regs *regs, int error_code, int trapnr) | 477 | void math_error(struct pt_regs *regs, int error_code, int trapnr) |
479 | { | 478 | { |
480 | struct task_struct *task = current; | 479 | struct task_struct *task = current; |
481 | siginfo_t info; | 480 | siginfo_t info; |
482 | unsigned short err; | 481 | unsigned short err; |
483 | char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : | 482 | char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : |
484 | "simd exception"; | 483 | "simd exception"; |
485 | 484 | ||
486 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) | 485 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) |
487 | return; | 486 | return; |
488 | conditional_sti(regs); | 487 | conditional_sti(regs); |
489 | 488 | ||
490 | if (!user_mode_vm(regs)) | 489 | if (!user_mode_vm(regs)) |
491 | { | 490 | { |
492 | if (!fixup_exception(regs)) { | 491 | if (!fixup_exception(regs)) { |
493 | task->thread.error_code = error_code; | 492 | task->thread.error_code = error_code; |
494 | task->thread.trap_nr = trapnr; | 493 | task->thread.trap_nr = trapnr; |
495 | die(str, regs, error_code); | 494 | die(str, regs, error_code); |
496 | } | 495 | } |
497 | return; | 496 | return; |
498 | } | 497 | } |
499 | 498 | ||
500 | /* | 499 | /* |
501 | * Save the info for the exception handler and clear the error. | 500 | * Save the info for the exception handler and clear the error. |
502 | */ | 501 | */ |
503 | save_init_fpu(task); | 502 | save_init_fpu(task); |
504 | task->thread.trap_nr = trapnr; | 503 | task->thread.trap_nr = trapnr; |
505 | task->thread.error_code = error_code; | 504 | task->thread.error_code = error_code; |
506 | info.si_signo = SIGFPE; | 505 | info.si_signo = SIGFPE; |
507 | info.si_errno = 0; | 506 | info.si_errno = 0; |
508 | info.si_addr = (void __user *)regs->ip; | 507 | info.si_addr = (void __user *)regs->ip; |
509 | if (trapnr == X86_TRAP_MF) { | 508 | if (trapnr == X86_TRAP_MF) { |
510 | unsigned short cwd, swd; | 509 | unsigned short cwd, swd; |
511 | /* | 510 | /* |
512 | * (~cwd & swd) will mask out exceptions that are not set to unmasked | 511 | * (~cwd & swd) will mask out exceptions that are not set to unmasked |
513 | * status. 0x3f is the exception bits in these regs, 0x200 is the | 512 | * status. 0x3f is the exception bits in these regs, 0x200 is the |
514 | * C1 reg you need in case of a stack fault, 0x040 is the stack | 513 | * C1 reg you need in case of a stack fault, 0x040 is the stack |
515 | * fault bit. We should only be taking one exception at a time, | 514 | * fault bit. We should only be taking one exception at a time, |
516 | * so if this combination doesn't produce any single exception, | 515 | * so if this combination doesn't produce any single exception, |
517 | * then we have a bad program that isn't synchronizing its FPU usage | 516 | * then we have a bad program that isn't synchronizing its FPU usage |
518 | * and it will suffer the consequences since we won't be able to | 517 | * and it will suffer the consequences since we won't be able to |
519 | * fully reproduce the context of the exception | 518 | * fully reproduce the context of the exception |
520 | */ | 519 | */ |
521 | cwd = get_fpu_cwd(task); | 520 | cwd = get_fpu_cwd(task); |
522 | swd = get_fpu_swd(task); | 521 | swd = get_fpu_swd(task); |
523 | 522 | ||
524 | err = swd & ~cwd; | 523 | err = swd & ~cwd; |
525 | } else { | 524 | } else { |
526 | /* | 525 | /* |
527 | * The SIMD FPU exceptions are handled a little differently, as there | 526 | * The SIMD FPU exceptions are handled a little differently, as there |
528 | * is only a single status/control register. Thus, to determine which | 527 | * is only a single status/control register. Thus, to determine which |
529 | * unmasked exception was caught we must mask the exception mask bits | 528 | * unmasked exception was caught we must mask the exception mask bits |
530 | * at 0x1f80, and then use these to mask the exception bits at 0x3f. | 529 | * at 0x1f80, and then use these to mask the exception bits at 0x3f. |
531 | */ | 530 | */ |
532 | unsigned short mxcsr = get_fpu_mxcsr(task); | 531 | unsigned short mxcsr = get_fpu_mxcsr(task); |
533 | err = ~(mxcsr >> 7) & mxcsr; | 532 | err = ~(mxcsr >> 7) & mxcsr; |
534 | } | 533 | } |
535 | 534 | ||
536 | if (err & 0x001) { /* Invalid op */ | 535 | if (err & 0x001) { /* Invalid op */ |
537 | /* | 536 | /* |
538 | * swd & 0x240 == 0x040: Stack Underflow | 537 | * swd & 0x240 == 0x040: Stack Underflow |
539 | * swd & 0x240 == 0x240: Stack Overflow | 538 | * swd & 0x240 == 0x240: Stack Overflow |
540 | * User must clear the SF bit (0x40) if set | 539 | * User must clear the SF bit (0x40) if set |
541 | */ | 540 | */ |
542 | info.si_code = FPE_FLTINV; | 541 | info.si_code = FPE_FLTINV; |
543 | } else if (err & 0x004) { /* Divide by Zero */ | 542 | } else if (err & 0x004) { /* Divide by Zero */ |
544 | info.si_code = FPE_FLTDIV; | 543 | info.si_code = FPE_FLTDIV; |
545 | } else if (err & 0x008) { /* Overflow */ | 544 | } else if (err & 0x008) { /* Overflow */ |
546 | info.si_code = FPE_FLTOVF; | 545 | info.si_code = FPE_FLTOVF; |
547 | } else if (err & 0x012) { /* Denormal, Underflow */ | 546 | } else if (err & 0x012) { /* Denormal, Underflow */ |
548 | info.si_code = FPE_FLTUND; | 547 | info.si_code = FPE_FLTUND; |
549 | } else if (err & 0x020) { /* Precision */ | 548 | } else if (err & 0x020) { /* Precision */ |
550 | info.si_code = FPE_FLTRES; | 549 | info.si_code = FPE_FLTRES; |
551 | } else { | 550 | } else { |
552 | /* | 551 | /* |
553 | * If we're using IRQ 13, or supposedly even some trap | 552 | * If we're using IRQ 13, or supposedly even some trap |
554 | * X86_TRAP_MF implementations, it's possible | 553 | * X86_TRAP_MF implementations, it's possible |
555 | * we get a spurious trap, which is not an error. | 554 | * we get a spurious trap, which is not an error. |
556 | */ | 555 | */ |
557 | return; | 556 | return; |
558 | } | 557 | } |
559 | force_sig_info(SIGFPE, &info, task); | 558 | force_sig_info(SIGFPE, &info, task); |
560 | } | 559 | } |
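The cwd/swd arithmetic in math_error() above is easier to follow with concrete numbers. The snippet below is a standalone userspace sketch written for this write-up, not code from the commit; the register values are invented, and it only mirrors the masking and the classification order described in the comments.

/*
 * Standalone illustration of the x87 error decoding in math_error().
 * The cwd/swd values are made-up samples, not real hardware state.
 */
#include <stdio.h>

int main(void)
{
        unsigned short cwd = 0x037b;    /* control word: only Zero-Divide unmasked */
        unsigned short swd = 0x0004;    /* status word: Zero-Divide pending */
        unsigned short err = swd & ~cwd;

        if (err & 0x001)
                printf("invalid op (check swd & 0x240 for stack fault)\n");
        else if (err & 0x004)
                printf("divide by zero\n");
        else if (err & 0x008)
                printf("overflow\n");
        else if (err & 0x012)
                printf("denormal/underflow\n");
        else if (err & 0x020)
                printf("precision\n");
        else
                printf("no unmasked exception -> treated as spurious\n");
        return 0;
}

With these sample values err comes out as 0x0004, so the divide-by-zero branch is taken, matching the FPE_FLTDIV case above.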
561 | 560 | ||
562 | dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) | 561 | dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) |
563 | { | 562 | { |
564 | exception_enter(regs); | 563 | exception_enter(regs); |
565 | math_error(regs, error_code, X86_TRAP_MF); | 564 | math_error(regs, error_code, X86_TRAP_MF); |
566 | exception_exit(regs); | 565 | exception_exit(regs); |
567 | } | 566 | } |
568 | 567 | ||
569 | dotraplinkage void | 568 | dotraplinkage void |
570 | do_simd_coprocessor_error(struct pt_regs *regs, long error_code) | 569 | do_simd_coprocessor_error(struct pt_regs *regs, long error_code) |
571 | { | 570 | { |
572 | exception_enter(regs); | 571 | exception_enter(regs); |
573 | math_error(regs, error_code, X86_TRAP_XF); | 572 | math_error(regs, error_code, X86_TRAP_XF); |
574 | exception_exit(regs); | 573 | exception_exit(regs); |
575 | } | 574 | } |
576 | 575 | ||
577 | dotraplinkage void | 576 | dotraplinkage void |
578 | do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) | 577 | do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) |
579 | { | 578 | { |
580 | conditional_sti(regs); | 579 | conditional_sti(regs); |
581 | #if 0 | 580 | #if 0 |
582 | /* No need to warn about this any longer. */ | 581 | /* No need to warn about this any longer. */ |
583 | pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); | 582 | pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); |
584 | #endif | 583 | #endif |
585 | } | 584 | } |
586 | 585 | ||
587 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) | 586 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) |
588 | { | 587 | { |
589 | } | 588 | } |
590 | 589 | ||
591 | asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) | 590 | asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) |
592 | { | 591 | { |
593 | } | 592 | } |
594 | 593 | ||
595 | /* | 594 | /* |
596 | * 'math_state_restore()' saves the current math information in the | 595 | * 'math_state_restore()' saves the current math information in the |
597 | * old math state array, and gets the new ones from the current task | 596 | * old math state array, and gets the new ones from the current task |
598 | * | 597 | * |
599 | * Careful.. There are problems with IBM-designed IRQ13 behaviour. | 598 | * Careful.. There are problems with IBM-designed IRQ13 behaviour. |
600 | * Don't touch unless you *really* know how it works. | 599 | * Don't touch unless you *really* know how it works. |
601 | * | 600 | * |
602 | * Must be called with kernel preemption disabled (eg with local | 601 | * Must be called with kernel preemption disabled (eg with local |
603 | * interrupts disabled, as in the case of do_device_not_available). | 602 | * interrupts disabled, as in the case of do_device_not_available). |
604 | */ | 603 | */ |
605 | void math_state_restore(void) | 604 | void math_state_restore(void) |
606 | { | 605 | { |
607 | struct task_struct *tsk = current; | 606 | struct task_struct *tsk = current; |
608 | 607 | ||
609 | if (!tsk_used_math(tsk)) { | 608 | if (!tsk_used_math(tsk)) { |
610 | local_irq_enable(); | 609 | local_irq_enable(); |
611 | /* | 610 | /* |
612 | * does a slab alloc which can sleep | 611 | * does a slab alloc which can sleep |
613 | */ | 612 | */ |
614 | if (init_fpu(tsk)) { | 613 | if (init_fpu(tsk)) { |
615 | /* | 614 | /* |
616 | * ran out of memory! | 615 | * ran out of memory! |
617 | */ | 616 | */ |
618 | do_group_exit(SIGKILL); | 617 | do_group_exit(SIGKILL); |
619 | return; | 618 | return; |
620 | } | 619 | } |
621 | local_irq_disable(); | 620 | local_irq_disable(); |
622 | } | 621 | } |
623 | 622 | ||
624 | __thread_fpu_begin(tsk); | 623 | __thread_fpu_begin(tsk); |
625 | 624 | ||
626 | /* | 625 | /* |
627 | * Paranoid restore. send a SIGSEGV if we fail to restore the state. | 626 | * Paranoid restore. send a SIGSEGV if we fail to restore the state. |
628 | */ | 627 | */ |
629 | if (unlikely(restore_fpu_checking(tsk))) { | 628 | if (unlikely(restore_fpu_checking(tsk))) { |
630 | drop_init_fpu(tsk); | 629 | drop_init_fpu(tsk); |
631 | force_sig(SIGSEGV, tsk); | 630 | force_sig(SIGSEGV, tsk); |
632 | return; | 631 | return; |
633 | } | 632 | } |
634 | 633 | ||
635 | tsk->fpu_counter++; | 634 | tsk->fpu_counter++; |
636 | } | 635 | } |
637 | EXPORT_SYMBOL_GPL(math_state_restore); | 636 | EXPORT_SYMBOL_GPL(math_state_restore); |
638 | 637 | ||
639 | dotraplinkage void __kprobes | 638 | dotraplinkage void __kprobes |
640 | do_device_not_available(struct pt_regs *regs, long error_code) | 639 | do_device_not_available(struct pt_regs *regs, long error_code) |
641 | { | 640 | { |
642 | exception_enter(regs); | 641 | exception_enter(regs); |
643 | BUG_ON(use_eager_fpu()); | 642 | BUG_ON(use_eager_fpu()); |
644 | 643 | ||
645 | #ifdef CONFIG_MATH_EMULATION | 644 | #ifdef CONFIG_MATH_EMULATION |
646 | if (read_cr0() & X86_CR0_EM) { | 645 | if (read_cr0() & X86_CR0_EM) { |
647 | struct math_emu_info info = { }; | 646 | struct math_emu_info info = { }; |
648 | 647 | ||
649 | conditional_sti(regs); | 648 | conditional_sti(regs); |
650 | 649 | ||
651 | info.regs = regs; | 650 | info.regs = regs; |
652 | math_emulate(&info); | 651 | math_emulate(&info); |
653 | exception_exit(regs); | 652 | exception_exit(regs); |
654 | return; | 653 | return; |
655 | } | 654 | } |
656 | #endif | 655 | #endif |
657 | math_state_restore(); /* interrupts still off */ | 656 | math_state_restore(); /* interrupts still off */ |
658 | #ifdef CONFIG_X86_32 | 657 | #ifdef CONFIG_X86_32 |
659 | conditional_sti(regs); | 658 | conditional_sti(regs); |
660 | #endif | 659 | #endif |
661 | exception_exit(regs); | 660 | exception_exit(regs); |
662 | } | 661 | } |
663 | 662 | ||
664 | #ifdef CONFIG_X86_32 | 663 | #ifdef CONFIG_X86_32 |
665 | dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) | 664 | dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) |
666 | { | 665 | { |
667 | siginfo_t info; | 666 | siginfo_t info; |
668 | 667 | ||
669 | exception_enter(regs); | 668 | exception_enter(regs); |
670 | local_irq_enable(); | 669 | local_irq_enable(); |
671 | 670 | ||
672 | info.si_signo = SIGILL; | 671 | info.si_signo = SIGILL; |
673 | info.si_errno = 0; | 672 | info.si_errno = 0; |
674 | info.si_code = ILL_BADSTK; | 673 | info.si_code = ILL_BADSTK; |
675 | info.si_addr = NULL; | 674 | info.si_addr = NULL; |
676 | if (notify_die(DIE_TRAP, "iret exception", regs, error_code, | 675 | if (notify_die(DIE_TRAP, "iret exception", regs, error_code, |
677 | X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { | 676 | X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { |
678 | do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, | 677 | do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, |
679 | &info); | 678 | &info); |
680 | } | 679 | } |
681 | exception_exit(regs); | 680 | exception_exit(regs); |
682 | } | 681 | } |
683 | #endif | 682 | #endif |
684 | 683 | ||
685 | /* Set of traps needed for early debugging. */ | 684 | /* Set of traps needed for early debugging. */ |
686 | void __init early_trap_init(void) | 685 | void __init early_trap_init(void) |
687 | { | 686 | { |
688 | set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); | 687 | set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); |
689 | /* int3 can be called from all */ | 688 | /* int3 can be called from all */ |
690 | set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); | 689 | set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); |
691 | #ifdef CONFIG_X86_32 | 690 | #ifdef CONFIG_X86_32 |
692 | set_intr_gate(X86_TRAP_PF, &page_fault); | 691 | set_intr_gate(X86_TRAP_PF, &page_fault); |
693 | #endif | 692 | #endif |
694 | load_idt(&idt_descr); | 693 | load_idt(&idt_descr); |
695 | } | 694 | } |
696 | 695 | ||
697 | void __init early_trap_pf_init(void) | 696 | void __init early_trap_pf_init(void) |
698 | { | 697 | { |
699 | #ifdef CONFIG_X86_64 | 698 | #ifdef CONFIG_X86_64 |
700 | set_intr_gate(X86_TRAP_PF, &page_fault); | 699 | set_intr_gate(X86_TRAP_PF, &page_fault); |
701 | #endif | 700 | #endif |
702 | } | 701 | } |
703 | 702 | ||
704 | void __init trap_init(void) | 703 | void __init trap_init(void) |
705 | { | 704 | { |
706 | int i; | 705 | int i; |
707 | 706 | ||
708 | #ifdef CONFIG_EISA | 707 | #ifdef CONFIG_EISA |
709 | void __iomem *p = early_ioremap(0x0FFFD9, 4); | 708 | void __iomem *p = early_ioremap(0x0FFFD9, 4); |
710 | 709 | ||
711 | if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) | 710 | if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) |
712 | EISA_bus = 1; | 711 | EISA_bus = 1; |
713 | early_iounmap(p, 4); | 712 | early_iounmap(p, 4); |
714 | #endif | 713 | #endif |
715 | 714 | ||
716 | set_intr_gate(X86_TRAP_DE, &divide_error); | 715 | set_intr_gate(X86_TRAP_DE, &divide_error); |
717 | set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); | 716 | set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); |
718 | /* int4 can be called from all */ | 717 | /* int4 can be called from all */ |
719 | set_system_intr_gate(X86_TRAP_OF, &overflow); | 718 | set_system_intr_gate(X86_TRAP_OF, &overflow); |
720 | set_intr_gate(X86_TRAP_BR, &bounds); | 719 | set_intr_gate(X86_TRAP_BR, &bounds); |
721 | set_intr_gate(X86_TRAP_UD, &invalid_op); | 720 | set_intr_gate(X86_TRAP_UD, &invalid_op); |
722 | set_intr_gate(X86_TRAP_NM, &device_not_available); | 721 | set_intr_gate(X86_TRAP_NM, &device_not_available); |
723 | #ifdef CONFIG_X86_32 | 722 | #ifdef CONFIG_X86_32 |
724 | set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS); | 723 | set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS); |
725 | #else | 724 | #else |
726 | set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); | 725 | set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); |
727 | #endif | 726 | #endif |
728 | set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun); | 727 | set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun); |
729 | set_intr_gate(X86_TRAP_TS, &invalid_TSS); | 728 | set_intr_gate(X86_TRAP_TS, &invalid_TSS); |
730 | set_intr_gate(X86_TRAP_NP, &segment_not_present); | 729 | set_intr_gate(X86_TRAP_NP, &segment_not_present); |
731 | set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); | 730 | set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); |
732 | set_intr_gate(X86_TRAP_GP, &general_protection); | 731 | set_intr_gate(X86_TRAP_GP, &general_protection); |
733 | set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug); | 732 | set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug); |
734 | set_intr_gate(X86_TRAP_MF, &coprocessor_error); | 733 | set_intr_gate(X86_TRAP_MF, &coprocessor_error); |
735 | set_intr_gate(X86_TRAP_AC, &alignment_check); | 734 | set_intr_gate(X86_TRAP_AC, &alignment_check); |
736 | #ifdef CONFIG_X86_MCE | 735 | #ifdef CONFIG_X86_MCE |
737 | set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK); | 736 | set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK); |
738 | #endif | 737 | #endif |
739 | set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error); | 738 | set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error); |
740 | 739 | ||
741 | /* Reserve all the builtin and the syscall vector: */ | 740 | /* Reserve all the builtin and the syscall vector: */ |
742 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) | 741 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) |
743 | set_bit(i, used_vectors); | 742 | set_bit(i, used_vectors); |
744 | 743 | ||
745 | #ifdef CONFIG_IA32_EMULATION | 744 | #ifdef CONFIG_IA32_EMULATION |
746 | set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); | 745 | set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); |
747 | set_bit(IA32_SYSCALL_VECTOR, used_vectors); | 746 | set_bit(IA32_SYSCALL_VECTOR, used_vectors); |
748 | #endif | 747 | #endif |
749 | 748 | ||
750 | #ifdef CONFIG_X86_32 | 749 | #ifdef CONFIG_X86_32 |
751 | set_system_trap_gate(SYSCALL_VECTOR, &system_call); | 750 | set_system_trap_gate(SYSCALL_VECTOR, &system_call); |
752 | set_bit(SYSCALL_VECTOR, used_vectors); | 751 | set_bit(SYSCALL_VECTOR, used_vectors); |
753 | #endif | 752 | #endif |
754 | 753 | ||
755 | /* | 754 | /* |
756 | * Should be a barrier for any external CPU state: | 755 | * Should be a barrier for any external CPU state: |
757 | */ | 756 | */ |
758 | cpu_init(); | 757 | cpu_init(); |
759 | 758 | ||
760 | x86_init.irqs.trap_init(); | 759 | x86_init.irqs.trap_init(); |
761 | 760 | ||
762 | #ifdef CONFIG_X86_64 | 761 | #ifdef CONFIG_X86_64 |
763 | memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); | 762 | memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); |
764 | set_nmi_gate(X86_TRAP_DB, &debug); | 763 | set_nmi_gate(X86_TRAP_DB, &debug); |
765 | set_nmi_gate(X86_TRAP_BP, &int3); | 764 | set_nmi_gate(X86_TRAP_BP, &int3); |
766 | #endif | 765 | #endif |
767 | } | 766 | } |
arch/x86/mm/fault.c
1 | /* | 1 | /* |
2 | * Copyright (C) 1995 Linus Torvalds | 2 | * Copyright (C) 1995 Linus Torvalds |
3 | * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. | 3 | * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. |
4 | * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar | 4 | * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar |
5 | */ | 5 | */ |
6 | #include <linux/magic.h> /* STACK_END_MAGIC */ | 6 | #include <linux/magic.h> /* STACK_END_MAGIC */ |
7 | #include <linux/sched.h> /* test_thread_flag(), ... */ | 7 | #include <linux/sched.h> /* test_thread_flag(), ... */ |
8 | #include <linux/kdebug.h> /* oops_begin/end, ... */ | 8 | #include <linux/kdebug.h> /* oops_begin/end, ... */ |
9 | #include <linux/module.h> /* search_exception_table */ | 9 | #include <linux/module.h> /* search_exception_table */ |
10 | #include <linux/bootmem.h> /* max_low_pfn */ | 10 | #include <linux/bootmem.h> /* max_low_pfn */ |
11 | #include <linux/kprobes.h> /* __kprobes, ... */ | 11 | #include <linux/kprobes.h> /* __kprobes, ... */ |
12 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ | 12 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ |
13 | #include <linux/perf_event.h> /* perf_sw_event */ | 13 | #include <linux/perf_event.h> /* perf_sw_event */ |
14 | #include <linux/hugetlb.h> /* hstate_index_to_shift */ | 14 | #include <linux/hugetlb.h> /* hstate_index_to_shift */ |
15 | #include <linux/prefetch.h> /* prefetchw */ | 15 | #include <linux/prefetch.h> /* prefetchw */ |
16 | #include <linux/context_tracking.h> /* exception_enter(), ... */ | ||
16 | 17 | ||
17 | #include <asm/traps.h> /* dotraplinkage, ... */ | 18 | #include <asm/traps.h> /* dotraplinkage, ... */ |
18 | #include <asm/pgalloc.h> /* pgd_*(), ... */ | 19 | #include <asm/pgalloc.h> /* pgd_*(), ... */ |
19 | #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ | 20 | #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ |
20 | #include <asm/fixmap.h> /* VSYSCALL_START */ | 21 | #include <asm/fixmap.h> /* VSYSCALL_START */ |
21 | #include <asm/context_tracking.h> /* exception_enter(), ... */ | ||
22 | 22 | ||
23 | /* | 23 | /* |
24 | * Page fault error code bits: | 24 | * Page fault error code bits: |
25 | * | 25 | * |
26 | * bit 0 == 0: no page found 1: protection fault | 26 | * bit 0 == 0: no page found 1: protection fault |
27 | * bit 1 == 0: read access 1: write access | 27 | * bit 1 == 0: read access 1: write access |
28 | * bit 2 == 0: kernel-mode access 1: user-mode access | 28 | * bit 2 == 0: kernel-mode access 1: user-mode access |
29 | * bit 3 == 1: use of reserved bit detected | 29 | * bit 3 == 1: use of reserved bit detected |
30 | * bit 4 == 1: fault was an instruction fetch | 30 | * bit 4 == 1: fault was an instruction fetch |
31 | */ | 31 | */ |
32 | enum x86_pf_error_code { | 32 | enum x86_pf_error_code { |
33 | 33 | ||
34 | PF_PROT = 1 << 0, | 34 | PF_PROT = 1 << 0, |
35 | PF_WRITE = 1 << 1, | 35 | PF_WRITE = 1 << 1, |
36 | PF_USER = 1 << 2, | 36 | PF_USER = 1 << 2, |
37 | PF_RSVD = 1 << 3, | 37 | PF_RSVD = 1 << 3, |
38 | PF_INSTR = 1 << 4, | 38 | PF_INSTR = 1 << 4, |
39 | }; | 39 | }; |
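For readers decoding raw fault codes by hand, here is a tiny standalone sketch (illustration only, not part of the commit) that prints the meaning of an error code using the same bit layout; the macros simply mirror the enum above, and the sample value is arbitrary.

/* Decode a page fault error code using the bit layout documented above. */
#include <stdio.h>

#define PF_PROT  (1 << 0)
#define PF_WRITE (1 << 1)
#define PF_USER  (1 << 2)
#define PF_RSVD  (1 << 3)
#define PF_INSTR (1 << 4)

int main(void)
{
        unsigned long error_code = PF_USER | PF_WRITE;  /* 0x6, chosen for illustration */

        printf("%s, %s access, %s mode%s%s\n",
               (error_code & PF_PROT)  ? "protection fault" : "page not present",
               (error_code & PF_WRITE) ? "write" : "read",
               (error_code & PF_USER)  ? "user" : "kernel",
               (error_code & PF_RSVD)  ? ", reserved bit set" : "",
               (error_code & PF_INSTR) ? ", instruction fetch" : "");
        return 0;
}

The sample 0x6 decodes as a user-mode write to a not-present page.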
40 | 40 | ||
41 | /* | 41 | /* |
42 | * Returns 0 if mmiotrace is disabled, or if the fault is not | 42 | * Returns 0 if mmiotrace is disabled, or if the fault is not |
43 | * handled by mmiotrace: | 43 | * handled by mmiotrace: |
44 | */ | 44 | */ |
45 | static inline int __kprobes | 45 | static inline int __kprobes |
46 | kmmio_fault(struct pt_regs *regs, unsigned long addr) | 46 | kmmio_fault(struct pt_regs *regs, unsigned long addr) |
47 | { | 47 | { |
48 | if (unlikely(is_kmmio_active())) | 48 | if (unlikely(is_kmmio_active())) |
49 | if (kmmio_handler(regs, addr) == 1) | 49 | if (kmmio_handler(regs, addr) == 1) |
50 | return -1; | 50 | return -1; |
51 | return 0; | 51 | return 0; |
52 | } | 52 | } |
53 | 53 | ||
54 | static inline int __kprobes notify_page_fault(struct pt_regs *regs) | 54 | static inline int __kprobes notify_page_fault(struct pt_regs *regs) |
55 | { | 55 | { |
56 | int ret = 0; | 56 | int ret = 0; |
57 | 57 | ||
58 | /* kprobe_running() needs smp_processor_id() */ | 58 | /* kprobe_running() needs smp_processor_id() */ |
59 | if (kprobes_built_in() && !user_mode_vm(regs)) { | 59 | if (kprobes_built_in() && !user_mode_vm(regs)) { |
60 | preempt_disable(); | 60 | preempt_disable(); |
61 | if (kprobe_running() && kprobe_fault_handler(regs, 14)) | 61 | if (kprobe_running() && kprobe_fault_handler(regs, 14)) |
62 | ret = 1; | 62 | ret = 1; |
63 | preempt_enable(); | 63 | preempt_enable(); |
64 | } | 64 | } |
65 | 65 | ||
66 | return ret; | 66 | return ret; |
67 | } | 67 | } |
68 | 68 | ||
69 | /* | 69 | /* |
70 | * Prefetch quirks: | 70 | * Prefetch quirks: |
71 | * | 71 | * |
72 | * 32-bit mode: | 72 | * 32-bit mode: |
73 | * | 73 | * |
74 | * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. | 74 | * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. |
75 | * Check that here and ignore it. | 75 | * Check that here and ignore it. |
76 | * | 76 | * |
77 | * 64-bit mode: | 77 | * 64-bit mode: |
78 | * | 78 | * |
79 | * Sometimes the CPU reports invalid exceptions on prefetch. | 79 | * Sometimes the CPU reports invalid exceptions on prefetch. |
80 | * Check that here and ignore it. | 80 | * Check that here and ignore it. |
81 | * | 81 | * |
82 | * Opcode checker based on code by Richard Brunner. | 82 | * Opcode checker based on code by Richard Brunner. |
83 | */ | 83 | */ |
84 | static inline int | 84 | static inline int |
85 | check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, | 85 | check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, |
86 | unsigned char opcode, int *prefetch) | 86 | unsigned char opcode, int *prefetch) |
87 | { | 87 | { |
88 | unsigned char instr_hi = opcode & 0xf0; | 88 | unsigned char instr_hi = opcode & 0xf0; |
89 | unsigned char instr_lo = opcode & 0x0f; | 89 | unsigned char instr_lo = opcode & 0x0f; |
90 | 90 | ||
91 | switch (instr_hi) { | 91 | switch (instr_hi) { |
92 | case 0x20: | 92 | case 0x20: |
93 | case 0x30: | 93 | case 0x30: |
94 | /* | 94 | /* |
95 | * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. | 95 | * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. |
96 | * In X86_64 long mode, the CPU will signal invalid | 96 | * In X86_64 long mode, the CPU will signal invalid |
97 | * opcode if some of these prefixes are present so | 97 | * opcode if some of these prefixes are present so |
98 | * X86_64 will never get here anyway | 98 | * X86_64 will never get here anyway |
99 | */ | 99 | */ |
100 | return ((instr_lo & 7) == 0x6); | 100 | return ((instr_lo & 7) == 0x6); |
101 | #ifdef CONFIG_X86_64 | 101 | #ifdef CONFIG_X86_64 |
102 | case 0x40: | 102 | case 0x40: |
103 | /* | 103 | /* |
104 | * In AMD64 long mode 0x40..0x4F are valid REX prefixes | 104 | * In AMD64 long mode 0x40..0x4F are valid REX prefixes |
105 | * Need to figure out under what instruction mode the | 105 | * Need to figure out under what instruction mode the |
106 | * instruction was issued. Could check the LDT for lm, | 106 | * instruction was issued. Could check the LDT for lm, |
107 | * but for now it's good enough to assume that long | 107 | * but for now it's good enough to assume that long |
108 | * mode only uses well known segments or kernel. | 108 | * mode only uses well known segments or kernel. |
109 | */ | 109 | */ |
110 | return (!user_mode(regs) || user_64bit_mode(regs)); | 110 | return (!user_mode(regs) || user_64bit_mode(regs)); |
111 | #endif | 111 | #endif |
112 | case 0x60: | 112 | case 0x60: |
113 | /* 0x64 thru 0x67 are valid prefixes in all modes. */ | 113 | /* 0x64 thru 0x67 are valid prefixes in all modes. */ |
114 | return (instr_lo & 0xC) == 0x4; | 114 | return (instr_lo & 0xC) == 0x4; |
115 | case 0xF0: | 115 | case 0xF0: |
116 | /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ | 116 | /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ |
117 | return !instr_lo || (instr_lo>>1) == 1; | 117 | return !instr_lo || (instr_lo>>1) == 1; |
118 | case 0x00: | 118 | case 0x00: |
119 | /* Prefetch instruction is 0x0F0D or 0x0F18 */ | 119 | /* Prefetch instruction is 0x0F0D or 0x0F18 */ |
120 | if (probe_kernel_address(instr, opcode)) | 120 | if (probe_kernel_address(instr, opcode)) |
121 | return 0; | 121 | return 0; |
122 | 122 | ||
123 | *prefetch = (instr_lo == 0xF) && | 123 | *prefetch = (instr_lo == 0xF) && |
124 | (opcode == 0x0D || opcode == 0x18); | 124 | (opcode == 0x0D || opcode == 0x18); |
125 | return 0; | 125 | return 0; |
126 | default: | 126 | default: |
127 | return 0; | 127 | return 0; |
128 | } | 128 | } |
129 | } | 129 | } |
130 | 130 | ||
131 | static int | 131 | static int |
132 | is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) | 132 | is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) |
133 | { | 133 | { |
134 | unsigned char *max_instr; | 134 | unsigned char *max_instr; |
135 | unsigned char *instr; | 135 | unsigned char *instr; |
136 | int prefetch = 0; | 136 | int prefetch = 0; |
137 | 137 | ||
138 | /* | 138 | /* |
139 | * If it was an exec (instruction fetch) fault on NX page, then | 139 | * If it was an exec (instruction fetch) fault on NX page, then |
140 | * do not ignore the fault: | 140 | * do not ignore the fault: |
141 | */ | 141 | */ |
142 | if (error_code & PF_INSTR) | 142 | if (error_code & PF_INSTR) |
143 | return 0; | 143 | return 0; |
144 | 144 | ||
145 | instr = (void *)convert_ip_to_linear(current, regs); | 145 | instr = (void *)convert_ip_to_linear(current, regs); |
146 | max_instr = instr + 15; | 146 | max_instr = instr + 15; |
147 | 147 | ||
148 | if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) | 148 | if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) |
149 | return 0; | 149 | return 0; |
150 | 150 | ||
151 | while (instr < max_instr) { | 151 | while (instr < max_instr) { |
152 | unsigned char opcode; | 152 | unsigned char opcode; |
153 | 153 | ||
154 | if (probe_kernel_address(instr, opcode)) | 154 | if (probe_kernel_address(instr, opcode)) |
155 | break; | 155 | break; |
156 | 156 | ||
157 | instr++; | 157 | instr++; |
158 | 158 | ||
159 | if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) | 159 | if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) |
160 | break; | 160 | break; |
161 | } | 161 | } |
162 | return prefetch; | 162 | return prefetch; |
163 | } | 163 | } |
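The prefix-skipping scan in is_prefetch()/check_prefetch_opcode() above can be hard to picture from the switch alone. The sketch below is a heavily simplified userspace rendition written for this write-up only: the helper name looks_like_prefetch() is made up, and it drops the pt_regs, probe_kernel_address() and 64-bit REX handling the kernel needs, just walking a byte buffer for the 0x0F 0x0D / 0x0F 0x18 prefetch opcodes behind optional prefixes.

/*
 * Simplified illustration of the prefetch-opcode scan. Real kernel code
 * must also validate the address, the CPU mode and REX prefixes.
 */
#include <stdio.h>

static int looks_like_prefetch(const unsigned char *instr, unsigned int len)
{
        const unsigned char *max_instr = instr + (len < 15 ? len : 15);

        while (instr < max_instr) {
                unsigned char opcode = *instr++;
                unsigned char instr_hi = opcode & 0xf0;
                unsigned char instr_lo = opcode & 0x0f;

                if (instr_hi == 0x20 || instr_hi == 0x30) {
                        /* 0x26, 0x2E, 0x36, 0x3E: segment override prefixes */
                        if ((instr_lo & 7) == 0x6)
                                continue;
                        return 0;
                } else if (instr_hi == 0x60) {
                        /* 0x64..0x67: segment/operand/address size prefixes */
                        if ((instr_lo & 0xC) == 0x4)
                                continue;
                        return 0;
                } else if (instr_hi == 0xF0) {
                        /* 0xF0, 0xF2, 0xF3: lock/rep prefixes */
                        if (!instr_lo || (instr_lo >> 1) == 1)
                                continue;
                        return 0;
                } else if (opcode == 0x0F && instr < max_instr) {
                        /* two-byte opcode: 0F 0D and 0F 18 are prefetches */
                        return *instr == 0x0D || *instr == 0x18;
                } else {
                        return 0;
                }
        }
        return 0;
}

int main(void)
{
        /* prefetchnta (%rax) behind a DS segment override: 3E 0F 18 00 */
        unsigned char bytes[] = { 0x3E, 0x0F, 0x18, 0x00 };

        printf("prefetch? %d\n", looks_like_prefetch(bytes, sizeof(bytes)));
        return 0;
}

With the sample bytes the 0x3E prefix is skipped, 0F 18 is recognized, and the program prints 1.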
164 | 164 | ||
165 | static void | 165 | static void |
166 | force_sig_info_fault(int si_signo, int si_code, unsigned long address, | 166 | force_sig_info_fault(int si_signo, int si_code, unsigned long address, |
167 | struct task_struct *tsk, int fault) | 167 | struct task_struct *tsk, int fault) |
168 | { | 168 | { |
169 | unsigned lsb = 0; | 169 | unsigned lsb = 0; |
170 | siginfo_t info; | 170 | siginfo_t info; |
171 | 171 | ||
172 | info.si_signo = si_signo; | 172 | info.si_signo = si_signo; |
173 | info.si_errno = 0; | 173 | info.si_errno = 0; |
174 | info.si_code = si_code; | 174 | info.si_code = si_code; |
175 | info.si_addr = (void __user *)address; | 175 | info.si_addr = (void __user *)address; |
176 | if (fault & VM_FAULT_HWPOISON_LARGE) | 176 | if (fault & VM_FAULT_HWPOISON_LARGE) |
177 | lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); | 177 | lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); |
178 | if (fault & VM_FAULT_HWPOISON) | 178 | if (fault & VM_FAULT_HWPOISON) |
179 | lsb = PAGE_SHIFT; | 179 | lsb = PAGE_SHIFT; |
180 | info.si_addr_lsb = lsb; | 180 | info.si_addr_lsb = lsb; |
181 | 181 | ||
182 | force_sig_info(si_signo, &info, tsk); | 182 | force_sig_info(si_signo, &info, tsk); |
183 | } | 183 | } |
184 | 184 | ||
185 | DEFINE_SPINLOCK(pgd_lock); | 185 | DEFINE_SPINLOCK(pgd_lock); |
186 | LIST_HEAD(pgd_list); | 186 | LIST_HEAD(pgd_list); |
187 | 187 | ||
188 | #ifdef CONFIG_X86_32 | 188 | #ifdef CONFIG_X86_32 |
189 | static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) | 189 | static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) |
190 | { | 190 | { |
191 | unsigned index = pgd_index(address); | 191 | unsigned index = pgd_index(address); |
192 | pgd_t *pgd_k; | 192 | pgd_t *pgd_k; |
193 | pud_t *pud, *pud_k; | 193 | pud_t *pud, *pud_k; |
194 | pmd_t *pmd, *pmd_k; | 194 | pmd_t *pmd, *pmd_k; |
195 | 195 | ||
196 | pgd += index; | 196 | pgd += index; |
197 | pgd_k = init_mm.pgd + index; | 197 | pgd_k = init_mm.pgd + index; |
198 | 198 | ||
199 | if (!pgd_present(*pgd_k)) | 199 | if (!pgd_present(*pgd_k)) |
200 | return NULL; | 200 | return NULL; |
201 | 201 | ||
202 | /* | 202 | /* |
203 | * set_pgd(pgd, *pgd_k); here would be useless on PAE | 203 | * set_pgd(pgd, *pgd_k); here would be useless on PAE |
204 | * and redundant with the set_pmd() on non-PAE. As would | 204 | * and redundant with the set_pmd() on non-PAE. As would |
205 | * set_pud. | 205 | * set_pud. |
206 | */ | 206 | */ |
207 | pud = pud_offset(pgd, address); | 207 | pud = pud_offset(pgd, address); |
208 | pud_k = pud_offset(pgd_k, address); | 208 | pud_k = pud_offset(pgd_k, address); |
209 | if (!pud_present(*pud_k)) | 209 | if (!pud_present(*pud_k)) |
210 | return NULL; | 210 | return NULL; |
211 | 211 | ||
212 | pmd = pmd_offset(pud, address); | 212 | pmd = pmd_offset(pud, address); |
213 | pmd_k = pmd_offset(pud_k, address); | 213 | pmd_k = pmd_offset(pud_k, address); |
214 | if (!pmd_present(*pmd_k)) | 214 | if (!pmd_present(*pmd_k)) |
215 | return NULL; | 215 | return NULL; |
216 | 216 | ||
217 | if (!pmd_present(*pmd)) | 217 | if (!pmd_present(*pmd)) |
218 | set_pmd(pmd, *pmd_k); | 218 | set_pmd(pmd, *pmd_k); |
219 | else | 219 | else |
220 | BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); | 220 | BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); |
221 | 221 | ||
222 | return pmd_k; | 222 | return pmd_k; |
223 | } | 223 | } |
224 | 224 | ||
225 | void vmalloc_sync_all(void) | 225 | void vmalloc_sync_all(void) |
226 | { | 226 | { |
227 | unsigned long address; | 227 | unsigned long address; |
228 | 228 | ||
229 | if (SHARED_KERNEL_PMD) | 229 | if (SHARED_KERNEL_PMD) |
230 | return; | 230 | return; |
231 | 231 | ||
232 | for (address = VMALLOC_START & PMD_MASK; | 232 | for (address = VMALLOC_START & PMD_MASK; |
233 | address >= TASK_SIZE && address < FIXADDR_TOP; | 233 | address >= TASK_SIZE && address < FIXADDR_TOP; |
234 | address += PMD_SIZE) { | 234 | address += PMD_SIZE) { |
235 | struct page *page; | 235 | struct page *page; |
236 | 236 | ||
237 | spin_lock(&pgd_lock); | 237 | spin_lock(&pgd_lock); |
238 | list_for_each_entry(page, &pgd_list, lru) { | 238 | list_for_each_entry(page, &pgd_list, lru) { |
239 | spinlock_t *pgt_lock; | 239 | spinlock_t *pgt_lock; |
240 | pmd_t *ret; | 240 | pmd_t *ret; |
241 | 241 | ||
242 | /* the pgt_lock only for Xen */ | 242 | /* the pgt_lock only for Xen */ |
243 | pgt_lock = &pgd_page_get_mm(page)->page_table_lock; | 243 | pgt_lock = &pgd_page_get_mm(page)->page_table_lock; |
244 | 244 | ||
245 | spin_lock(pgt_lock); | 245 | spin_lock(pgt_lock); |
246 | ret = vmalloc_sync_one(page_address(page), address); | 246 | ret = vmalloc_sync_one(page_address(page), address); |
247 | spin_unlock(pgt_lock); | 247 | spin_unlock(pgt_lock); |
248 | 248 | ||
249 | if (!ret) | 249 | if (!ret) |
250 | break; | 250 | break; |
251 | } | 251 | } |
252 | spin_unlock(&pgd_lock); | 252 | spin_unlock(&pgd_lock); |
253 | } | 253 | } |
254 | } | 254 | } |
255 | 255 | ||
256 | /* | 256 | /* |
257 | * 32-bit: | 257 | * 32-bit: |
258 | * | 258 | * |
259 | * Handle a fault on the vmalloc or module mapping area | 259 | * Handle a fault on the vmalloc or module mapping area |
260 | */ | 260 | */ |
261 | static noinline __kprobes int vmalloc_fault(unsigned long address) | 261 | static noinline __kprobes int vmalloc_fault(unsigned long address) |
262 | { | 262 | { |
263 | unsigned long pgd_paddr; | 263 | unsigned long pgd_paddr; |
264 | pmd_t *pmd_k; | 264 | pmd_t *pmd_k; |
265 | pte_t *pte_k; | 265 | pte_t *pte_k; |
266 | 266 | ||
267 | /* Make sure we are in vmalloc area: */ | 267 | /* Make sure we are in vmalloc area: */ |
268 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) | 268 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) |
269 | return -1; | 269 | return -1; |
270 | 270 | ||
271 | WARN_ON_ONCE(in_nmi()); | 271 | WARN_ON_ONCE(in_nmi()); |
272 | 272 | ||
273 | /* | 273 | /* |
274 | * Synchronize this task's top level page-table | 274 | * Synchronize this task's top level page-table |
275 | * with the 'reference' page table. | 275 | * with the 'reference' page table. |
276 | * | 276 | * |
277 | * Do _not_ use "current" here. We might be inside | 277 | * Do _not_ use "current" here. We might be inside |
278 | * an interrupt in the middle of a task switch.. | 278 | * an interrupt in the middle of a task switch.. |
279 | */ | 279 | */ |
280 | pgd_paddr = read_cr3(); | 280 | pgd_paddr = read_cr3(); |
281 | pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); | 281 | pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); |
282 | if (!pmd_k) | 282 | if (!pmd_k) |
283 | return -1; | 283 | return -1; |
284 | 284 | ||
285 | pte_k = pte_offset_kernel(pmd_k, address); | 285 | pte_k = pte_offset_kernel(pmd_k, address); |
286 | if (!pte_present(*pte_k)) | 286 | if (!pte_present(*pte_k)) |
287 | return -1; | 287 | return -1; |
288 | 288 | ||
289 | return 0; | 289 | return 0; |
290 | } | 290 | } |
291 | 291 | ||
292 | /* | 292 | /* |
293 | * Did it hit the DOS screen memory VA from vm86 mode? | 293 | * Did it hit the DOS screen memory VA from vm86 mode? |
294 | */ | 294 | */ |
295 | static inline void | 295 | static inline void |
296 | check_v8086_mode(struct pt_regs *regs, unsigned long address, | 296 | check_v8086_mode(struct pt_regs *regs, unsigned long address, |
297 | struct task_struct *tsk) | 297 | struct task_struct *tsk) |
298 | { | 298 | { |
299 | unsigned long bit; | 299 | unsigned long bit; |
300 | 300 | ||
301 | if (!v8086_mode(regs)) | 301 | if (!v8086_mode(regs)) |
302 | return; | 302 | return; |
303 | 303 | ||
304 | bit = (address - 0xA0000) >> PAGE_SHIFT; | 304 | bit = (address - 0xA0000) >> PAGE_SHIFT; |
305 | if (bit < 32) | 305 | if (bit < 32) |
306 | tsk->thread.screen_bitmap |= 1 << bit; | 306 | tsk->thread.screen_bitmap |= 1 << bit; |
307 | } | 307 | } |
308 | 308 | ||
309 | static bool low_pfn(unsigned long pfn) | 309 | static bool low_pfn(unsigned long pfn) |
310 | { | 310 | { |
311 | return pfn < max_low_pfn; | 311 | return pfn < max_low_pfn; |
312 | } | 312 | } |
313 | 313 | ||
314 | static void dump_pagetable(unsigned long address) | 314 | static void dump_pagetable(unsigned long address) |
315 | { | 315 | { |
316 | pgd_t *base = __va(read_cr3()); | 316 | pgd_t *base = __va(read_cr3()); |
317 | pgd_t *pgd = &base[pgd_index(address)]; | 317 | pgd_t *pgd = &base[pgd_index(address)]; |
318 | pmd_t *pmd; | 318 | pmd_t *pmd; |
319 | pte_t *pte; | 319 | pte_t *pte; |
320 | 320 | ||
321 | #ifdef CONFIG_X86_PAE | 321 | #ifdef CONFIG_X86_PAE |
322 | printk("*pdpt = %016Lx ", pgd_val(*pgd)); | 322 | printk("*pdpt = %016Lx ", pgd_val(*pgd)); |
323 | if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) | 323 | if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) |
324 | goto out; | 324 | goto out; |
325 | #endif | 325 | #endif |
326 | pmd = pmd_offset(pud_offset(pgd, address), address); | 326 | pmd = pmd_offset(pud_offset(pgd, address), address); |
327 | printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); | 327 | printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); |
328 | 328 | ||
329 | /* | 329 | /* |
330 | * We must not directly access the pte in the highpte | 330 | * We must not directly access the pte in the highpte |
331 | * case if the page table is located in highmem. | 331 | * case if the page table is located in highmem. |
332 | * And let's rather not kmap-atomic the pte, just in case | 332 | * And let's rather not kmap-atomic the pte, just in case |
333 | * it's allocated already: | 333 | * it's allocated already: |
334 | */ | 334 | */ |
335 | if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) | 335 | if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) |
336 | goto out; | 336 | goto out; |
337 | 337 | ||
338 | pte = pte_offset_kernel(pmd, address); | 338 | pte = pte_offset_kernel(pmd, address); |
339 | printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); | 339 | printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); |
340 | out: | 340 | out: |
341 | printk("\n"); | 341 | printk("\n"); |
342 | } | 342 | } |
343 | 343 | ||
344 | #else /* CONFIG_X86_64: */ | 344 | #else /* CONFIG_X86_64: */ |
345 | 345 | ||
346 | void vmalloc_sync_all(void) | 346 | void vmalloc_sync_all(void) |
347 | { | 347 | { |
348 | sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); | 348 | sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); |
349 | } | 349 | } |
350 | 350 | ||
351 | /* | 351 | /* |
352 | * 64-bit: | 352 | * 64-bit: |
353 | * | 353 | * |
354 | * Handle a fault on the vmalloc area | 354 | * Handle a fault on the vmalloc area |
355 | * | 355 | * |
356 | * This assumes no large pages in there. | 356 | * This assumes no large pages in there. |
357 | */ | 357 | */ |
358 | static noinline __kprobes int vmalloc_fault(unsigned long address) | 358 | static noinline __kprobes int vmalloc_fault(unsigned long address) |
359 | { | 359 | { |
360 | pgd_t *pgd, *pgd_ref; | 360 | pgd_t *pgd, *pgd_ref; |
361 | pud_t *pud, *pud_ref; | 361 | pud_t *pud, *pud_ref; |
362 | pmd_t *pmd, *pmd_ref; | 362 | pmd_t *pmd, *pmd_ref; |
363 | pte_t *pte, *pte_ref; | 363 | pte_t *pte, *pte_ref; |
364 | 364 | ||
365 | /* Make sure we are in vmalloc area: */ | 365 | /* Make sure we are in vmalloc area: */ |
366 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) | 366 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) |
367 | return -1; | 367 | return -1; |
368 | 368 | ||
369 | WARN_ON_ONCE(in_nmi()); | 369 | WARN_ON_ONCE(in_nmi()); |
370 | 370 | ||
371 | /* | 371 | /* |
372 | * Copy kernel mappings over when needed. This can also | 372 | * Copy kernel mappings over when needed. This can also |
373 | * happen within a race in page table update. In the latter | 373 | * happen within a race in page table update. In the latter |
374 | * case just flush: | 374 | * case just flush: |
375 | */ | 375 | */ |
376 | pgd = pgd_offset(current->active_mm, address); | 376 | pgd = pgd_offset(current->active_mm, address); |
377 | pgd_ref = pgd_offset_k(address); | 377 | pgd_ref = pgd_offset_k(address); |
378 | if (pgd_none(*pgd_ref)) | 378 | if (pgd_none(*pgd_ref)) |
379 | return -1; | 379 | return -1; |
380 | 380 | ||
381 | if (pgd_none(*pgd)) | 381 | if (pgd_none(*pgd)) |
382 | set_pgd(pgd, *pgd_ref); | 382 | set_pgd(pgd, *pgd_ref); |
383 | else | 383 | else |
384 | BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | 384 | BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); |
385 | 385 | ||
386 | /* | 386 | /* |
387 | * Below here mismatches are bugs because these lower tables | 387 | * Below here mismatches are bugs because these lower tables |
388 | * are shared: | 388 | * are shared: |
389 | */ | 389 | */ |
390 | 390 | ||
391 | pud = pud_offset(pgd, address); | 391 | pud = pud_offset(pgd, address); |
392 | pud_ref = pud_offset(pgd_ref, address); | 392 | pud_ref = pud_offset(pgd_ref, address); |
393 | if (pud_none(*pud_ref)) | 393 | if (pud_none(*pud_ref)) |
394 | return -1; | 394 | return -1; |
395 | 395 | ||
396 | if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) | 396 | if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) |
397 | BUG(); | 397 | BUG(); |
398 | 398 | ||
399 | pmd = pmd_offset(pud, address); | 399 | pmd = pmd_offset(pud, address); |
400 | pmd_ref = pmd_offset(pud_ref, address); | 400 | pmd_ref = pmd_offset(pud_ref, address); |
401 | if (pmd_none(*pmd_ref)) | 401 | if (pmd_none(*pmd_ref)) |
402 | return -1; | 402 | return -1; |
403 | 403 | ||
404 | if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) | 404 | if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) |
405 | BUG(); | 405 | BUG(); |
406 | 406 | ||
407 | pte_ref = pte_offset_kernel(pmd_ref, address); | 407 | pte_ref = pte_offset_kernel(pmd_ref, address); |
408 | if (!pte_present(*pte_ref)) | 408 | if (!pte_present(*pte_ref)) |
409 | return -1; | 409 | return -1; |
410 | 410 | ||
411 | pte = pte_offset_kernel(pmd, address); | 411 | pte = pte_offset_kernel(pmd, address); |
412 | 412 | ||
413 | /* | 413 | /* |
414 | * Don't use pte_page here, because the mappings can point | 414 | * Don't use pte_page here, because the mappings can point |
415 | * outside mem_map, and the NUMA hash lookup cannot handle | 415 | * outside mem_map, and the NUMA hash lookup cannot handle |
416 | * that: | 416 | * that: |
417 | */ | 417 | */ |
418 | if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) | 418 | if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) |
419 | BUG(); | 419 | BUG(); |
420 | 420 | ||
421 | return 0; | 421 | return 0; |
422 | } | 422 | } |
423 | 423 | ||
424 | #ifdef CONFIG_CPU_SUP_AMD | 424 | #ifdef CONFIG_CPU_SUP_AMD |
425 | static const char errata93_warning[] = | 425 | static const char errata93_warning[] = |
426 | KERN_ERR | 426 | KERN_ERR |
427 | "******* Your BIOS seems to not contain a fix for K8 errata #93\n" | 427 | "******* Your BIOS seems to not contain a fix for K8 errata #93\n" |
428 | "******* Working around it, but it may cause SEGVs or burn power.\n" | 428 | "******* Working around it, but it may cause SEGVs or burn power.\n" |
429 | "******* Please consider a BIOS update.\n" | 429 | "******* Please consider a BIOS update.\n" |
430 | "******* Disabling USB legacy in the BIOS may also help.\n"; | 430 | "******* Disabling USB legacy in the BIOS may also help.\n"; |
431 | #endif | 431 | #endif |
432 | 432 | ||
433 | /* | 433 | /* |
434 | * No vm86 mode in 64-bit mode: | 434 | * No vm86 mode in 64-bit mode: |
435 | */ | 435 | */ |
436 | static inline void | 436 | static inline void |
437 | check_v8086_mode(struct pt_regs *regs, unsigned long address, | 437 | check_v8086_mode(struct pt_regs *regs, unsigned long address, |
438 | struct task_struct *tsk) | 438 | struct task_struct *tsk) |
439 | { | 439 | { |
440 | } | 440 | } |
441 | 441 | ||
442 | static int bad_address(void *p) | 442 | static int bad_address(void *p) |
443 | { | 443 | { |
444 | unsigned long dummy; | 444 | unsigned long dummy; |
445 | 445 | ||
446 | return probe_kernel_address((unsigned long *)p, dummy); | 446 | return probe_kernel_address((unsigned long *)p, dummy); |
447 | } | 447 | } |
448 | 448 | ||
449 | static void dump_pagetable(unsigned long address) | 449 | static void dump_pagetable(unsigned long address) |
450 | { | 450 | { |
451 | pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); | 451 | pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); |
452 | pgd_t *pgd = base + pgd_index(address); | 452 | pgd_t *pgd = base + pgd_index(address); |
453 | pud_t *pud; | 453 | pud_t *pud; |
454 | pmd_t *pmd; | 454 | pmd_t *pmd; |
455 | pte_t *pte; | 455 | pte_t *pte; |
456 | 456 | ||
457 | if (bad_address(pgd)) | 457 | if (bad_address(pgd)) |
458 | goto bad; | 458 | goto bad; |
459 | 459 | ||
460 | printk("PGD %lx ", pgd_val(*pgd)); | 460 | printk("PGD %lx ", pgd_val(*pgd)); |
461 | 461 | ||
462 | if (!pgd_present(*pgd)) | 462 | if (!pgd_present(*pgd)) |
463 | goto out; | 463 | goto out; |
464 | 464 | ||
465 | pud = pud_offset(pgd, address); | 465 | pud = pud_offset(pgd, address); |
466 | if (bad_address(pud)) | 466 | if (bad_address(pud)) |
467 | goto bad; | 467 | goto bad; |
468 | 468 | ||
469 | printk("PUD %lx ", pud_val(*pud)); | 469 | printk("PUD %lx ", pud_val(*pud)); |
470 | if (!pud_present(*pud) || pud_large(*pud)) | 470 | if (!pud_present(*pud) || pud_large(*pud)) |
471 | goto out; | 471 | goto out; |
472 | 472 | ||
473 | pmd = pmd_offset(pud, address); | 473 | pmd = pmd_offset(pud, address); |
474 | if (bad_address(pmd)) | 474 | if (bad_address(pmd)) |
475 | goto bad; | 475 | goto bad; |
476 | 476 | ||
477 | printk("PMD %lx ", pmd_val(*pmd)); | 477 | printk("PMD %lx ", pmd_val(*pmd)); |
478 | if (!pmd_present(*pmd) || pmd_large(*pmd)) | 478 | if (!pmd_present(*pmd) || pmd_large(*pmd)) |
479 | goto out; | 479 | goto out; |
480 | 480 | ||
481 | pte = pte_offset_kernel(pmd, address); | 481 | pte = pte_offset_kernel(pmd, address); |
482 | if (bad_address(pte)) | 482 | if (bad_address(pte)) |
483 | goto bad; | 483 | goto bad; |
484 | 484 | ||
485 | printk("PTE %lx", pte_val(*pte)); | 485 | printk("PTE %lx", pte_val(*pte)); |
486 | out: | 486 | out: |
487 | printk("\n"); | 487 | printk("\n"); |
488 | return; | 488 | return; |
489 | bad: | 489 | bad: |
490 | printk("BAD\n"); | 490 | printk("BAD\n"); |
491 | } | 491 | } |
492 | 492 | ||
493 | #endif /* CONFIG_X86_64 */ | 493 | #endif /* CONFIG_X86_64 */ |
494 | 494 | ||
495 | /* | 495 | /* |
496 | * Workaround for K8 erratum #93 & buggy BIOS. | 496 | * Workaround for K8 erratum #93 & buggy BIOS. |
497 | * | 497 | * |
498 | * BIOS SMM functions are required to use a specific workaround | 498 | * BIOS SMM functions are required to use a specific workaround |
499 | * to avoid corruption of the 64bit RIP register on C stepping K8. | 499 | * to avoid corruption of the 64bit RIP register on C stepping K8. |
500 | * | 500 | * |
501 | * A lot of BIOS that didn't get tested properly miss this. | 501 | * A lot of BIOS that didn't get tested properly miss this. |
502 | * | 502 | * |
503 | * The OS sees this as a page fault with the upper 32bits of RIP cleared. | 503 | * The OS sees this as a page fault with the upper 32bits of RIP cleared. |
504 | * Try to work around it here. | 504 | * Try to work around it here. |
505 | * | 505 | * |
506 | * Note we only handle faults in kernel here. | 506 | * Note we only handle faults in kernel here. |
507 | * Does nothing on 32-bit. | 507 | * Does nothing on 32-bit. |
508 | */ | 508 | */ |
509 | static int is_errata93(struct pt_regs *regs, unsigned long address) | 509 | static int is_errata93(struct pt_regs *regs, unsigned long address) |
510 | { | 510 | { |
511 | #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) | 511 | #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) |
512 | if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD | 512 | if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD |
513 | || boot_cpu_data.x86 != 0xf) | 513 | || boot_cpu_data.x86 != 0xf) |
514 | return 0; | 514 | return 0; |
515 | 515 | ||
516 | if (address != regs->ip) | 516 | if (address != regs->ip) |
517 | return 0; | 517 | return 0; |
518 | 518 | ||
519 | if ((address >> 32) != 0) | 519 | if ((address >> 32) != 0) |
520 | return 0; | 520 | return 0; |
521 | 521 | ||
522 | address |= 0xffffffffUL << 32; | 522 | address |= 0xffffffffUL << 32; |
523 | if ((address >= (u64)_stext && address <= (u64)_etext) || | 523 | if ((address >= (u64)_stext && address <= (u64)_etext) || |
524 | (address >= MODULES_VADDR && address <= MODULES_END)) { | 524 | (address >= MODULES_VADDR && address <= MODULES_END)) { |
525 | printk_once(errata93_warning); | 525 | printk_once(errata93_warning); |
526 | regs->ip = address; | 526 | regs->ip = address; |
527 | return 1; | 527 | return 1; |
528 | } | 528 | } |
529 | #endif | 529 | #endif |
530 | return 0; | 530 | return 0; |
531 | } | 531 | } |
532 | 532 | ||
533 | /* | 533 | /* |
534 | * Work around K8 erratum #100 K8 in compat mode occasionally jumps | 534 | * Work around K8 erratum #100 K8 in compat mode occasionally jumps |
535 | * to illegal addresses >4GB. | 535 | * to illegal addresses >4GB. |
536 | * | 536 | * |
537 | * We catch this in the page fault handler because these addresses | 537 | * We catch this in the page fault handler because these addresses |
538 | * are not reachable. Just detect this case and return. Any code | 538 | * are not reachable. Just detect this case and return. Any code |
539 | * segment in LDT is compatibility mode. | 539 | * segment in LDT is compatibility mode. |
540 | */ | 540 | */ |
541 | static int is_errata100(struct pt_regs *regs, unsigned long address) | 541 | static int is_errata100(struct pt_regs *regs, unsigned long address) |
542 | { | 542 | { |
543 | #ifdef CONFIG_X86_64 | 543 | #ifdef CONFIG_X86_64 |
544 | if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) | 544 | if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) |
545 | return 1; | 545 | return 1; |
546 | #endif | 546 | #endif |
547 | return 0; | 547 | return 0; |
548 | } | 548 | } |
549 | 549 | ||
550 | static int is_f00f_bug(struct pt_regs *regs, unsigned long address) | 550 | static int is_f00f_bug(struct pt_regs *regs, unsigned long address) |
551 | { | 551 | { |
552 | #ifdef CONFIG_X86_F00F_BUG | 552 | #ifdef CONFIG_X86_F00F_BUG |
553 | unsigned long nr; | 553 | unsigned long nr; |
554 | 554 | ||
555 | /* | 555 | /* |
556 | * Pentium F0 0F C7 C8 bug workaround: | 556 | * Pentium F0 0F C7 C8 bug workaround: |
557 | */ | 557 | */ |
558 | if (boot_cpu_data.f00f_bug) { | 558 | if (boot_cpu_data.f00f_bug) { |
559 | nr = (address - idt_descr.address) >> 3; | 559 | nr = (address - idt_descr.address) >> 3; |
560 | 560 | ||
561 | if (nr == 6) { | 561 | if (nr == 6) { |
562 | do_invalid_op(regs, 0); | 562 | do_invalid_op(regs, 0); |
563 | return 1; | 563 | return 1; |
564 | } | 564 | } |
565 | } | 565 | } |
566 | #endif | 566 | #endif |
567 | return 0; | 567 | return 0; |
568 | } | 568 | } |
569 | 569 | ||
570 | static const char nx_warning[] = KERN_CRIT | 570 | static const char nx_warning[] = KERN_CRIT |
571 | "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; | 571 | "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; |
572 | 572 | ||
573 | static void | 573 | static void |
574 | show_fault_oops(struct pt_regs *regs, unsigned long error_code, | 574 | show_fault_oops(struct pt_regs *regs, unsigned long error_code, |
575 | unsigned long address) | 575 | unsigned long address) |
576 | { | 576 | { |
577 | if (!oops_may_print()) | 577 | if (!oops_may_print()) |
578 | return; | 578 | return; |
579 | 579 | ||
580 | if (error_code & PF_INSTR) { | 580 | if (error_code & PF_INSTR) { |
581 | unsigned int level; | 581 | unsigned int level; |
582 | 582 | ||
583 | pte_t *pte = lookup_address(address, &level); | 583 | pte_t *pte = lookup_address(address, &level); |
584 | 584 | ||
585 | if (pte && pte_present(*pte) && !pte_exec(*pte)) | 585 | if (pte && pte_present(*pte) && !pte_exec(*pte)) |
586 | printk(nx_warning, from_kuid(&init_user_ns, current_uid())); | 586 | printk(nx_warning, from_kuid(&init_user_ns, current_uid())); |
587 | } | 587 | } |
588 | 588 | ||
589 | printk(KERN_ALERT "BUG: unable to handle kernel "); | 589 | printk(KERN_ALERT "BUG: unable to handle kernel "); |
590 | if (address < PAGE_SIZE) | 590 | if (address < PAGE_SIZE) |
591 | printk(KERN_CONT "NULL pointer dereference"); | 591 | printk(KERN_CONT "NULL pointer dereference"); |
592 | else | 592 | else |
593 | printk(KERN_CONT "paging request"); | 593 | printk(KERN_CONT "paging request"); |
594 | 594 | ||
595 | printk(KERN_CONT " at %p\n", (void *) address); | 595 | printk(KERN_CONT " at %p\n", (void *) address); |
596 | printk(KERN_ALERT "IP:"); | 596 | printk(KERN_ALERT "IP:"); |
597 | printk_address(regs->ip, 1); | 597 | printk_address(regs->ip, 1); |
598 | 598 | ||
599 | dump_pagetable(address); | 599 | dump_pagetable(address); |
600 | } | 600 | } |
601 | 601 | ||
602 | static noinline void | 602 | static noinline void |
603 | pgtable_bad(struct pt_regs *regs, unsigned long error_code, | 603 | pgtable_bad(struct pt_regs *regs, unsigned long error_code, |
604 | unsigned long address) | 604 | unsigned long address) |
605 | { | 605 | { |
606 | struct task_struct *tsk; | 606 | struct task_struct *tsk; |
607 | unsigned long flags; | 607 | unsigned long flags; |
608 | int sig; | 608 | int sig; |
609 | 609 | ||
610 | flags = oops_begin(); | 610 | flags = oops_begin(); |
611 | tsk = current; | 611 | tsk = current; |
612 | sig = SIGKILL; | 612 | sig = SIGKILL; |
613 | 613 | ||
614 | printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", | 614 | printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", |
615 | tsk->comm, address); | 615 | tsk->comm, address); |
616 | dump_pagetable(address); | 616 | dump_pagetable(address); |
617 | 617 | ||
618 | tsk->thread.cr2 = address; | 618 | tsk->thread.cr2 = address; |
619 | tsk->thread.trap_nr = X86_TRAP_PF; | 619 | tsk->thread.trap_nr = X86_TRAP_PF; |
620 | tsk->thread.error_code = error_code; | 620 | tsk->thread.error_code = error_code; |
621 | 621 | ||
622 | if (__die("Bad pagetable", regs, error_code)) | 622 | if (__die("Bad pagetable", regs, error_code)) |
623 | sig = 0; | 623 | sig = 0; |
624 | 624 | ||
625 | oops_end(flags, regs, sig); | 625 | oops_end(flags, regs, sig); |
626 | } | 626 | } |
627 | 627 | ||
628 | static noinline void | 628 | static noinline void |
629 | no_context(struct pt_regs *regs, unsigned long error_code, | 629 | no_context(struct pt_regs *regs, unsigned long error_code, |
630 | unsigned long address, int signal, int si_code) | 630 | unsigned long address, int signal, int si_code) |
631 | { | 631 | { |
632 | struct task_struct *tsk = current; | 632 | struct task_struct *tsk = current; |
633 | unsigned long *stackend; | 633 | unsigned long *stackend; |
634 | unsigned long flags; | 634 | unsigned long flags; |
635 | int sig; | 635 | int sig; |
636 | 636 | ||
637 | /* Are we prepared to handle this kernel fault? */ | 637 | /* Are we prepared to handle this kernel fault? */ |
638 | if (fixup_exception(regs)) { | 638 | if (fixup_exception(regs)) { |
639 | if (current_thread_info()->sig_on_uaccess_error && signal) { | 639 | if (current_thread_info()->sig_on_uaccess_error && signal) { |
640 | tsk->thread.trap_nr = X86_TRAP_PF; | 640 | tsk->thread.trap_nr = X86_TRAP_PF; |
641 | tsk->thread.error_code = error_code | PF_USER; | 641 | tsk->thread.error_code = error_code | PF_USER; |
642 | tsk->thread.cr2 = address; | 642 | tsk->thread.cr2 = address; |
643 | 643 | ||
644 | /* XXX: hwpoison faults will set the wrong code. */ | 644 | /* XXX: hwpoison faults will set the wrong code. */ |
645 | force_sig_info_fault(signal, si_code, address, tsk, 0); | 645 | force_sig_info_fault(signal, si_code, address, tsk, 0); |
646 | } | 646 | } |
647 | return; | 647 | return; |
648 | } | 648 | } |
649 | 649 | ||
650 | /* | 650 | /* |
651 | * 32-bit: | 651 | * 32-bit: |
652 | * | 652 | * |
653 | * Valid to do another page fault here, because if this fault | 653 | * Valid to do another page fault here, because if this fault |
654 | * had been triggered by is_prefetch fixup_exception would have | 654 | * had been triggered by is_prefetch fixup_exception would have |
655 | * handled it. | 655 | * handled it. |
656 | * | 656 | * |
657 | * 64-bit: | 657 | * 64-bit: |
658 | * | 658 | * |
659 | * Hall of shame of CPU/BIOS bugs. | 659 | * Hall of shame of CPU/BIOS bugs. |
660 | */ | 660 | */ |
661 | if (is_prefetch(regs, error_code, address)) | 661 | if (is_prefetch(regs, error_code, address)) |
662 | return; | 662 | return; |
663 | 663 | ||
664 | if (is_errata93(regs, address)) | 664 | if (is_errata93(regs, address)) |
665 | return; | 665 | return; |
666 | 666 | ||
667 | /* | 667 | /* |
668 | * Oops. The kernel tried to access some bad page. We'll have to | 668 | * Oops. The kernel tried to access some bad page. We'll have to |
669 | * terminate things with extreme prejudice: | 669 | * terminate things with extreme prejudice: |
670 | */ | 670 | */ |
671 | flags = oops_begin(); | 671 | flags = oops_begin(); |
672 | 672 | ||
673 | show_fault_oops(regs, error_code, address); | 673 | show_fault_oops(regs, error_code, address); |
674 | 674 | ||
675 | stackend = end_of_stack(tsk); | 675 | stackend = end_of_stack(tsk); |
676 | if (tsk != &init_task && *stackend != STACK_END_MAGIC) | 676 | if (tsk != &init_task && *stackend != STACK_END_MAGIC) |
677 | printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); | 677 | printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); |
678 | 678 | ||
679 | tsk->thread.cr2 = address; | 679 | tsk->thread.cr2 = address; |
680 | tsk->thread.trap_nr = X86_TRAP_PF; | 680 | tsk->thread.trap_nr = X86_TRAP_PF; |
681 | tsk->thread.error_code = error_code; | 681 | tsk->thread.error_code = error_code; |
682 | 682 | ||
683 | sig = SIGKILL; | 683 | sig = SIGKILL; |
684 | if (__die("Oops", regs, error_code)) | 684 | if (__die("Oops", regs, error_code)) |
685 | sig = 0; | 685 | sig = 0; |
686 | 686 | ||
687 | /* Executive summary in case the body of the oops scrolled away */ | 687 | /* Executive summary in case the body of the oops scrolled away */ |
688 | printk(KERN_DEFAULT "CR2: %016lx\n", address); | 688 | printk(KERN_DEFAULT "CR2: %016lx\n", address); |
689 | 689 | ||
690 | oops_end(flags, regs, sig); | 690 | oops_end(flags, regs, sig); |
691 | } | 691 | } |
692 | 692 | ||
693 | /* | 693 | /* |
694 | * Print out info about fatal segfaults, if the show_unhandled_signals | 694 | * Print out info about fatal segfaults, if the show_unhandled_signals |
695 | * sysctl is set: | 695 | * sysctl is set: |
696 | */ | 696 | */ |
697 | static inline void | 697 | static inline void |
698 | show_signal_msg(struct pt_regs *regs, unsigned long error_code, | 698 | show_signal_msg(struct pt_regs *regs, unsigned long error_code, |
699 | unsigned long address, struct task_struct *tsk) | 699 | unsigned long address, struct task_struct *tsk) |
700 | { | 700 | { |
701 | if (!unhandled_signal(tsk, SIGSEGV)) | 701 | if (!unhandled_signal(tsk, SIGSEGV)) |
702 | return; | 702 | return; |
703 | 703 | ||
704 | if (!printk_ratelimit()) | 704 | if (!printk_ratelimit()) |
705 | return; | 705 | return; |
706 | 706 | ||
707 | printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", | 707 | printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", |
708 | task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, | 708 | task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, |
709 | tsk->comm, task_pid_nr(tsk), address, | 709 | tsk->comm, task_pid_nr(tsk), address, |
710 | (void *)regs->ip, (void *)regs->sp, error_code); | 710 | (void *)regs->ip, (void *)regs->sp, error_code); |
711 | 711 | ||
712 | print_vma_addr(KERN_CONT " in ", regs->ip); | 712 | print_vma_addr(KERN_CONT " in ", regs->ip); |
713 | 713 | ||
714 | printk(KERN_CONT "\n"); | 714 | printk(KERN_CONT "\n"); |
715 | } | 715 | } |
716 | 716 | ||
717 | static void | 717 | static void |
718 | __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, | 718 | __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, |
719 | unsigned long address, int si_code) | 719 | unsigned long address, int si_code) |
720 | { | 720 | { |
721 | struct task_struct *tsk = current; | 721 | struct task_struct *tsk = current; |
722 | 722 | ||
723 | /* User mode accesses just cause a SIGSEGV */ | 723 | /* User mode accesses just cause a SIGSEGV */ |
724 | if (error_code & PF_USER) { | 724 | if (error_code & PF_USER) { |
725 | /* | 725 | /* |
726 | * It's possible to have interrupts off here: | 726 | * It's possible to have interrupts off here: |
727 | */ | 727 | */ |
728 | local_irq_enable(); | 728 | local_irq_enable(); |
729 | 729 | ||
730 | /* | 730 | /* |
731 | * Valid to do another page fault here because this one came | 731 | * Valid to do another page fault here because this one came |
732 | * from user space: | 732 | * from user space: |
733 | */ | 733 | */ |
734 | if (is_prefetch(regs, error_code, address)) | 734 | if (is_prefetch(regs, error_code, address)) |
735 | return; | 735 | return; |
736 | 736 | ||
737 | if (is_errata100(regs, address)) | 737 | if (is_errata100(regs, address)) |
738 | return; | 738 | return; |
739 | 739 | ||
740 | #ifdef CONFIG_X86_64 | 740 | #ifdef CONFIG_X86_64 |
741 | /* | 741 | /* |
742 | * Instruction fetch faults in the vsyscall page might need | 742 | * Instruction fetch faults in the vsyscall page might need |
743 | * emulation. | 743 | * emulation. |
744 | */ | 744 | */ |
745 | if (unlikely((error_code & PF_INSTR) && | 745 | if (unlikely((error_code & PF_INSTR) && |
746 | ((address & ~0xfff) == VSYSCALL_START))) { | 746 | ((address & ~0xfff) == VSYSCALL_START))) { |
747 | if (emulate_vsyscall(regs, address)) | 747 | if (emulate_vsyscall(regs, address)) |
748 | return; | 748 | return; |
749 | } | 749 | } |
750 | #endif | 750 | #endif |
751 | /* Kernel addresses are always protection faults: */ | 751 | /* Kernel addresses are always protection faults: */ |
752 | if (address >= TASK_SIZE) | 752 | if (address >= TASK_SIZE) |
753 | error_code |= PF_PROT; | 753 | error_code |= PF_PROT; |
754 | 754 | ||
755 | if (likely(show_unhandled_signals)) | 755 | if (likely(show_unhandled_signals)) |
756 | show_signal_msg(regs, error_code, address, tsk); | 756 | show_signal_msg(regs, error_code, address, tsk); |
757 | 757 | ||
758 | tsk->thread.cr2 = address; | 758 | tsk->thread.cr2 = address; |
759 | tsk->thread.error_code = error_code; | 759 | tsk->thread.error_code = error_code; |
760 | tsk->thread.trap_nr = X86_TRAP_PF; | 760 | tsk->thread.trap_nr = X86_TRAP_PF; |
761 | 761 | ||
762 | force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); | 762 | force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); |
763 | 763 | ||
764 | return; | 764 | return; |
765 | } | 765 | } |
766 | 766 | ||
767 | if (is_f00f_bug(regs, address)) | 767 | if (is_f00f_bug(regs, address)) |
768 | return; | 768 | return; |
769 | 769 | ||
770 | no_context(regs, error_code, address, SIGSEGV, si_code); | 770 | no_context(regs, error_code, address, SIGSEGV, si_code); |
771 | } | 771 | } |
772 | 772 | ||
773 | static noinline void | 773 | static noinline void |
774 | bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, | 774 | bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, |
775 | unsigned long address) | 775 | unsigned long address) |
776 | { | 776 | { |
777 | __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); | 777 | __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); |
778 | } | 778 | } |
779 | 779 | ||
780 | static void | 780 | static void |
781 | __bad_area(struct pt_regs *regs, unsigned long error_code, | 781 | __bad_area(struct pt_regs *regs, unsigned long error_code, |
782 | unsigned long address, int si_code) | 782 | unsigned long address, int si_code) |
783 | { | 783 | { |
784 | struct mm_struct *mm = current->mm; | 784 | struct mm_struct *mm = current->mm; |
785 | 785 | ||
786 | /* | 786 | /* |
787 | * Something tried to access memory that isn't in our memory map.. | 787 | * Something tried to access memory that isn't in our memory map.. |
788 | * Fix it, but check if it's kernel or user first.. | 788 | * Fix it, but check if it's kernel or user first.. |
789 | */ | 789 | */ |
790 | up_read(&mm->mmap_sem); | 790 | up_read(&mm->mmap_sem); |
791 | 791 | ||
792 | __bad_area_nosemaphore(regs, error_code, address, si_code); | 792 | __bad_area_nosemaphore(regs, error_code, address, si_code); |
793 | } | 793 | } |
794 | 794 | ||
795 | static noinline void | 795 | static noinline void |
796 | bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) | 796 | bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) |
797 | { | 797 | { |
798 | __bad_area(regs, error_code, address, SEGV_MAPERR); | 798 | __bad_area(regs, error_code, address, SEGV_MAPERR); |
799 | } | 799 | } |
800 | 800 | ||
801 | static noinline void | 801 | static noinline void |
802 | bad_area_access_error(struct pt_regs *regs, unsigned long error_code, | 802 | bad_area_access_error(struct pt_regs *regs, unsigned long error_code, |
803 | unsigned long address) | 803 | unsigned long address) |
804 | { | 804 | { |
805 | __bad_area(regs, error_code, address, SEGV_ACCERR); | 805 | __bad_area(regs, error_code, address, SEGV_ACCERR); |
806 | } | 806 | } |
807 | 807 | ||
808 | static void | 808 | static void |
809 | do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, | 809 | do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, |
810 | unsigned int fault) | 810 | unsigned int fault) |
811 | { | 811 | { |
812 | struct task_struct *tsk = current; | 812 | struct task_struct *tsk = current; |
813 | struct mm_struct *mm = tsk->mm; | 813 | struct mm_struct *mm = tsk->mm; |
814 | int code = BUS_ADRERR; | 814 | int code = BUS_ADRERR; |
815 | 815 | ||
816 | up_read(&mm->mmap_sem); | 816 | up_read(&mm->mmap_sem); |
817 | 817 | ||
818 | /* Kernel mode? Handle exceptions or die: */ | 818 | /* Kernel mode? Handle exceptions or die: */ |
819 | if (!(error_code & PF_USER)) { | 819 | if (!(error_code & PF_USER)) { |
820 | no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); | 820 | no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); |
821 | return; | 821 | return; |
822 | } | 822 | } |
823 | 823 | ||
824 | /* User-space => ok to do another page fault: */ | 824 | /* User-space => ok to do another page fault: */ |
825 | if (is_prefetch(regs, error_code, address)) | 825 | if (is_prefetch(regs, error_code, address)) |
826 | return; | 826 | return; |
827 | 827 | ||
828 | tsk->thread.cr2 = address; | 828 | tsk->thread.cr2 = address; |
829 | tsk->thread.error_code = error_code; | 829 | tsk->thread.error_code = error_code; |
830 | tsk->thread.trap_nr = X86_TRAP_PF; | 830 | tsk->thread.trap_nr = X86_TRAP_PF; |
831 | 831 | ||
832 | #ifdef CONFIG_MEMORY_FAILURE | 832 | #ifdef CONFIG_MEMORY_FAILURE |
833 | if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { | 833 | if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { |
834 | printk(KERN_ERR | 834 | printk(KERN_ERR |
835 | "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", | 835 | "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", |
836 | tsk->comm, tsk->pid, address); | 836 | tsk->comm, tsk->pid, address); |
837 | code = BUS_MCEERR_AR; | 837 | code = BUS_MCEERR_AR; |
838 | } | 838 | } |
839 | #endif | 839 | #endif |
840 | force_sig_info_fault(SIGBUS, code, address, tsk, fault); | 840 | force_sig_info_fault(SIGBUS, code, address, tsk, fault); |
841 | } | 841 | } |
842 | 842 | ||
843 | static noinline int | 843 | static noinline int |
844 | mm_fault_error(struct pt_regs *regs, unsigned long error_code, | 844 | mm_fault_error(struct pt_regs *regs, unsigned long error_code, |
845 | unsigned long address, unsigned int fault) | 845 | unsigned long address, unsigned int fault) |
846 | { | 846 | { |
847 | /* | 847 | /* |
848 | * Pagefault was interrupted by SIGKILL. We have no reason to | 848 | * Pagefault was interrupted by SIGKILL. We have no reason to |
849 | * continue pagefault. | 849 | * continue pagefault. |
850 | */ | 850 | */ |
851 | if (fatal_signal_pending(current)) { | 851 | if (fatal_signal_pending(current)) { |
852 | if (!(fault & VM_FAULT_RETRY)) | 852 | if (!(fault & VM_FAULT_RETRY)) |
853 | up_read(&current->mm->mmap_sem); | 853 | up_read(&current->mm->mmap_sem); |
854 | if (!(error_code & PF_USER)) | 854 | if (!(error_code & PF_USER)) |
855 | no_context(regs, error_code, address, 0, 0); | 855 | no_context(regs, error_code, address, 0, 0); |
856 | return 1; | 856 | return 1; |
857 | } | 857 | } |
858 | if (!(fault & VM_FAULT_ERROR)) | 858 | if (!(fault & VM_FAULT_ERROR)) |
859 | return 0; | 859 | return 0; |
860 | 860 | ||
861 | if (fault & VM_FAULT_OOM) { | 861 | if (fault & VM_FAULT_OOM) { |
862 | /* Kernel mode? Handle exceptions or die: */ | 862 | /* Kernel mode? Handle exceptions or die: */ |
863 | if (!(error_code & PF_USER)) { | 863 | if (!(error_code & PF_USER)) { |
864 | up_read(&current->mm->mmap_sem); | 864 | up_read(&current->mm->mmap_sem); |
865 | no_context(regs, error_code, address, | 865 | no_context(regs, error_code, address, |
866 | SIGSEGV, SEGV_MAPERR); | 866 | SIGSEGV, SEGV_MAPERR); |
867 | return 1; | 867 | return 1; |
868 | } | 868 | } |
869 | 869 | ||
870 | up_read(&current->mm->mmap_sem); | 870 | up_read(&current->mm->mmap_sem); |
871 | 871 | ||
872 | /* | 872 | /* |
873 | * We ran out of memory, call the OOM killer, and return the | 873 | * We ran out of memory, call the OOM killer, and return the |
874 | * userspace (which will retry the fault, or kill us if we got | 874 | * userspace (which will retry the fault, or kill us if we got |
875 | * oom-killed): | 875 | * oom-killed): |
876 | */ | 876 | */ |
877 | pagefault_out_of_memory(); | 877 | pagefault_out_of_memory(); |
878 | } else { | 878 | } else { |
879 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| | 879 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| |
880 | VM_FAULT_HWPOISON_LARGE)) | 880 | VM_FAULT_HWPOISON_LARGE)) |
881 | do_sigbus(regs, error_code, address, fault); | 881 | do_sigbus(regs, error_code, address, fault); |
882 | else | 882 | else |
883 | BUG(); | 883 | BUG(); |
884 | } | 884 | } |
885 | return 1; | 885 | return 1; |
886 | } | 886 | } |
887 | 887 | ||
888 | static int spurious_fault_check(unsigned long error_code, pte_t *pte) | 888 | static int spurious_fault_check(unsigned long error_code, pte_t *pte) |
889 | { | 889 | { |
890 | if ((error_code & PF_WRITE) && !pte_write(*pte)) | 890 | if ((error_code & PF_WRITE) && !pte_write(*pte)) |
891 | return 0; | 891 | return 0; |
892 | 892 | ||
893 | if ((error_code & PF_INSTR) && !pte_exec(*pte)) | 893 | if ((error_code & PF_INSTR) && !pte_exec(*pte)) |
894 | return 0; | 894 | return 0; |
895 | 895 | ||
896 | return 1; | 896 | return 1; |
897 | } | 897 | } |
898 | 898 | ||
899 | /* | 899 | /* |
900 | * Handle a spurious fault caused by a stale TLB entry. | 900 | * Handle a spurious fault caused by a stale TLB entry. |
901 | * | 901 | * |
902 | * This allows us to lazily refresh the TLB when increasing the | 902 | * This allows us to lazily refresh the TLB when increasing the |
903 | * permissions of a kernel page (RO -> RW or NX -> X). Doing it | 903 | * permissions of a kernel page (RO -> RW or NX -> X). Doing it |
904 | * eagerly is very expensive since that implies doing a full | 904 | * eagerly is very expensive since that implies doing a full |
905 | * cross-processor TLB flush, even if no stale TLB entries exist | 905 | * cross-processor TLB flush, even if no stale TLB entries exist |
906 | * on other processors. | 906 | * on other processors. |
907 | * | 907 | * |
908 | * There are no security implications to leaving a stale TLB when | 908 | * There are no security implications to leaving a stale TLB when |
909 | * increasing the permissions on a page. | 909 | * increasing the permissions on a page. |
910 | */ | 910 | */ |
911 | static noinline __kprobes int | 911 | static noinline __kprobes int |
912 | spurious_fault(unsigned long error_code, unsigned long address) | 912 | spurious_fault(unsigned long error_code, unsigned long address) |
913 | { | 913 | { |
914 | pgd_t *pgd; | 914 | pgd_t *pgd; |
915 | pud_t *pud; | 915 | pud_t *pud; |
916 | pmd_t *pmd; | 916 | pmd_t *pmd; |
917 | pte_t *pte; | 917 | pte_t *pte; |
918 | int ret; | 918 | int ret; |
919 | 919 | ||
920 | /* Reserved-bit violation or user access to kernel space? */ | 920 | /* Reserved-bit violation or user access to kernel space? */ |
921 | if (error_code & (PF_USER | PF_RSVD)) | 921 | if (error_code & (PF_USER | PF_RSVD)) |
922 | return 0; | 922 | return 0; |
923 | 923 | ||
924 | pgd = init_mm.pgd + pgd_index(address); | 924 | pgd = init_mm.pgd + pgd_index(address); |
925 | if (!pgd_present(*pgd)) | 925 | if (!pgd_present(*pgd)) |
926 | return 0; | 926 | return 0; |
927 | 927 | ||
928 | pud = pud_offset(pgd, address); | 928 | pud = pud_offset(pgd, address); |
929 | if (!pud_present(*pud)) | 929 | if (!pud_present(*pud)) |
930 | return 0; | 930 | return 0; |
931 | 931 | ||
932 | if (pud_large(*pud)) | 932 | if (pud_large(*pud)) |
933 | return spurious_fault_check(error_code, (pte_t *) pud); | 933 | return spurious_fault_check(error_code, (pte_t *) pud); |
934 | 934 | ||
935 | pmd = pmd_offset(pud, address); | 935 | pmd = pmd_offset(pud, address); |
936 | if (!pmd_present(*pmd)) | 936 | if (!pmd_present(*pmd)) |
937 | return 0; | 937 | return 0; |
938 | 938 | ||
939 | if (pmd_large(*pmd)) | 939 | if (pmd_large(*pmd)) |
940 | return spurious_fault_check(error_code, (pte_t *) pmd); | 940 | return spurious_fault_check(error_code, (pte_t *) pmd); |
941 | 941 | ||
942 | pte = pte_offset_kernel(pmd, address); | 942 | pte = pte_offset_kernel(pmd, address); |
943 | if (!pte_present(*pte)) | 943 | if (!pte_present(*pte)) |
944 | return 0; | 944 | return 0; |
945 | 945 | ||
946 | ret = spurious_fault_check(error_code, pte); | 946 | ret = spurious_fault_check(error_code, pte); |
947 | if (!ret) | 947 | if (!ret) |
948 | return 0; | 948 | return 0; |
949 | 949 | ||
950 | /* | 950 | /* |
951 | * Make sure we have permissions in PMD. | 951 | * Make sure we have permissions in PMD. |
952 | * If not, then there's a bug in the page tables: | 952 | * If not, then there's a bug in the page tables: |
953 | */ | 953 | */ |
954 | ret = spurious_fault_check(error_code, (pte_t *) pmd); | 954 | ret = spurious_fault_check(error_code, (pte_t *) pmd); |
955 | WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); | 955 | WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); |
956 | 956 | ||
957 | return ret; | 957 | return ret; |
958 | } | 958 | } |
959 | 959 | ||
960 | int show_unhandled_signals = 1; | 960 | int show_unhandled_signals = 1; |
961 | 961 | ||
962 | static inline int | 962 | static inline int |
963 | access_error(unsigned long error_code, struct vm_area_struct *vma) | 963 | access_error(unsigned long error_code, struct vm_area_struct *vma) |
964 | { | 964 | { |
965 | if (error_code & PF_WRITE) { | 965 | if (error_code & PF_WRITE) { |
966 | /* write, present and write, not present: */ | 966 | /* write, present and write, not present: */ |
967 | if (unlikely(!(vma->vm_flags & VM_WRITE))) | 967 | if (unlikely(!(vma->vm_flags & VM_WRITE))) |
968 | return 1; | 968 | return 1; |
969 | return 0; | 969 | return 0; |
970 | } | 970 | } |
971 | 971 | ||
972 | /* read, present: */ | 972 | /* read, present: */ |
973 | if (unlikely(error_code & PF_PROT)) | 973 | if (unlikely(error_code & PF_PROT)) |
974 | return 1; | 974 | return 1; |
975 | 975 | ||
976 | /* read, not present: */ | 976 | /* read, not present: */ |
977 | if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) | 977 | if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) |
978 | return 1; | 978 | return 1; |
979 | 979 | ||
980 | return 0; | 980 | return 0; |
981 | } | 981 | } |
982 | 982 | ||
983 | static int fault_in_kernel_space(unsigned long address) | 983 | static int fault_in_kernel_space(unsigned long address) |
984 | { | 984 | { |
985 | return address >= TASK_SIZE_MAX; | 985 | return address >= TASK_SIZE_MAX; |
986 | } | 986 | } |
987 | 987 | ||
988 | static inline bool smap_violation(int error_code, struct pt_regs *regs) | 988 | static inline bool smap_violation(int error_code, struct pt_regs *regs) |
989 | { | 989 | { |
990 | if (error_code & PF_USER) | 990 | if (error_code & PF_USER) |
991 | return false; | 991 | return false; |
992 | 992 | ||
993 | if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) | 993 | if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) |
994 | return false; | 994 | return false; |
995 | 995 | ||
996 | return true; | 996 | return true; |
997 | } | 997 | } |
998 | 998 | ||
999 | /* | 999 | /* |
1000 | * This routine handles page faults. It determines the address, | 1000 | * This routine handles page faults. It determines the address, |
1001 | * and the problem, and then passes it off to one of the appropriate | 1001 | * and the problem, and then passes it off to one of the appropriate |
1002 | * routines. | 1002 | * routines. |
1003 | */ | 1003 | */ |
1004 | static void __kprobes | 1004 | static void __kprobes |
1005 | __do_page_fault(struct pt_regs *regs, unsigned long error_code) | 1005 | __do_page_fault(struct pt_regs *regs, unsigned long error_code) |
1006 | { | 1006 | { |
1007 | struct vm_area_struct *vma; | 1007 | struct vm_area_struct *vma; |
1008 | struct task_struct *tsk; | 1008 | struct task_struct *tsk; |
1009 | unsigned long address; | 1009 | unsigned long address; |
1010 | struct mm_struct *mm; | 1010 | struct mm_struct *mm; |
1011 | int fault; | 1011 | int fault; |
1012 | int write = error_code & PF_WRITE; | 1012 | int write = error_code & PF_WRITE; |
1013 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | 1013 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | |
1014 | (write ? FAULT_FLAG_WRITE : 0); | 1014 | (write ? FAULT_FLAG_WRITE : 0); |
1015 | 1015 | ||
1016 | tsk = current; | 1016 | tsk = current; |
1017 | mm = tsk->mm; | 1017 | mm = tsk->mm; |
1018 | 1018 | ||
1019 | /* Get the faulting address: */ | 1019 | /* Get the faulting address: */ |
1020 | address = read_cr2(); | 1020 | address = read_cr2(); |
1021 | 1021 | ||
1022 | /* | 1022 | /* |
1023 | * Detect and handle instructions that would cause a page fault for | 1023 | * Detect and handle instructions that would cause a page fault for |
1024 | * both a tracked kernel page and a userspace page. | 1024 | * both a tracked kernel page and a userspace page. |
1025 | */ | 1025 | */ |
1026 | if (kmemcheck_active(regs)) | 1026 | if (kmemcheck_active(regs)) |
1027 | kmemcheck_hide(regs); | 1027 | kmemcheck_hide(regs); |
1028 | prefetchw(&mm->mmap_sem); | 1028 | prefetchw(&mm->mmap_sem); |
1029 | 1029 | ||
1030 | if (unlikely(kmmio_fault(regs, address))) | 1030 | if (unlikely(kmmio_fault(regs, address))) |
1031 | return; | 1031 | return; |
1032 | 1032 | ||
1033 | /* | 1033 | /* |
1034 | * We fault-in kernel-space virtual memory on-demand. The | 1034 | * We fault-in kernel-space virtual memory on-demand. The |
1035 | * 'reference' page table is init_mm.pgd. | 1035 | * 'reference' page table is init_mm.pgd. |
1036 | * | 1036 | * |
1037 | * NOTE! We MUST NOT take any locks for this case. We may | 1037 | * NOTE! We MUST NOT take any locks for this case. We may |
1038 | * be in an interrupt or a critical region, and should | 1038 | * be in an interrupt or a critical region, and should |
1039 | * only copy the information from the master page table, | 1039 | * only copy the information from the master page table, |
1040 | * nothing more. | 1040 | * nothing more. |
1041 | * | 1041 | * |
1042 | * This verifies that the fault happens in kernel space | 1042 | * This verifies that the fault happens in kernel space |
1043 | * (error_code & 4) == 0, and that the fault was not a | 1043 | * (error_code & 4) == 0, and that the fault was not a |
1044 | * protection error (error_code & 9) == 0. | 1044 | * protection error (error_code & 9) == 0. |
1045 | */ | 1045 | */ |
1046 | if (unlikely(fault_in_kernel_space(address))) { | 1046 | if (unlikely(fault_in_kernel_space(address))) { |
1047 | if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { | 1047 | if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { |
1048 | if (vmalloc_fault(address) >= 0) | 1048 | if (vmalloc_fault(address) >= 0) |
1049 | return; | 1049 | return; |
1050 | 1050 | ||
1051 | if (kmemcheck_fault(regs, address, error_code)) | 1051 | if (kmemcheck_fault(regs, address, error_code)) |
1052 | return; | 1052 | return; |
1053 | } | 1053 | } |
1054 | 1054 | ||
1055 | /* Can handle a stale RO->RW TLB: */ | 1055 | /* Can handle a stale RO->RW TLB: */ |
1056 | if (spurious_fault(error_code, address)) | 1056 | if (spurious_fault(error_code, address)) |
1057 | return; | 1057 | return; |
1058 | 1058 | ||
1059 | /* kprobes don't want to hook the spurious faults: */ | 1059 | /* kprobes don't want to hook the spurious faults: */ |
1060 | if (notify_page_fault(regs)) | 1060 | if (notify_page_fault(regs)) |
1061 | return; | 1061 | return; |
1062 | /* | 1062 | /* |
1063 | * Don't take the mm semaphore here. If we fixup a prefetch | 1063 | * Don't take the mm semaphore here. If we fixup a prefetch |
1064 | * fault we could otherwise deadlock: | 1064 | * fault we could otherwise deadlock: |
1065 | */ | 1065 | */ |
1066 | bad_area_nosemaphore(regs, error_code, address); | 1066 | bad_area_nosemaphore(regs, error_code, address); |
1067 | 1067 | ||
1068 | return; | 1068 | return; |
1069 | } | 1069 | } |
1070 | 1070 | ||
1071 | /* kprobes don't want to hook the spurious faults: */ | 1071 | /* kprobes don't want to hook the spurious faults: */ |
1072 | if (unlikely(notify_page_fault(regs))) | 1072 | if (unlikely(notify_page_fault(regs))) |
1073 | return; | 1073 | return; |
1074 | /* | 1074 | /* |
1075 | * It's safe to allow irq's after cr2 has been saved and the | 1075 | * It's safe to allow irq's after cr2 has been saved and the |
1076 | * vmalloc fault has been handled. | 1076 | * vmalloc fault has been handled. |
1077 | * | 1077 | * |
1078 | * User-mode registers count as a user access even for any | 1078 | * User-mode registers count as a user access even for any |
1079 | * potential system fault or CPU buglet: | 1079 | * potential system fault or CPU buglet: |
1080 | */ | 1080 | */ |
1081 | if (user_mode_vm(regs)) { | 1081 | if (user_mode_vm(regs)) { |
1082 | local_irq_enable(); | 1082 | local_irq_enable(); |
1083 | error_code |= PF_USER; | 1083 | error_code |= PF_USER; |
1084 | } else { | 1084 | } else { |
1085 | if (regs->flags & X86_EFLAGS_IF) | 1085 | if (regs->flags & X86_EFLAGS_IF) |
1086 | local_irq_enable(); | 1086 | local_irq_enable(); |
1087 | } | 1087 | } |
1088 | 1088 | ||
1089 | if (unlikely(error_code & PF_RSVD)) | 1089 | if (unlikely(error_code & PF_RSVD)) |
1090 | pgtable_bad(regs, error_code, address); | 1090 | pgtable_bad(regs, error_code, address); |
1091 | 1091 | ||
1092 | if (static_cpu_has(X86_FEATURE_SMAP)) { | 1092 | if (static_cpu_has(X86_FEATURE_SMAP)) { |
1093 | if (unlikely(smap_violation(error_code, regs))) { | 1093 | if (unlikely(smap_violation(error_code, regs))) { |
1094 | bad_area_nosemaphore(regs, error_code, address); | 1094 | bad_area_nosemaphore(regs, error_code, address); |
1095 | return; | 1095 | return; |
1096 | } | 1096 | } |
1097 | } | 1097 | } |
1098 | 1098 | ||
1099 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); | 1099 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); |
1100 | 1100 | ||
1101 | /* | 1101 | /* |
1102 | * If we're in an interrupt, have no user context or are running | 1102 | * If we're in an interrupt, have no user context or are running |
1103 | * in an atomic region then we must not take the fault: | 1103 | * in an atomic region then we must not take the fault: |
1104 | */ | 1104 | */ |
1105 | if (unlikely(in_atomic() || !mm)) { | 1105 | if (unlikely(in_atomic() || !mm)) { |
1106 | bad_area_nosemaphore(regs, error_code, address); | 1106 | bad_area_nosemaphore(regs, error_code, address); |
1107 | return; | 1107 | return; |
1108 | } | 1108 | } |
1109 | 1109 | ||
1110 | /* | 1110 | /* |
1111 | * When running in the kernel we expect faults to occur only to | 1111 | * When running in the kernel we expect faults to occur only to |
1112 | * addresses in user space. All other faults represent errors in | 1112 | * addresses in user space. All other faults represent errors in |
1113 | * the kernel and should generate an OOPS. Unfortunately, in the | 1113 | * the kernel and should generate an OOPS. Unfortunately, in the |
1114 | * case of an erroneous fault occurring in a code path which already | 1114 | * case of an erroneous fault occurring in a code path which already |
1115 | * holds mmap_sem we will deadlock attempting to validate the fault | 1115 | * holds mmap_sem we will deadlock attempting to validate the fault |
1116 | * against the address space. Luckily the kernel only validly | 1116 | * against the address space. Luckily the kernel only validly |
1117 | * references user space from well defined areas of code, which are | 1117 | * references user space from well defined areas of code, which are |
1118 | * listed in the exceptions table. | 1118 | * listed in the exceptions table. |
1119 | * | 1119 | * |
1120 | * As the vast majority of faults will be valid we will only perform | 1120 | * As the vast majority of faults will be valid we will only perform |
1121 | * the source reference check when there is a possibility of a | 1121 | * the source reference check when there is a possibility of a |
1122 | * deadlock. Attempt to lock the address space, if we cannot we then | 1122 | * deadlock. Attempt to lock the address space, if we cannot we then |
1123 | * validate the source. If this is invalid we can skip the address | 1123 | * validate the source. If this is invalid we can skip the address |
1124 | * space check, thus avoiding the deadlock: | 1124 | * space check, thus avoiding the deadlock: |
1125 | */ | 1125 | */ |
1126 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { | 1126 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { |
1127 | if ((error_code & PF_USER) == 0 && | 1127 | if ((error_code & PF_USER) == 0 && |
1128 | !search_exception_tables(regs->ip)) { | 1128 | !search_exception_tables(regs->ip)) { |
1129 | bad_area_nosemaphore(regs, error_code, address); | 1129 | bad_area_nosemaphore(regs, error_code, address); |
1130 | return; | 1130 | return; |
1131 | } | 1131 | } |
1132 | retry: | 1132 | retry: |
1133 | down_read(&mm->mmap_sem); | 1133 | down_read(&mm->mmap_sem); |
1134 | } else { | 1134 | } else { |
1135 | /* | 1135 | /* |
1136 | * The above down_read_trylock() might have succeeded in | 1136 | * The above down_read_trylock() might have succeeded in |
1137 | * which case we'll have missed the might_sleep() from | 1137 | * which case we'll have missed the might_sleep() from |
1138 | * down_read(): | 1138 | * down_read(): |
1139 | */ | 1139 | */ |
1140 | might_sleep(); | 1140 | might_sleep(); |
1141 | } | 1141 | } |
1142 | 1142 | ||
1143 | vma = find_vma(mm, address); | 1143 | vma = find_vma(mm, address); |
1144 | if (unlikely(!vma)) { | 1144 | if (unlikely(!vma)) { |
1145 | bad_area(regs, error_code, address); | 1145 | bad_area(regs, error_code, address); |
1146 | return; | 1146 | return; |
1147 | } | 1147 | } |
1148 | if (likely(vma->vm_start <= address)) | 1148 | if (likely(vma->vm_start <= address)) |
1149 | goto good_area; | 1149 | goto good_area; |
1150 | if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { | 1150 | if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { |
1151 | bad_area(regs, error_code, address); | 1151 | bad_area(regs, error_code, address); |
1152 | return; | 1152 | return; |
1153 | } | 1153 | } |
1154 | if (error_code & PF_USER) { | 1154 | if (error_code & PF_USER) { |
1155 | /* | 1155 | /* |
1156 | * Accessing the stack below %sp is always a bug. | 1156 | * Accessing the stack below %sp is always a bug. |
1157 | * The large cushion allows instructions like enter | 1157 | * The large cushion allows instructions like enter |
1158 | * and pusha to work. ("enter $65535, $31" pushes | 1158 | * and pusha to work. ("enter $65535, $31" pushes |
1159 | * 32 pointers and then decrements %sp by 65535.) | 1159 | * 32 pointers and then decrements %sp by 65535.) |
1160 | */ | 1160 | */ |
1161 | if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { | 1161 | if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { |
1162 | bad_area(regs, error_code, address); | 1162 | bad_area(regs, error_code, address); |
1163 | return; | 1163 | return; |
1164 | } | 1164 | } |
1165 | } | 1165 | } |
1166 | if (unlikely(expand_stack(vma, address))) { | 1166 | if (unlikely(expand_stack(vma, address))) { |
1167 | bad_area(regs, error_code, address); | 1167 | bad_area(regs, error_code, address); |
1168 | return; | 1168 | return; |
1169 | } | 1169 | } |
1170 | 1170 | ||
1171 | /* | 1171 | /* |
1172 | * Ok, we have a good vm_area for this memory access, so | 1172 | * Ok, we have a good vm_area for this memory access, so |
1173 | * we can handle it.. | 1173 | * we can handle it.. |
1174 | */ | 1174 | */ |
1175 | good_area: | 1175 | good_area: |
1176 | if (unlikely(access_error(error_code, vma))) { | 1176 | if (unlikely(access_error(error_code, vma))) { |
1177 | bad_area_access_error(regs, error_code, address); | 1177 | bad_area_access_error(regs, error_code, address); |
1178 | return; | 1178 | return; |
1179 | } | 1179 | } |
1180 | 1180 | ||
1181 | /* | 1181 | /* |
1182 | * If for any reason at all we couldn't handle the fault, | 1182 | * If for any reason at all we couldn't handle the fault, |
1183 | * make sure we exit gracefully rather than endlessly redo | 1183 | * make sure we exit gracefully rather than endlessly redo |
1184 | * the fault: | 1184 | * the fault: |
1185 | */ | 1185 | */ |
1186 | fault = handle_mm_fault(mm, vma, address, flags); | 1186 | fault = handle_mm_fault(mm, vma, address, flags); |
1187 | 1187 | ||
1188 | if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { | 1188 | if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { |
1189 | if (mm_fault_error(regs, error_code, address, fault)) | 1189 | if (mm_fault_error(regs, error_code, address, fault)) |
1190 | return; | 1190 | return; |
1191 | } | 1191 | } |
1192 | 1192 | ||
1193 | /* | 1193 | /* |
1194 | * Major/minor page fault accounting is only done on the | 1194 | * Major/minor page fault accounting is only done on the |
1195 | * initial attempt. If we go through a retry, it is extremely | 1195 | * initial attempt. If we go through a retry, it is extremely |
1196 | * likely that the page will be found in page cache at that point. | 1196 | * likely that the page will be found in page cache at that point. |
1197 | */ | 1197 | */ |
1198 | if (flags & FAULT_FLAG_ALLOW_RETRY) { | 1198 | if (flags & FAULT_FLAG_ALLOW_RETRY) { |
1199 | if (fault & VM_FAULT_MAJOR) { | 1199 | if (fault & VM_FAULT_MAJOR) { |
1200 | tsk->maj_flt++; | 1200 | tsk->maj_flt++; |
1201 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, | 1201 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, |
1202 | regs, address); | 1202 | regs, address); |
1203 | } else { | 1203 | } else { |
1204 | tsk->min_flt++; | 1204 | tsk->min_flt++; |
1205 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, | 1205 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, |
1206 | regs, address); | 1206 | regs, address); |
1207 | } | 1207 | } |
1208 | if (fault & VM_FAULT_RETRY) { | 1208 | if (fault & VM_FAULT_RETRY) { |
1209 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk | 1209 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk |
1210 | * of starvation. */ | 1210 | * of starvation. */ |
1211 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 1211 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
1212 | flags |= FAULT_FLAG_TRIED; | 1212 | flags |= FAULT_FLAG_TRIED; |
1213 | goto retry; | 1213 | goto retry; |
1214 | } | 1214 | } |
1215 | } | 1215 | } |
1216 | 1216 | ||
1217 | check_v8086_mode(regs, address, tsk); | 1217 | check_v8086_mode(regs, address, tsk); |
1218 | 1218 | ||
1219 | up_read(&mm->mmap_sem); | 1219 | up_read(&mm->mmap_sem); |
1220 | } | 1220 | } |
1221 | 1221 | ||
1222 | dotraplinkage void __kprobes | 1222 | dotraplinkage void __kprobes |
1223 | do_page_fault(struct pt_regs *regs, unsigned long error_code) | 1223 | do_page_fault(struct pt_regs *regs, unsigned long error_code) |
1224 | { | 1224 | { |
1225 | exception_enter(regs); | 1225 | exception_enter(regs); |
1226 | __do_page_fault(regs, error_code); | 1226 | __do_page_fault(regs, error_code); |
1227 | exception_exit(regs); | 1227 | exception_exit(regs); |
1228 | } | 1228 | } |
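
With exception_enter()/exception_exit() now coming from the generic header (see the include/linux/context_tracking.h hunk below) rather than from asm/context_tracking.h, the wrapper pattern shown in do_page_fault() above carries over unchanged to any other exception entry point. The sketch below is illustrative only and not part of this commit: the handler name do_example_trap and its body are hypothetical, and it assumes the file pulls the helpers in via <linux/context_tracking.h>.

#include <linux/context_tracking.h>	/* generic exception_enter()/exception_exit() */

/* Hypothetical handler, shown only to illustrate the pattern. */
dotraplinkage void __kprobes
do_example_trap(struct pt_regs *regs, long error_code)
{
	exception_enter(regs);	/* user_exit(): we may have interrupted user mode */

	/* ... arch-specific handling of the trap ... */

	exception_exit(regs);	/* user_enter() only if user_mode(regs) */
}

This is exactly the shape do_page_fault() takes above; the point of the generalization is that no architecture needs to carry its own copy of the enter/exit helpers.
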
include/linux/context_tracking.h
1 | #ifndef _LINUX_CONTEXT_TRACKING_H | 1 | #ifndef _LINUX_CONTEXT_TRACKING_H |
2 | #define _LINUX_CONTEXT_TRACKING_H | 2 | #define _LINUX_CONTEXT_TRACKING_H |
3 | 3 | ||
4 | #ifdef CONFIG_CONTEXT_TRACKING | ||
5 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
6 | #include <linux/percpu.h> | 5 | #include <linux/percpu.h> |
6 | #include <asm/ptrace.h> | ||
7 | 7 | ||
8 | #ifdef CONFIG_CONTEXT_TRACKING | ||
8 | struct context_tracking { | 9 | struct context_tracking { |
9 | /* | 10 | /* |
10 | * When active is false, probes are unset in order | 11 | * When active is false, probes are unset in order |
11 | * to minimize overhead: TIF flags are cleared | 12 | * to minimize overhead: TIF flags are cleared |
12 | * and calls to user_enter/exit are ignored. This | 13 | * and calls to user_enter/exit are ignored. This |
13 | * may be further optimized using static keys. | 14 | * may be further optimized using static keys. |
14 | */ | 15 | */ |
15 | bool active; | 16 | bool active; |
16 | enum { | 17 | enum { |
17 | IN_KERNEL = 0, | 18 | IN_KERNEL = 0, |
18 | IN_USER, | 19 | IN_USER, |
19 | } state; | 20 | } state; |
20 | }; | 21 | }; |
21 | 22 | ||
22 | DECLARE_PER_CPU(struct context_tracking, context_tracking); | 23 | DECLARE_PER_CPU(struct context_tracking, context_tracking); |
23 | 24 | ||
24 | static inline bool context_tracking_in_user(void) | 25 | static inline bool context_tracking_in_user(void) |
25 | { | 26 | { |
26 | return __this_cpu_read(context_tracking.state) == IN_USER; | 27 | return __this_cpu_read(context_tracking.state) == IN_USER; |
27 | } | 28 | } |
28 | 29 | ||
29 | static inline bool context_tracking_active(void) | 30 | static inline bool context_tracking_active(void) |
30 | { | 31 | { |
31 | return __this_cpu_read(context_tracking.active); | 32 | return __this_cpu_read(context_tracking.active); |
32 | } | 33 | } |
33 | 34 | ||
34 | extern void user_enter(void); | 35 | extern void user_enter(void); |
35 | extern void user_exit(void); | 36 | extern void user_exit(void); |
37 | |||
38 | static inline void exception_enter(struct pt_regs *regs) | ||
39 | { | ||
40 | user_exit(); | ||
41 | } | ||
42 | |||
43 | static inline void exception_exit(struct pt_regs *regs) | ||
44 | { | ||
45 | if (user_mode(regs)) | ||
46 | user_enter(); | ||
47 | } | ||
48 | |||
36 | extern void context_tracking_task_switch(struct task_struct *prev, | 49 | extern void context_tracking_task_switch(struct task_struct *prev, |
37 | struct task_struct *next); | 50 | struct task_struct *next); |
38 | #else | 51 | #else |
39 | static inline bool context_tracking_in_user(void) { return false; } | 52 | static inline bool context_tracking_in_user(void) { return false; } |
40 | static inline void user_enter(void) { } | 53 | static inline void user_enter(void) { } |
41 | static inline void user_exit(void) { } | 54 | static inline void user_exit(void) { } |
55 | static inline void exception_enter(struct pt_regs *regs) { } | ||
56 | static inline void exception_exit(struct pt_regs *regs) { } | ||
42 | static inline void context_tracking_task_switch(struct task_struct *prev, | 57 | static inline void context_tracking_task_switch(struct task_struct *prev, |
43 | struct task_struct *next) { } | 58 | struct task_struct *next) { } |
44 | #endif /* !CONFIG_CONTEXT_TRACKING */ | 59 | #endif /* !CONFIG_CONTEXT_TRACKING */ |
45 | 60 | ||
46 | #endif | 61 | #endif |
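
As a rough aid to reading the hunk above, here is a small stand-alone model, written for this note and not taken from the kernel, of the state transitions the generic helpers perform. It deliberately ignores the per-CPU storage and the active flag and replaces user_mode(regs) with a plain boolean, so it only approximates the real behaviour.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Model of the per-CPU context tracking state. */
enum ctx_state { IN_KERNEL = 0, IN_USER };
static enum ctx_state state = IN_KERNEL;

static void user_enter(void) { state = IN_USER; }
static void user_exit(void)  { state = IN_KERNEL; }

/* from_user models user_mode(regs): was the exception taken from user space? */
static void model_exception_enter(bool from_user) { (void)from_user; user_exit(); }
static void model_exception_exit(bool from_user)  { if (from_user) user_enter(); }

int main(void)
{
	/* Exception taken while running in user mode: we end back in IN_USER. */
	user_enter();
	model_exception_enter(true);
	assert(state == IN_KERNEL);	/* handler runs with the CPU tracked in-kernel */
	model_exception_exit(true);
	assert(state == IN_USER);

	/* Exception taken from kernel context: the state must stay IN_KERNEL. */
	model_exception_enter(false);
	model_exception_exit(false);
	assert(state == IN_KERNEL);

	puts("context tracking model: OK");
	return 0;
}

The second pair of calls shows why exception_exit() checks the saved registers: an exception that did not interrupt user mode must not flip the CPU back into the IN_USER state.
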