Commit 56dd9470d7c8734f055da2a6bac553caf4a468eb

Authored by Frederic Weisbecker
1 parent 6dbe51c251

context_tracking: Move exception handling to generic code

Exception handling with context tracking should share common
treatment: on entry we exit user mode if the exception was triggered
in that context. Then on exception exit we return to that previous
context.

Generalize this to avoid duplication across archs.
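
For reference, a minimal sketch of the shape the generalized helpers
take, modeled on the x86 inlines removed in the diff below; the generic
version is assumed to land in include/linux/context_tracking.h, which is
not among the files shown here, so the exact upstream form may differ:

    static inline void exception_enter(struct pt_regs *regs)
    {
            /* The exception may have interrupted user mode: note the kernel entry */
            user_exit();
    }

    static inline void exception_exit(struct pt_regs *regs)
    {
    #ifdef CONFIG_CONTEXT_TRACKING
            /* Resume user-mode tracking only if the exception came from user mode */
            if (user_mode(regs))
                    user_enter();
    #endif
    }

Each arch exception handler then brackets its body with
exception_enter(regs)/exception_exit(regs), as the traps.c and kvm.c
hunks below already do.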

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Li Zhong <zhong@linux.vnet.ibm.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Mats Liljegren <mats.liljegren@enea.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Showing 5 changed files with 19 additions and 26 deletions

arch/x86/include/asm/context_tracking.h
1 #ifndef _ASM_X86_CONTEXT_TRACKING_H 1 #ifndef _ASM_X86_CONTEXT_TRACKING_H
2 #define _ASM_X86_CONTEXT_TRACKING_H 2 #define _ASM_X86_CONTEXT_TRACKING_H
3 3
4 #ifndef __ASSEMBLY__
5 #include <linux/context_tracking.h>
6 #include <asm/ptrace.h>
7
8 static inline void exception_enter(struct pt_regs *regs)
9 {
10 user_exit();
11 }
12
13 static inline void exception_exit(struct pt_regs *regs)
14 {
15 #ifdef CONFIG_CONTEXT_TRACKING 4 #ifdef CONFIG_CONTEXT_TRACKING
16 if (user_mode(regs))
17 user_enter();
18 #endif
19 }
20
21 #else /* __ASSEMBLY__ */
22
23 #ifdef CONFIG_CONTEXT_TRACKING
24 # define SCHEDULE_USER call schedule_user 5 # define SCHEDULE_USER call schedule_user
25 #else 6 #else
26 # define SCHEDULE_USER call schedule 7 # define SCHEDULE_USER call schedule
27 #endif 8 #endif
28
29 #endif /* !__ASSEMBLY__ */
30 9
31 #endif 10 #endif
32 11
arch/x86/kernel/kvm.c
1 /* 1 /*
2 * KVM paravirt_ops implementation 2 * KVM paravirt_ops implementation
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or 6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version. 7 * (at your option) any later version.
8 * 8 *
9 * This program is distributed in the hope that it will be useful, 9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 16 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 * 17 *
18 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 18 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
19 * Copyright IBM Corporation, 2007 19 * Copyright IBM Corporation, 2007
20 * Authors: Anthony Liguori <aliguori@us.ibm.com> 20 * Authors: Anthony Liguori <aliguori@us.ibm.com>
21 */ 21 */
22 22
23 #include <linux/context_tracking.h>
23 #include <linux/module.h> 24 #include <linux/module.h>
24 #include <linux/kernel.h> 25 #include <linux/kernel.h>
25 #include <linux/kvm_para.h> 26 #include <linux/kvm_para.h>
26 #include <linux/cpu.h> 27 #include <linux/cpu.h>
27 #include <linux/mm.h> 28 #include <linux/mm.h>
28 #include <linux/highmem.h> 29 #include <linux/highmem.h>
29 #include <linux/hardirq.h> 30 #include <linux/hardirq.h>
30 #include <linux/notifier.h> 31 #include <linux/notifier.h>
31 #include <linux/reboot.h> 32 #include <linux/reboot.h>
32 #include <linux/hash.h> 33 #include <linux/hash.h>
33 #include <linux/sched.h> 34 #include <linux/sched.h>
34 #include <linux/slab.h> 35 #include <linux/slab.h>
35 #include <linux/kprobes.h> 36 #include <linux/kprobes.h>
36 #include <asm/timer.h> 37 #include <asm/timer.h>
37 #include <asm/cpu.h> 38 #include <asm/cpu.h>
38 #include <asm/traps.h> 39 #include <asm/traps.h>
39 #include <asm/desc.h> 40 #include <asm/desc.h>
40 #include <asm/tlbflush.h> 41 #include <asm/tlbflush.h>
41 #include <asm/idle.h> 42 #include <asm/idle.h>
42 #include <asm/apic.h> 43 #include <asm/apic.h>
43 #include <asm/apicdef.h> 44 #include <asm/apicdef.h>
44 #include <asm/hypervisor.h> 45 #include <asm/hypervisor.h>
45 #include <asm/kvm_guest.h> 46 #include <asm/kvm_guest.h>
46 #include <asm/context_tracking.h>
47 47
48 static int kvmapf = 1; 48 static int kvmapf = 1;
49 49
50 static int parse_no_kvmapf(char *arg) 50 static int parse_no_kvmapf(char *arg)
51 { 51 {
52 kvmapf = 0; 52 kvmapf = 0;
53 return 0; 53 return 0;
54 } 54 }
55 55
56 early_param("no-kvmapf", parse_no_kvmapf); 56 early_param("no-kvmapf", parse_no_kvmapf);
57 57
58 static int steal_acc = 1; 58 static int steal_acc = 1;
59 static int parse_no_stealacc(char *arg) 59 static int parse_no_stealacc(char *arg)
60 { 60 {
61 steal_acc = 0; 61 steal_acc = 0;
62 return 0; 62 return 0;
63 } 63 }
64 64
65 early_param("no-steal-acc", parse_no_stealacc); 65 early_param("no-steal-acc", parse_no_stealacc);
66 66
67 static int kvmclock_vsyscall = 1; 67 static int kvmclock_vsyscall = 1;
68 static int parse_no_kvmclock_vsyscall(char *arg) 68 static int parse_no_kvmclock_vsyscall(char *arg)
69 { 69 {
70 kvmclock_vsyscall = 0; 70 kvmclock_vsyscall = 0;
71 return 0; 71 return 0;
72 } 72 }
73 73
74 early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); 74 early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
75 75
76 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); 76 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
77 static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); 77 static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
78 static int has_steal_clock = 0; 78 static int has_steal_clock = 0;
79 79
80 /* 80 /*
81 * No need for any "IO delay" on KVM 81 * No need for any "IO delay" on KVM
82 */ 82 */
83 static void kvm_io_delay(void) 83 static void kvm_io_delay(void)
84 { 84 {
85 } 85 }
86 86
87 #define KVM_TASK_SLEEP_HASHBITS 8 87 #define KVM_TASK_SLEEP_HASHBITS 8
88 #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS) 88 #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
89 89
90 struct kvm_task_sleep_node { 90 struct kvm_task_sleep_node {
91 struct hlist_node link; 91 struct hlist_node link;
92 wait_queue_head_t wq; 92 wait_queue_head_t wq;
93 u32 token; 93 u32 token;
94 int cpu; 94 int cpu;
95 bool halted; 95 bool halted;
96 }; 96 };
97 97
98 static struct kvm_task_sleep_head { 98 static struct kvm_task_sleep_head {
99 spinlock_t lock; 99 spinlock_t lock;
100 struct hlist_head list; 100 struct hlist_head list;
101 } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; 101 } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
102 102
103 static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, 103 static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
104 u32 token) 104 u32 token)
105 { 105 {
106 struct hlist_node *p; 106 struct hlist_node *p;
107 107
108 hlist_for_each(p, &b->list) { 108 hlist_for_each(p, &b->list) {
109 struct kvm_task_sleep_node *n = 109 struct kvm_task_sleep_node *n =
110 hlist_entry(p, typeof(*n), link); 110 hlist_entry(p, typeof(*n), link);
111 if (n->token == token) 111 if (n->token == token)
112 return n; 112 return n;
113 } 113 }
114 114
115 return NULL; 115 return NULL;
116 } 116 }
117 117
118 void kvm_async_pf_task_wait(u32 token) 118 void kvm_async_pf_task_wait(u32 token)
119 { 119 {
120 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); 120 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
121 struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; 121 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
122 struct kvm_task_sleep_node n, *e; 122 struct kvm_task_sleep_node n, *e;
123 DEFINE_WAIT(wait); 123 DEFINE_WAIT(wait);
124 124
125 rcu_irq_enter(); 125 rcu_irq_enter();
126 126
127 spin_lock(&b->lock); 127 spin_lock(&b->lock);
128 e = _find_apf_task(b, token); 128 e = _find_apf_task(b, token);
129 if (e) { 129 if (e) {
130 /* dummy entry exist -> wake up was delivered ahead of PF */ 130 /* dummy entry exist -> wake up was delivered ahead of PF */
131 hlist_del(&e->link); 131 hlist_del(&e->link);
132 kfree(e); 132 kfree(e);
133 spin_unlock(&b->lock); 133 spin_unlock(&b->lock);
134 134
135 rcu_irq_exit(); 135 rcu_irq_exit();
136 return; 136 return;
137 } 137 }
138 138
139 n.token = token; 139 n.token = token;
140 n.cpu = smp_processor_id(); 140 n.cpu = smp_processor_id();
141 n.halted = is_idle_task(current) || preempt_count() > 1; 141 n.halted = is_idle_task(current) || preempt_count() > 1;
142 init_waitqueue_head(&n.wq); 142 init_waitqueue_head(&n.wq);
143 hlist_add_head(&n.link, &b->list); 143 hlist_add_head(&n.link, &b->list);
144 spin_unlock(&b->lock); 144 spin_unlock(&b->lock);
145 145
146 for (;;) { 146 for (;;) {
147 if (!n.halted) 147 if (!n.halted)
148 prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); 148 prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
149 if (hlist_unhashed(&n.link)) 149 if (hlist_unhashed(&n.link))
150 break; 150 break;
151 151
152 if (!n.halted) { 152 if (!n.halted) {
153 local_irq_enable(); 153 local_irq_enable();
154 schedule(); 154 schedule();
155 local_irq_disable(); 155 local_irq_disable();
156 } else { 156 } else {
157 /* 157 /*
158 * We cannot reschedule. So halt. 158 * We cannot reschedule. So halt.
159 */ 159 */
160 rcu_irq_exit(); 160 rcu_irq_exit();
161 native_safe_halt(); 161 native_safe_halt();
162 rcu_irq_enter(); 162 rcu_irq_enter();
163 local_irq_disable(); 163 local_irq_disable();
164 } 164 }
165 } 165 }
166 if (!n.halted) 166 if (!n.halted)
167 finish_wait(&n.wq, &wait); 167 finish_wait(&n.wq, &wait);
168 168
169 rcu_irq_exit(); 169 rcu_irq_exit();
170 return; 170 return;
171 } 171 }
172 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); 172 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
173 173
174 static void apf_task_wake_one(struct kvm_task_sleep_node *n) 174 static void apf_task_wake_one(struct kvm_task_sleep_node *n)
175 { 175 {
176 hlist_del_init(&n->link); 176 hlist_del_init(&n->link);
177 if (n->halted) 177 if (n->halted)
178 smp_send_reschedule(n->cpu); 178 smp_send_reschedule(n->cpu);
179 else if (waitqueue_active(&n->wq)) 179 else if (waitqueue_active(&n->wq))
180 wake_up(&n->wq); 180 wake_up(&n->wq);
181 } 181 }
182 182
183 static void apf_task_wake_all(void) 183 static void apf_task_wake_all(void)
184 { 184 {
185 int i; 185 int i;
186 186
187 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { 187 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
188 struct hlist_node *p, *next; 188 struct hlist_node *p, *next;
189 struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; 189 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
190 spin_lock(&b->lock); 190 spin_lock(&b->lock);
191 hlist_for_each_safe(p, next, &b->list) { 191 hlist_for_each_safe(p, next, &b->list) {
192 struct kvm_task_sleep_node *n = 192 struct kvm_task_sleep_node *n =
193 hlist_entry(p, typeof(*n), link); 193 hlist_entry(p, typeof(*n), link);
194 if (n->cpu == smp_processor_id()) 194 if (n->cpu == smp_processor_id())
195 apf_task_wake_one(n); 195 apf_task_wake_one(n);
196 } 196 }
197 spin_unlock(&b->lock); 197 spin_unlock(&b->lock);
198 } 198 }
199 } 199 }
200 200
201 void kvm_async_pf_task_wake(u32 token) 201 void kvm_async_pf_task_wake(u32 token)
202 { 202 {
203 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); 203 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
204 struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; 204 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
205 struct kvm_task_sleep_node *n; 205 struct kvm_task_sleep_node *n;
206 206
207 if (token == ~0) { 207 if (token == ~0) {
208 apf_task_wake_all(); 208 apf_task_wake_all();
209 return; 209 return;
210 } 210 }
211 211
212 again: 212 again:
213 spin_lock(&b->lock); 213 spin_lock(&b->lock);
214 n = _find_apf_task(b, token); 214 n = _find_apf_task(b, token);
215 if (!n) { 215 if (!n) {
216 /* 216 /*
217 * async PF was not yet handled. 217 * async PF was not yet handled.
218 * Add dummy entry for the token. 218 * Add dummy entry for the token.
219 */ 219 */
220 n = kzalloc(sizeof(*n), GFP_ATOMIC); 220 n = kzalloc(sizeof(*n), GFP_ATOMIC);
221 if (!n) { 221 if (!n) {
222 /* 222 /*
223 * Allocation failed! Busy wait while other cpu 223 * Allocation failed! Busy wait while other cpu
224 * handles async PF. 224 * handles async PF.
225 */ 225 */
226 spin_unlock(&b->lock); 226 spin_unlock(&b->lock);
227 cpu_relax(); 227 cpu_relax();
228 goto again; 228 goto again;
229 } 229 }
230 n->token = token; 230 n->token = token;
231 n->cpu = smp_processor_id(); 231 n->cpu = smp_processor_id();
232 init_waitqueue_head(&n->wq); 232 init_waitqueue_head(&n->wq);
233 hlist_add_head(&n->link, &b->list); 233 hlist_add_head(&n->link, &b->list);
234 } else 234 } else
235 apf_task_wake_one(n); 235 apf_task_wake_one(n);
236 spin_unlock(&b->lock); 236 spin_unlock(&b->lock);
237 return; 237 return;
238 } 238 }
239 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); 239 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
240 240
241 u32 kvm_read_and_reset_pf_reason(void) 241 u32 kvm_read_and_reset_pf_reason(void)
242 { 242 {
243 u32 reason = 0; 243 u32 reason = 0;
244 244
245 if (__get_cpu_var(apf_reason).enabled) { 245 if (__get_cpu_var(apf_reason).enabled) {
246 reason = __get_cpu_var(apf_reason).reason; 246 reason = __get_cpu_var(apf_reason).reason;
247 __get_cpu_var(apf_reason).reason = 0; 247 __get_cpu_var(apf_reason).reason = 0;
248 } 248 }
249 249
250 return reason; 250 return reason;
251 } 251 }
252 EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); 252 EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
253 253
254 dotraplinkage void __kprobes 254 dotraplinkage void __kprobes
255 do_async_page_fault(struct pt_regs *regs, unsigned long error_code) 255 do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
256 { 256 {
257 switch (kvm_read_and_reset_pf_reason()) { 257 switch (kvm_read_and_reset_pf_reason()) {
258 default: 258 default:
259 do_page_fault(regs, error_code); 259 do_page_fault(regs, error_code);
260 break; 260 break;
261 case KVM_PV_REASON_PAGE_NOT_PRESENT: 261 case KVM_PV_REASON_PAGE_NOT_PRESENT:
262 /* page is swapped out by the host. */ 262 /* page is swapped out by the host. */
263 exception_enter(regs); 263 exception_enter(regs);
264 exit_idle(); 264 exit_idle();
265 kvm_async_pf_task_wait((u32)read_cr2()); 265 kvm_async_pf_task_wait((u32)read_cr2());
266 exception_exit(regs); 266 exception_exit(regs);
267 break; 267 break;
268 case KVM_PV_REASON_PAGE_READY: 268 case KVM_PV_REASON_PAGE_READY:
269 rcu_irq_enter(); 269 rcu_irq_enter();
270 exit_idle(); 270 exit_idle();
271 kvm_async_pf_task_wake((u32)read_cr2()); 271 kvm_async_pf_task_wake((u32)read_cr2());
272 rcu_irq_exit(); 272 rcu_irq_exit();
273 break; 273 break;
274 } 274 }
275 } 275 }
276 276
277 static void __init paravirt_ops_setup(void) 277 static void __init paravirt_ops_setup(void)
278 { 278 {
279 pv_info.name = "KVM"; 279 pv_info.name = "KVM";
280 pv_info.paravirt_enabled = 1; 280 pv_info.paravirt_enabled = 1;
281 281
282 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) 282 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
283 pv_cpu_ops.io_delay = kvm_io_delay; 283 pv_cpu_ops.io_delay = kvm_io_delay;
284 284
285 #ifdef CONFIG_X86_IO_APIC 285 #ifdef CONFIG_X86_IO_APIC
286 no_timer_check = 1; 286 no_timer_check = 1;
287 #endif 287 #endif
288 } 288 }
289 289
290 static void kvm_register_steal_time(void) 290 static void kvm_register_steal_time(void)
291 { 291 {
292 int cpu = smp_processor_id(); 292 int cpu = smp_processor_id();
293 struct kvm_steal_time *st = &per_cpu(steal_time, cpu); 293 struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
294 294
295 if (!has_steal_clock) 295 if (!has_steal_clock)
296 return; 296 return;
297 297
298 memset(st, 0, sizeof(*st)); 298 memset(st, 0, sizeof(*st));
299 299
300 wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); 300 wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
301 pr_info("kvm-stealtime: cpu %d, msr %llx\n", 301 pr_info("kvm-stealtime: cpu %d, msr %llx\n",
302 cpu, (unsigned long long) slow_virt_to_phys(st)); 302 cpu, (unsigned long long) slow_virt_to_phys(st));
303 } 303 }
304 304
305 static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; 305 static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
306 306
307 static void kvm_guest_apic_eoi_write(u32 reg, u32 val) 307 static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
308 { 308 {
309 /** 309 /**
310 * This relies on __test_and_clear_bit to modify the memory 310 * This relies on __test_and_clear_bit to modify the memory
311 * in a way that is atomic with respect to the local CPU. 311 * in a way that is atomic with respect to the local CPU.
312 * The hypervisor only accesses this memory from the local CPU so 312 * The hypervisor only accesses this memory from the local CPU so
313 * there's no need for lock or memory barriers. 313 * there's no need for lock or memory barriers.
314 * An optimization barrier is implied in apic write. 314 * An optimization barrier is implied in apic write.
315 */ 315 */
316 if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) 316 if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi)))
317 return; 317 return;
318 apic_write(APIC_EOI, APIC_EOI_ACK); 318 apic_write(APIC_EOI, APIC_EOI_ACK);
319 } 319 }
320 320
321 void __cpuinit kvm_guest_cpu_init(void) 321 void __cpuinit kvm_guest_cpu_init(void)
322 { 322 {
323 if (!kvm_para_available()) 323 if (!kvm_para_available())
324 return; 324 return;
325 325
326 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { 326 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
327 u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason)); 327 u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason));
328 328
329 #ifdef CONFIG_PREEMPT 329 #ifdef CONFIG_PREEMPT
330 pa |= KVM_ASYNC_PF_SEND_ALWAYS; 330 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
331 #endif 331 #endif
332 wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); 332 wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
333 __get_cpu_var(apf_reason).enabled = 1; 333 __get_cpu_var(apf_reason).enabled = 1;
334 printk(KERN_INFO"KVM setup async PF for cpu %d\n", 334 printk(KERN_INFO"KVM setup async PF for cpu %d\n",
335 smp_processor_id()); 335 smp_processor_id());
336 } 336 }
337 337
338 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { 338 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
339 unsigned long pa; 339 unsigned long pa;
340 /* Size alignment is implied but just to make it explicit. */ 340 /* Size alignment is implied but just to make it explicit. */
341 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); 341 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
342 __get_cpu_var(kvm_apic_eoi) = 0; 342 __get_cpu_var(kvm_apic_eoi) = 0;
343 pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi)) 343 pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi))
344 | KVM_MSR_ENABLED; 344 | KVM_MSR_ENABLED;
345 wrmsrl(MSR_KVM_PV_EOI_EN, pa); 345 wrmsrl(MSR_KVM_PV_EOI_EN, pa);
346 } 346 }
347 347
348 if (has_steal_clock) 348 if (has_steal_clock)
349 kvm_register_steal_time(); 349 kvm_register_steal_time();
350 } 350 }
351 351
352 static void kvm_pv_disable_apf(void) 352 static void kvm_pv_disable_apf(void)
353 { 353 {
354 if (!__get_cpu_var(apf_reason).enabled) 354 if (!__get_cpu_var(apf_reason).enabled)
355 return; 355 return;
356 356
357 wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); 357 wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
358 __get_cpu_var(apf_reason).enabled = 0; 358 __get_cpu_var(apf_reason).enabled = 0;
359 359
360 printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", 360 printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
361 smp_processor_id()); 361 smp_processor_id());
362 } 362 }
363 363
364 static void kvm_pv_guest_cpu_reboot(void *unused) 364 static void kvm_pv_guest_cpu_reboot(void *unused)
365 { 365 {
366 /* 366 /*
367 * We disable PV EOI before we load a new kernel by kexec, 367 * We disable PV EOI before we load a new kernel by kexec,
368 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. 368 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
369 * New kernel can re-enable when it boots. 369 * New kernel can re-enable when it boots.
370 */ 370 */
371 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 371 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
372 wrmsrl(MSR_KVM_PV_EOI_EN, 0); 372 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
373 kvm_pv_disable_apf(); 373 kvm_pv_disable_apf();
374 kvm_disable_steal_time(); 374 kvm_disable_steal_time();
375 } 375 }
376 376
377 static int kvm_pv_reboot_notify(struct notifier_block *nb, 377 static int kvm_pv_reboot_notify(struct notifier_block *nb,
378 unsigned long code, void *unused) 378 unsigned long code, void *unused)
379 { 379 {
380 if (code == SYS_RESTART) 380 if (code == SYS_RESTART)
381 on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); 381 on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
382 return NOTIFY_DONE; 382 return NOTIFY_DONE;
383 } 383 }
384 384
385 static struct notifier_block kvm_pv_reboot_nb = { 385 static struct notifier_block kvm_pv_reboot_nb = {
386 .notifier_call = kvm_pv_reboot_notify, 386 .notifier_call = kvm_pv_reboot_notify,
387 }; 387 };
388 388
389 static u64 kvm_steal_clock(int cpu) 389 static u64 kvm_steal_clock(int cpu)
390 { 390 {
391 u64 steal; 391 u64 steal;
392 struct kvm_steal_time *src; 392 struct kvm_steal_time *src;
393 int version; 393 int version;
394 394
395 src = &per_cpu(steal_time, cpu); 395 src = &per_cpu(steal_time, cpu);
396 do { 396 do {
397 version = src->version; 397 version = src->version;
398 rmb(); 398 rmb();
399 steal = src->steal; 399 steal = src->steal;
400 rmb(); 400 rmb();
401 } while ((version & 1) || (version != src->version)); 401 } while ((version & 1) || (version != src->version));
402 402
403 return steal; 403 return steal;
404 } 404 }
405 405
406 void kvm_disable_steal_time(void) 406 void kvm_disable_steal_time(void)
407 { 407 {
408 if (!has_steal_clock) 408 if (!has_steal_clock)
409 return; 409 return;
410 410
411 wrmsr(MSR_KVM_STEAL_TIME, 0, 0); 411 wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
412 } 412 }
413 413
414 #ifdef CONFIG_SMP 414 #ifdef CONFIG_SMP
415 static void __init kvm_smp_prepare_boot_cpu(void) 415 static void __init kvm_smp_prepare_boot_cpu(void)
416 { 416 {
417 WARN_ON(kvm_register_clock("primary cpu clock")); 417 WARN_ON(kvm_register_clock("primary cpu clock"));
418 kvm_guest_cpu_init(); 418 kvm_guest_cpu_init();
419 native_smp_prepare_boot_cpu(); 419 native_smp_prepare_boot_cpu();
420 } 420 }
421 421
422 static void __cpuinit kvm_guest_cpu_online(void *dummy) 422 static void __cpuinit kvm_guest_cpu_online(void *dummy)
423 { 423 {
424 kvm_guest_cpu_init(); 424 kvm_guest_cpu_init();
425 } 425 }
426 426
427 static void kvm_guest_cpu_offline(void *dummy) 427 static void kvm_guest_cpu_offline(void *dummy)
428 { 428 {
429 kvm_disable_steal_time(); 429 kvm_disable_steal_time();
430 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 430 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
431 wrmsrl(MSR_KVM_PV_EOI_EN, 0); 431 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
432 kvm_pv_disable_apf(); 432 kvm_pv_disable_apf();
433 apf_task_wake_all(); 433 apf_task_wake_all();
434 } 434 }
435 435
436 static int __cpuinit kvm_cpu_notify(struct notifier_block *self, 436 static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
437 unsigned long action, void *hcpu) 437 unsigned long action, void *hcpu)
438 { 438 {
439 int cpu = (unsigned long)hcpu; 439 int cpu = (unsigned long)hcpu;
440 switch (action) { 440 switch (action) {
441 case CPU_ONLINE: 441 case CPU_ONLINE:
442 case CPU_DOWN_FAILED: 442 case CPU_DOWN_FAILED:
443 case CPU_ONLINE_FROZEN: 443 case CPU_ONLINE_FROZEN:
444 smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0); 444 smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
445 break; 445 break;
446 case CPU_DOWN_PREPARE: 446 case CPU_DOWN_PREPARE:
447 case CPU_DOWN_PREPARE_FROZEN: 447 case CPU_DOWN_PREPARE_FROZEN:
448 smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1); 448 smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
449 break; 449 break;
450 default: 450 default:
451 break; 451 break;
452 } 452 }
453 return NOTIFY_OK; 453 return NOTIFY_OK;
454 } 454 }
455 455
456 static struct notifier_block __cpuinitdata kvm_cpu_notifier = { 456 static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
457 .notifier_call = kvm_cpu_notify, 457 .notifier_call = kvm_cpu_notify,
458 }; 458 };
459 #endif 459 #endif
460 460
461 static void __init kvm_apf_trap_init(void) 461 static void __init kvm_apf_trap_init(void)
462 { 462 {
463 set_intr_gate(14, &async_page_fault); 463 set_intr_gate(14, &async_page_fault);
464 } 464 }
465 465
466 void __init kvm_guest_init(void) 466 void __init kvm_guest_init(void)
467 { 467 {
468 int i; 468 int i;
469 469
470 if (!kvm_para_available()) 470 if (!kvm_para_available())
471 return; 471 return;
472 472
473 paravirt_ops_setup(); 473 paravirt_ops_setup();
474 register_reboot_notifier(&kvm_pv_reboot_nb); 474 register_reboot_notifier(&kvm_pv_reboot_nb);
475 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) 475 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
476 spin_lock_init(&async_pf_sleepers[i].lock); 476 spin_lock_init(&async_pf_sleepers[i].lock);
477 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) 477 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
478 x86_init.irqs.trap_init = kvm_apf_trap_init; 478 x86_init.irqs.trap_init = kvm_apf_trap_init;
479 479
480 if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { 480 if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
481 has_steal_clock = 1; 481 has_steal_clock = 1;
482 pv_time_ops.steal_clock = kvm_steal_clock; 482 pv_time_ops.steal_clock = kvm_steal_clock;
483 } 483 }
484 484
485 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 485 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
486 apic_set_eoi_write(kvm_guest_apic_eoi_write); 486 apic_set_eoi_write(kvm_guest_apic_eoi_write);
487 487
488 if (kvmclock_vsyscall) 488 if (kvmclock_vsyscall)
489 kvm_setup_vsyscall_timeinfo(); 489 kvm_setup_vsyscall_timeinfo();
490 490
491 #ifdef CONFIG_SMP 491 #ifdef CONFIG_SMP
492 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 492 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
493 register_cpu_notifier(&kvm_cpu_notifier); 493 register_cpu_notifier(&kvm_cpu_notifier);
494 #else 494 #else
495 kvm_guest_cpu_init(); 495 kvm_guest_cpu_init();
496 #endif 496 #endif
497 } 497 }
498 498
499 static bool __init kvm_detect(void) 499 static bool __init kvm_detect(void)
500 { 500 {
501 if (!kvm_para_available()) 501 if (!kvm_para_available())
502 return false; 502 return false;
503 return true; 503 return true;
504 } 504 }
505 505
506 const struct hypervisor_x86 x86_hyper_kvm __refconst = { 506 const struct hypervisor_x86 x86_hyper_kvm __refconst = {
507 .name = "KVM", 507 .name = "KVM",
508 .detect = kvm_detect, 508 .detect = kvm_detect,
509 .x2apic_available = kvm_para_available, 509 .x2apic_available = kvm_para_available,
510 }; 510 };
511 EXPORT_SYMBOL_GPL(x86_hyper_kvm); 511 EXPORT_SYMBOL_GPL(x86_hyper_kvm);
512 512
513 static __init int activate_jump_labels(void) 513 static __init int activate_jump_labels(void)
514 { 514 {
515 if (has_steal_clock) { 515 if (has_steal_clock) {
516 static_key_slow_inc(&paravirt_steal_enabled); 516 static_key_slow_inc(&paravirt_steal_enabled);
517 if (steal_acc) 517 if (steal_acc)
518 static_key_slow_inc(&paravirt_steal_rq_enabled); 518 static_key_slow_inc(&paravirt_steal_rq_enabled);
519 } 519 }
520 520
521 return 0; 521 return 0;
522 } 522 }
523 arch_initcall(activate_jump_labels); 523 arch_initcall(activate_jump_labels);
arch/x86/kernel/traps.c
1 /* 1 /*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs 3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 * 4 *
5 * Pentium III FXSR, SSE support 5 * Pentium III FXSR, SSE support
6 * Gareth Hughes <gareth@valinux.com>, May 2000 6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 */ 7 */
8 8
9 /* 9 /*
10 * Handle hardware traps and faults. 10 * Handle hardware traps and faults.
11 */ 11 */
12 12
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 14
15 #include <linux/context_tracking.h>
15 #include <linux/interrupt.h> 16 #include <linux/interrupt.h>
16 #include <linux/kallsyms.h> 17 #include <linux/kallsyms.h>
17 #include <linux/spinlock.h> 18 #include <linux/spinlock.h>
18 #include <linux/kprobes.h> 19 #include <linux/kprobes.h>
19 #include <linux/uaccess.h> 20 #include <linux/uaccess.h>
20 #include <linux/kdebug.h> 21 #include <linux/kdebug.h>
21 #include <linux/kgdb.h> 22 #include <linux/kgdb.h>
22 #include <linux/kernel.h> 23 #include <linux/kernel.h>
23 #include <linux/module.h> 24 #include <linux/module.h>
24 #include <linux/ptrace.h> 25 #include <linux/ptrace.h>
25 #include <linux/string.h> 26 #include <linux/string.h>
26 #include <linux/delay.h> 27 #include <linux/delay.h>
27 #include <linux/errno.h> 28 #include <linux/errno.h>
28 #include <linux/kexec.h> 29 #include <linux/kexec.h>
29 #include <linux/sched.h> 30 #include <linux/sched.h>
30 #include <linux/timer.h> 31 #include <linux/timer.h>
31 #include <linux/init.h> 32 #include <linux/init.h>
32 #include <linux/bug.h> 33 #include <linux/bug.h>
33 #include <linux/nmi.h> 34 #include <linux/nmi.h>
34 #include <linux/mm.h> 35 #include <linux/mm.h>
35 #include <linux/smp.h> 36 #include <linux/smp.h>
36 #include <linux/io.h> 37 #include <linux/io.h>
37 38
38 #ifdef CONFIG_EISA 39 #ifdef CONFIG_EISA
39 #include <linux/ioport.h> 40 #include <linux/ioport.h>
40 #include <linux/eisa.h> 41 #include <linux/eisa.h>
41 #endif 42 #endif
42 43
43 #if defined(CONFIG_EDAC) 44 #if defined(CONFIG_EDAC)
44 #include <linux/edac.h> 45 #include <linux/edac.h>
45 #endif 46 #endif
46 47
47 #include <asm/kmemcheck.h> 48 #include <asm/kmemcheck.h>
48 #include <asm/stacktrace.h> 49 #include <asm/stacktrace.h>
49 #include <asm/processor.h> 50 #include <asm/processor.h>
50 #include <asm/debugreg.h> 51 #include <asm/debugreg.h>
51 #include <linux/atomic.h> 52 #include <linux/atomic.h>
52 #include <asm/ftrace.h> 53 #include <asm/ftrace.h>
53 #include <asm/traps.h> 54 #include <asm/traps.h>
54 #include <asm/desc.h> 55 #include <asm/desc.h>
55 #include <asm/i387.h> 56 #include <asm/i387.h>
56 #include <asm/fpu-internal.h> 57 #include <asm/fpu-internal.h>
57 #include <asm/mce.h> 58 #include <asm/mce.h>
58 #include <asm/context_tracking.h>
59
60 #include <asm/mach_traps.h> 59 #include <asm/mach_traps.h>
61 60
62 #ifdef CONFIG_X86_64 61 #ifdef CONFIG_X86_64
63 #include <asm/x86_init.h> 62 #include <asm/x86_init.h>
64 #include <asm/pgalloc.h> 63 #include <asm/pgalloc.h>
65 #include <asm/proto.h> 64 #include <asm/proto.h>
66 #else 65 #else
67 #include <asm/processor-flags.h> 66 #include <asm/processor-flags.h>
68 #include <asm/setup.h> 67 #include <asm/setup.h>
69 68
70 asmlinkage int system_call(void); 69 asmlinkage int system_call(void);
71 70
72 /* 71 /*
73 * The IDT has to be page-aligned to simplify the Pentium 72 * The IDT has to be page-aligned to simplify the Pentium
74 * F0 0F bug workaround. 73 * F0 0F bug workaround.
75 */ 74 */
76 gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; 75 gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, };
77 #endif 76 #endif
78 77
79 DECLARE_BITMAP(used_vectors, NR_VECTORS); 78 DECLARE_BITMAP(used_vectors, NR_VECTORS);
80 EXPORT_SYMBOL_GPL(used_vectors); 79 EXPORT_SYMBOL_GPL(used_vectors);
81 80
82 static inline void conditional_sti(struct pt_regs *regs) 81 static inline void conditional_sti(struct pt_regs *regs)
83 { 82 {
84 if (regs->flags & X86_EFLAGS_IF) 83 if (regs->flags & X86_EFLAGS_IF)
85 local_irq_enable(); 84 local_irq_enable();
86 } 85 }
87 86
88 static inline void preempt_conditional_sti(struct pt_regs *regs) 87 static inline void preempt_conditional_sti(struct pt_regs *regs)
89 { 88 {
90 inc_preempt_count(); 89 inc_preempt_count();
91 if (regs->flags & X86_EFLAGS_IF) 90 if (regs->flags & X86_EFLAGS_IF)
92 local_irq_enable(); 91 local_irq_enable();
93 } 92 }
94 93
95 static inline void conditional_cli(struct pt_regs *regs) 94 static inline void conditional_cli(struct pt_regs *regs)
96 { 95 {
97 if (regs->flags & X86_EFLAGS_IF) 96 if (regs->flags & X86_EFLAGS_IF)
98 local_irq_disable(); 97 local_irq_disable();
99 } 98 }
100 99
101 static inline void preempt_conditional_cli(struct pt_regs *regs) 100 static inline void preempt_conditional_cli(struct pt_regs *regs)
102 { 101 {
103 if (regs->flags & X86_EFLAGS_IF) 102 if (regs->flags & X86_EFLAGS_IF)
104 local_irq_disable(); 103 local_irq_disable();
105 dec_preempt_count(); 104 dec_preempt_count();
106 } 105 }
107 106
108 static int __kprobes 107 static int __kprobes
109 do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, 108 do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
110 struct pt_regs *regs, long error_code) 109 struct pt_regs *regs, long error_code)
111 { 110 {
112 #ifdef CONFIG_X86_32 111 #ifdef CONFIG_X86_32
113 if (regs->flags & X86_VM_MASK) { 112 if (regs->flags & X86_VM_MASK) {
114 /* 113 /*
115 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. 114 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
116 * On nmi (interrupt 2), do_trap should not be called. 115 * On nmi (interrupt 2), do_trap should not be called.
117 */ 116 */
118 if (trapnr < X86_TRAP_UD) { 117 if (trapnr < X86_TRAP_UD) {
119 if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, 118 if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
120 error_code, trapnr)) 119 error_code, trapnr))
121 return 0; 120 return 0;
122 } 121 }
123 return -1; 122 return -1;
124 } 123 }
125 #endif 124 #endif
126 if (!user_mode(regs)) { 125 if (!user_mode(regs)) {
127 if (!fixup_exception(regs)) { 126 if (!fixup_exception(regs)) {
128 tsk->thread.error_code = error_code; 127 tsk->thread.error_code = error_code;
129 tsk->thread.trap_nr = trapnr; 128 tsk->thread.trap_nr = trapnr;
130 die(str, regs, error_code); 129 die(str, regs, error_code);
131 } 130 }
132 return 0; 131 return 0;
133 } 132 }
134 133
135 return -1; 134 return -1;
136 } 135 }
137 136
138 static void __kprobes 137 static void __kprobes
139 do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, 138 do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
140 long error_code, siginfo_t *info) 139 long error_code, siginfo_t *info)
141 { 140 {
142 struct task_struct *tsk = current; 141 struct task_struct *tsk = current;
143 142
144 143
145 if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) 144 if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
146 return; 145 return;
147 /* 146 /*
148 * We want error_code and trap_nr set for userspace faults and 147 * We want error_code and trap_nr set for userspace faults and
149 * kernelspace faults which result in die(), but not 148 * kernelspace faults which result in die(), but not
150 * kernelspace faults which are fixed up. die() gives the 149 * kernelspace faults which are fixed up. die() gives the
151 * process no chance to handle the signal and notice the 150 * process no chance to handle the signal and notice the
152 * kernel fault information, so that won't result in polluting 151 * kernel fault information, so that won't result in polluting
153 * the information about previously queued, but not yet 152 * the information about previously queued, but not yet
154 * delivered, faults. See also do_general_protection below. 153 * delivered, faults. See also do_general_protection below.
155 */ 154 */
156 tsk->thread.error_code = error_code; 155 tsk->thread.error_code = error_code;
157 tsk->thread.trap_nr = trapnr; 156 tsk->thread.trap_nr = trapnr;
158 157
159 #ifdef CONFIG_X86_64 158 #ifdef CONFIG_X86_64
160 if (show_unhandled_signals && unhandled_signal(tsk, signr) && 159 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
161 printk_ratelimit()) { 160 printk_ratelimit()) {
162 pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", 161 pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
163 tsk->comm, tsk->pid, str, 162 tsk->comm, tsk->pid, str,
164 regs->ip, regs->sp, error_code); 163 regs->ip, regs->sp, error_code);
165 print_vma_addr(" in ", regs->ip); 164 print_vma_addr(" in ", regs->ip);
166 pr_cont("\n"); 165 pr_cont("\n");
167 } 166 }
168 #endif 167 #endif
169 168
170 if (info) 169 if (info)
171 force_sig_info(signr, info, tsk); 170 force_sig_info(signr, info, tsk);
172 else 171 else
173 force_sig(signr, tsk); 172 force_sig(signr, tsk);
174 } 173 }
175 174
176 #define DO_ERROR(trapnr, signr, str, name) \ 175 #define DO_ERROR(trapnr, signr, str, name) \
177 dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ 176 dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
178 { \ 177 { \
179 exception_enter(regs); \ 178 exception_enter(regs); \
180 if (notify_die(DIE_TRAP, str, regs, error_code, \ 179 if (notify_die(DIE_TRAP, str, regs, error_code, \
181 trapnr, signr) == NOTIFY_STOP) { \ 180 trapnr, signr) == NOTIFY_STOP) { \
182 exception_exit(regs); \ 181 exception_exit(regs); \
183 return; \ 182 return; \
184 } \ 183 } \
185 conditional_sti(regs); \ 184 conditional_sti(regs); \
186 do_trap(trapnr, signr, str, regs, error_code, NULL); \ 185 do_trap(trapnr, signr, str, regs, error_code, NULL); \
187 exception_exit(regs); \ 186 exception_exit(regs); \
188 } 187 }
189 188
190 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 189 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
191 dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ 190 dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
192 { \ 191 { \
193 siginfo_t info; \ 192 siginfo_t info; \
194 info.si_signo = signr; \ 193 info.si_signo = signr; \
195 info.si_errno = 0; \ 194 info.si_errno = 0; \
196 info.si_code = sicode; \ 195 info.si_code = sicode; \
197 info.si_addr = (void __user *)siaddr; \ 196 info.si_addr = (void __user *)siaddr; \
198 exception_enter(regs); \ 197 exception_enter(regs); \
199 if (notify_die(DIE_TRAP, str, regs, error_code, \ 198 if (notify_die(DIE_TRAP, str, regs, error_code, \
200 trapnr, signr) == NOTIFY_STOP) { \ 199 trapnr, signr) == NOTIFY_STOP) { \
201 exception_exit(regs); \ 200 exception_exit(regs); \
202 return; \ 201 return; \
203 } \ 202 } \
204 conditional_sti(regs); \ 203 conditional_sti(regs); \
205 do_trap(trapnr, signr, str, regs, error_code, &info); \ 204 do_trap(trapnr, signr, str, regs, error_code, &info); \
206 exception_exit(regs); \ 205 exception_exit(regs); \
207 } 206 }
208 207
209 DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, 208 DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV,
210 regs->ip) 209 regs->ip)
211 DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) 210 DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
212 DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) 211 DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds)
213 DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, 212 DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN,
214 regs->ip) 213 regs->ip)
215 DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", 214 DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",
216 coprocessor_segment_overrun) 215 coprocessor_segment_overrun)
217 DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) 216 DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
218 DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) 217 DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
219 #ifdef CONFIG_X86_32 218 #ifdef CONFIG_X86_32
220 DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) 219 DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
221 #endif 220 #endif
222 DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, 221 DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,
223 BUS_ADRALN, 0) 222 BUS_ADRALN, 0)
224 223
225 #ifdef CONFIG_X86_64 224 #ifdef CONFIG_X86_64
226 /* Runs on IST stack */ 225 /* Runs on IST stack */
227 dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) 226 dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
228 { 227 {
229 exception_enter(regs); 228 exception_enter(regs);
230 if (notify_die(DIE_TRAP, "stack segment", regs, error_code, 229 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
231 X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { 230 X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) {
232 preempt_conditional_sti(regs); 231 preempt_conditional_sti(regs);
233 do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); 232 do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
234 preempt_conditional_cli(regs); 233 preempt_conditional_cli(regs);
235 } 234 }
236 exception_exit(regs); 235 exception_exit(regs);
237 } 236 }
238 237
239 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) 238 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
240 { 239 {
241 static const char str[] = "double fault"; 240 static const char str[] = "double fault";
242 struct task_struct *tsk = current; 241 struct task_struct *tsk = current;
243 242
244 exception_enter(regs); 243 exception_enter(regs);
245 /* Return not checked because double check cannot be ignored */ 244 /* Return not checked because double check cannot be ignored */
246 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); 245 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
247 246
248 tsk->thread.error_code = error_code; 247 tsk->thread.error_code = error_code;
249 tsk->thread.trap_nr = X86_TRAP_DF; 248 tsk->thread.trap_nr = X86_TRAP_DF;
250 249
251 /* 250 /*
252 * This is always a kernel trap and never fixable (and thus must 251 * This is always a kernel trap and never fixable (and thus must
253 * never return). 252 * never return).
254 */ 253 */
255 for (;;) 254 for (;;)
256 die(str, regs, error_code); 255 die(str, regs, error_code);
257 } 256 }
258 #endif 257 #endif
259 258
260 dotraplinkage void __kprobes 259 dotraplinkage void __kprobes
261 do_general_protection(struct pt_regs *regs, long error_code) 260 do_general_protection(struct pt_regs *regs, long error_code)
262 { 261 {
263 struct task_struct *tsk; 262 struct task_struct *tsk;
264 263
265 exception_enter(regs); 264 exception_enter(regs);
266 conditional_sti(regs); 265 conditional_sti(regs);
267 266
268 #ifdef CONFIG_X86_32 267 #ifdef CONFIG_X86_32
269 if (regs->flags & X86_VM_MASK) { 268 if (regs->flags & X86_VM_MASK) {
270 local_irq_enable(); 269 local_irq_enable();
271 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 270 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
272 goto exit; 271 goto exit;
273 } 272 }
274 #endif 273 #endif
275 274
276 tsk = current; 275 tsk = current;
277 if (!user_mode(regs)) { 276 if (!user_mode(regs)) {
278 if (fixup_exception(regs)) 277 if (fixup_exception(regs))
279 goto exit; 278 goto exit;
280 279
281 tsk->thread.error_code = error_code; 280 tsk->thread.error_code = error_code;
282 tsk->thread.trap_nr = X86_TRAP_GP; 281 tsk->thread.trap_nr = X86_TRAP_GP;
283 if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 282 if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
284 X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) 283 X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
285 die("general protection fault", regs, error_code); 284 die("general protection fault", regs, error_code);
286 goto exit; 285 goto exit;
287 } 286 }
288 287
289 tsk->thread.error_code = error_code; 288 tsk->thread.error_code = error_code;
290 tsk->thread.trap_nr = X86_TRAP_GP; 289 tsk->thread.trap_nr = X86_TRAP_GP;
291 290
292 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 291 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
293 printk_ratelimit()) { 292 printk_ratelimit()) {
294 pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", 293 pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
295 tsk->comm, task_pid_nr(tsk), 294 tsk->comm, task_pid_nr(tsk),
296 regs->ip, regs->sp, error_code); 295 regs->ip, regs->sp, error_code);
297 print_vma_addr(" in ", regs->ip); 296 print_vma_addr(" in ", regs->ip);
298 pr_cont("\n"); 297 pr_cont("\n");
299 } 298 }
300 299
301 force_sig(SIGSEGV, tsk); 300 force_sig(SIGSEGV, tsk);
302 exit: 301 exit:
303 exception_exit(regs); 302 exception_exit(regs);
304 } 303 }
305 304
306 /* May run on IST stack. */ 305 /* May run on IST stack. */
307 dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) 306 dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
308 { 307 {
309 #ifdef CONFIG_DYNAMIC_FTRACE 308 #ifdef CONFIG_DYNAMIC_FTRACE
310 /* 309 /*
311 * ftrace must be first, everything else may cause a recursive crash. 310 * ftrace must be first, everything else may cause a recursive crash.
312 * See note by declaration of modifying_ftrace_code in ftrace.c 311 * See note by declaration of modifying_ftrace_code in ftrace.c
313 */ 312 */
314 if (unlikely(atomic_read(&modifying_ftrace_code)) && 313 if (unlikely(atomic_read(&modifying_ftrace_code)) &&
315 ftrace_int3_handler(regs)) 314 ftrace_int3_handler(regs))
316 return; 315 return;
317 #endif 316 #endif
318 exception_enter(regs); 317 exception_enter(regs);
319 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP 318 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
320 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 319 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
321 SIGTRAP) == NOTIFY_STOP) 320 SIGTRAP) == NOTIFY_STOP)
322 goto exit; 321 goto exit;
323 #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ 322 #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
324 323
325 if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 324 if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
326 SIGTRAP) == NOTIFY_STOP) 325 SIGTRAP) == NOTIFY_STOP)
327 goto exit; 326 goto exit;
328 327
329 /* 328 /*
330 * Let others (NMI) know that the debug stack is in use 329 * Let others (NMI) know that the debug stack is in use
331 * as we may switch to the interrupt stack. 330 * as we may switch to the interrupt stack.
332 */ 331 */
333 debug_stack_usage_inc(); 332 debug_stack_usage_inc();
334 preempt_conditional_sti(regs); 333 preempt_conditional_sti(regs);
335 do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); 334 do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
336 preempt_conditional_cli(regs); 335 preempt_conditional_cli(regs);
337 debug_stack_usage_dec(); 336 debug_stack_usage_dec();
338 exit: 337 exit:
339 exception_exit(regs); 338 exception_exit(regs);
340 } 339 }
341 340
342 #ifdef CONFIG_X86_64 341 #ifdef CONFIG_X86_64
343 /* 342 /*
344 * Help handler running on IST stack to switch back to user stack 343 * Help handler running on IST stack to switch back to user stack
345 * for scheduling or signal handling. The actual stack switch is done in 344 * for scheduling or signal handling. The actual stack switch is done in
346 * entry.S 345 * entry.S
347 */ 346 */
348 asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) 347 asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
349 { 348 {
350 struct pt_regs *regs = eregs; 349 struct pt_regs *regs = eregs;
351 /* Did already sync */ 350 /* Did already sync */
352 if (eregs == (struct pt_regs *)eregs->sp) 351 if (eregs == (struct pt_regs *)eregs->sp)
353 ; 352 ;
354 /* Exception from user space */ 353 /* Exception from user space */
355 else if (user_mode(eregs)) 354 else if (user_mode(eregs))
356 regs = task_pt_regs(current); 355 regs = task_pt_regs(current);
357 /* 356 /*
358 * Exception from kernel and interrupts are enabled. Move to 357 * Exception from kernel and interrupts are enabled. Move to
359 * kernel process stack. 358 * kernel process stack.
360 */ 359 */
361 else if (eregs->flags & X86_EFLAGS_IF) 360 else if (eregs->flags & X86_EFLAGS_IF)
362 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); 361 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
363 if (eregs != regs) 362 if (eregs != regs)
364 *regs = *eregs; 363 *regs = *eregs;
365 return regs; 364 return regs;
366 } 365 }
367 #endif 366 #endif
368 367
369 /* 368 /*
370 * Our handling of the processor debug registers is non-trivial. 369 * Our handling of the processor debug registers is non-trivial.
371 * We do not clear them on entry and exit from the kernel. Therefore 370 * We do not clear them on entry and exit from the kernel. Therefore
372 * it is possible to get a watchpoint trap here from inside the kernel. 371 * it is possible to get a watchpoint trap here from inside the kernel.
373 * However, the code in ./ptrace.c has ensured that the user can 372 * However, the code in ./ptrace.c has ensured that the user can
374 * only set watchpoints on userspace addresses. Therefore the in-kernel 373 * only set watchpoints on userspace addresses. Therefore the in-kernel
375 * watchpoint trap can only occur in code which is reading/writing 374 * watchpoint trap can only occur in code which is reading/writing
376 * from user space. Such code must not hold kernel locks (since it 375 * from user space. Such code must not hold kernel locks (since it
377 * can equally take a page fault), therefore it is safe to call 376 * can equally take a page fault), therefore it is safe to call
378 * force_sig_info even though that claims and releases locks. 377 * force_sig_info even though that claims and releases locks.
379 * 378 *
380 * Code in ./signal.c ensures that the debug control register 379 * Code in ./signal.c ensures that the debug control register
381 * is restored before we deliver any signal, and therefore that 380 * is restored before we deliver any signal, and therefore that
382 * user code runs with the correct debug control register even though 381 * user code runs with the correct debug control register even though
383 * we clear it here. 382 * we clear it here.
384 * 383 *
385 * Being careful here means that we don't have to be as careful in a 384 * Being careful here means that we don't have to be as careful in a
386 * lot of more complicated places (task switching can be a bit lazy 385 * lot of more complicated places (task switching can be a bit lazy
387 * about restoring all the debug state, and ptrace doesn't have to 386 * about restoring all the debug state, and ptrace doesn't have to
388 * find every occurrence of the TF bit that could be saved away even 387 * find every occurrence of the TF bit that could be saved away even
389 * by user code) 388 * by user code)
390 * 389 *
391 * May run on IST stack. 390 * May run on IST stack.
392 */ 391 */
393 dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) 392 dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
394 { 393 {
395 struct task_struct *tsk = current; 394 struct task_struct *tsk = current;
396 int user_icebp = 0; 395 int user_icebp = 0;
397 unsigned long dr6; 396 unsigned long dr6;
398 int si_code; 397 int si_code;
399 398
400 exception_enter(regs); 399 exception_enter(regs);
401 400
402 get_debugreg(dr6, 6); 401 get_debugreg(dr6, 6);
403 402
404 /* Filter out all the reserved bits which are preset to 1 */ 403 /* Filter out all the reserved bits which are preset to 1 */
405 dr6 &= ~DR6_RESERVED; 404 dr6 &= ~DR6_RESERVED;
406 405
407 /* 406 /*
408 * If dr6 has no reason to give us about the origin of this trap, 407 * If dr6 has no reason to give us about the origin of this trap,
409 * then it's very likely the result of an icebp/int01 trap. 408 * then it's very likely the result of an icebp/int01 trap.
410 * User wants a sigtrap for that. 409 * User wants a sigtrap for that.
411 */ 410 */
412 if (!dr6 && user_mode(regs)) 411 if (!dr6 && user_mode(regs))
413 user_icebp = 1; 412 user_icebp = 1;
414 413
415 /* Catch kmemcheck conditions first of all! */ 414 /* Catch kmemcheck conditions first of all! */
416 if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) 415 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
417 goto exit; 416 goto exit;
418 417
419 /* DR6 may or may not be cleared by the CPU */ 418 /* DR6 may or may not be cleared by the CPU */
420 set_debugreg(0, 6); 419 set_debugreg(0, 6);
421 420
422 /* 421 /*
423 * The processor cleared BTF, so don't mark that we need it set. 422 * The processor cleared BTF, so don't mark that we need it set.
424 */ 423 */
425 clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); 424 clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
426 425
427 /* Store the virtualized DR6 value */ 426 /* Store the virtualized DR6 value */
428 tsk->thread.debugreg6 = dr6; 427 tsk->thread.debugreg6 = dr6;
429 428
430 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, 429 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
431 SIGTRAP) == NOTIFY_STOP) 430 SIGTRAP) == NOTIFY_STOP)
432 goto exit; 431 goto exit;
433 432
434 /* 433 /*
435 * Let others (NMI) know that the debug stack is in use 434 * Let others (NMI) know that the debug stack is in use
436 * as we may switch to the interrupt stack. 435 * as we may switch to the interrupt stack.
437 */ 436 */
438 debug_stack_usage_inc(); 437 debug_stack_usage_inc();
439 438
440 /* It's safe to allow irq's after DR6 has been saved */ 439 /* It's safe to allow irq's after DR6 has been saved */
441 preempt_conditional_sti(regs); 440 preempt_conditional_sti(regs);
442 441
443 if (regs->flags & X86_VM_MASK) { 442 if (regs->flags & X86_VM_MASK) {
444 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 443 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
445 X86_TRAP_DB); 444 X86_TRAP_DB);
446 preempt_conditional_cli(regs); 445 preempt_conditional_cli(regs);
447 debug_stack_usage_dec(); 446 debug_stack_usage_dec();
448 goto exit; 447 goto exit;
449 } 448 }
450 449
451 /* 450 /*
452 * Single-stepping through system calls: ignore any exceptions in 451 * Single-stepping through system calls: ignore any exceptions in
453 * kernel space, but re-enable TF when returning to user mode. 452 * kernel space, but re-enable TF when returning to user mode.
454 * 453 *
455 * We already checked v86 mode above, so we can check for kernel mode 454 * We already checked v86 mode above, so we can check for kernel mode
456 * by just checking the CPL of CS. 455 * by just checking the CPL of CS.
457 */ 456 */
458 if ((dr6 & DR_STEP) && !user_mode(regs)) { 457 if ((dr6 & DR_STEP) && !user_mode(regs)) {
459 tsk->thread.debugreg6 &= ~DR_STEP; 458 tsk->thread.debugreg6 &= ~DR_STEP;
460 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 459 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
461 regs->flags &= ~X86_EFLAGS_TF; 460 regs->flags &= ~X86_EFLAGS_TF;
462 } 461 }
463 si_code = get_si_code(tsk->thread.debugreg6); 462 si_code = get_si_code(tsk->thread.debugreg6);
464 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) 463 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
465 send_sigtrap(tsk, regs, error_code, si_code); 464 send_sigtrap(tsk, regs, error_code, si_code);
466 preempt_conditional_cli(regs); 465 preempt_conditional_cli(regs);
467 debug_stack_usage_dec(); 466 debug_stack_usage_dec();
468 467
469 exit: 468 exit:
470 exception_exit(regs); 469 exception_exit(regs);
471 } 470 }
472 471
473 /* 472 /*
474 * Note that we play around with the 'TS' bit in an attempt to get 473 * Note that we play around with the 'TS' bit in an attempt to get
475 * the correct behaviour even in the presence of the asynchronous 474 * the correct behaviour even in the presence of the asynchronous
476 * IRQ13 behaviour 475 * IRQ13 behaviour
477 */ 476 */
478 void math_error(struct pt_regs *regs, int error_code, int trapnr) 477 void math_error(struct pt_regs *regs, int error_code, int trapnr)
479 { 478 {
480 struct task_struct *task = current; 479 struct task_struct *task = current;
481 siginfo_t info; 480 siginfo_t info;
482 unsigned short err; 481 unsigned short err;
483 char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : 482 char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
484 "simd exception"; 483 "simd exception";
485 484
486 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) 485 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
487 return; 486 return;
488 conditional_sti(regs); 487 conditional_sti(regs);
489 488
490 if (!user_mode_vm(regs)) 489 if (!user_mode_vm(regs))
491 { 490 {
492 if (!fixup_exception(regs)) { 491 if (!fixup_exception(regs)) {
493 task->thread.error_code = error_code; 492 task->thread.error_code = error_code;
494 task->thread.trap_nr = trapnr; 493 task->thread.trap_nr = trapnr;
495 die(str, regs, error_code); 494 die(str, regs, error_code);
496 } 495 }
497 return; 496 return;
498 } 497 }
499 498
500 /* 499 /*
501 * Save the info for the exception handler and clear the error. 500 * Save the info for the exception handler and clear the error.
502 */ 501 */
503 save_init_fpu(task); 502 save_init_fpu(task);
504 task->thread.trap_nr = trapnr; 503 task->thread.trap_nr = trapnr;
505 task->thread.error_code = error_code; 504 task->thread.error_code = error_code;
506 info.si_signo = SIGFPE; 505 info.si_signo = SIGFPE;
507 info.si_errno = 0; 506 info.si_errno = 0;
508 info.si_addr = (void __user *)regs->ip; 507 info.si_addr = (void __user *)regs->ip;
509 if (trapnr == X86_TRAP_MF) { 508 if (trapnr == X86_TRAP_MF) {
510 unsigned short cwd, swd; 509 unsigned short cwd, swd;
511 /* 510 /*
512 * (~cwd & swd) will mask out exceptions that are not set to unmasked 511 * (~cwd & swd) will mask out exceptions that are not set to unmasked
513 * status. 0x3f is the exception bits in these regs, 0x200 is the 512 * status. 0x3f is the exception bits in these regs, 0x200 is the
514 * C1 reg you need in case of a stack fault, 0x040 is the stack 513 * C1 reg you need in case of a stack fault, 0x040 is the stack
515 * fault bit. We should only be taking one exception at a time, 514 * fault bit. We should only be taking one exception at a time,
516 * so if this combination doesn't produce any single exception, 515 * so if this combination doesn't produce any single exception,
517 * then we have a bad program that isn't synchronizing its FPU usage 516 * then we have a bad program that isn't synchronizing its FPU usage
518 * and it will suffer the consequences since we won't be able to 517 * and it will suffer the consequences since we won't be able to
519 * fully reproduce the context of the exception 518 * fully reproduce the context of the exception
520 */ 519 */
521 cwd = get_fpu_cwd(task); 520 cwd = get_fpu_cwd(task);
522 swd = get_fpu_swd(task); 521 swd = get_fpu_swd(task);
523 522
524 err = swd & ~cwd; 523 err = swd & ~cwd;
525 } else { 524 } else {
526 /* 525 /*
527 * The SIMD FPU exceptions are handled a little differently, as there 526 * The SIMD FPU exceptions are handled a little differently, as there
528 * is only a single status/control register. Thus, to determine which 527 * is only a single status/control register. Thus, to determine which
529 * unmasked exception was caught we must mask the exception mask bits 528 * unmasked exception was caught we must mask the exception mask bits
530 * at 0x1f80, and then use these to mask the exception bits at 0x3f. 529 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
531 */ 530 */
532 unsigned short mxcsr = get_fpu_mxcsr(task); 531 unsigned short mxcsr = get_fpu_mxcsr(task);
533 err = ~(mxcsr >> 7) & mxcsr; 532 err = ~(mxcsr >> 7) & mxcsr;
534 } 533 }
535 534
536 if (err & 0x001) { /* Invalid op */ 535 if (err & 0x001) { /* Invalid op */
537 /* 536 /*
538 * swd & 0x240 == 0x040: Stack Underflow 537 * swd & 0x240 == 0x040: Stack Underflow
539 * swd & 0x240 == 0x240: Stack Overflow 538 * swd & 0x240 == 0x240: Stack Overflow
540 * User must clear the SF bit (0x40) if set 539 * User must clear the SF bit (0x40) if set
541 */ 540 */
542 info.si_code = FPE_FLTINV; 541 info.si_code = FPE_FLTINV;
543 } else if (err & 0x004) { /* Divide by Zero */ 542 } else if (err & 0x004) { /* Divide by Zero */
544 info.si_code = FPE_FLTDIV; 543 info.si_code = FPE_FLTDIV;
545 } else if (err & 0x008) { /* Overflow */ 544 } else if (err & 0x008) { /* Overflow */
546 info.si_code = FPE_FLTOVF; 545 info.si_code = FPE_FLTOVF;
547 } else if (err & 0x012) { /* Denormal, Underflow */ 546 } else if (err & 0x012) { /* Denormal, Underflow */
548 info.si_code = FPE_FLTUND; 547 info.si_code = FPE_FLTUND;
549 } else if (err & 0x020) { /* Precision */ 548 } else if (err & 0x020) { /* Precision */
550 info.si_code = FPE_FLTRES; 549 info.si_code = FPE_FLTRES;
551 } else { 550 } else {
552 /* 551 /*
553 * If we're using IRQ 13, or supposedly even some trap 552 * If we're using IRQ 13, or supposedly even some trap
554 * X86_TRAP_MF implementations, it's possible 553 * X86_TRAP_MF implementations, it's possible
555 * we get a spurious trap, which is not an error. 554 * we get a spurious trap, which is not an error.
556 */ 555 */
557 return; 556 return;
558 } 557 }
559 force_sig_info(SIGFPE, &info, task); 558 force_sig_info(SIGFPE, &info, task);
560 } 559 }
561 560
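The masking arithmetic in math_error() is easy to verify in isolation: for x87, err = swd & ~cwd keeps only the exception flags whose mask bit is clear, and for SSE the mask bits sit seven positions above the flag bits, hence err = ~(mxcsr >> 7) & mxcsr. The standalone program below reproduces both expressions with invented register values (a control word with divide-by-zero unmasked, and an MXCSR with overflow unmasked and the OE flag set) and runs the result through the same si_code ordering as the handler.

    /*
     * Standalone rework of the "which unmasked FP exception fired?" arithmetic
     * used by math_error(). The register values are invented examples; the
     * masking expressions and the si_code ordering are the handler's.
     */
    #include <stdio.h>

    static const char *classify(unsigned short err)
    {
        if (err & 0x001)
            return "FPE_FLTINV";        /* invalid op (incl. x87 stack faults) */
        else if (err & 0x004)
            return "FPE_FLTDIV";        /* divide by zero */
        else if (err & 0x008)
            return "FPE_FLTOVF";        /* overflow */
        else if (err & 0x012)
            return "FPE_FLTUND";        /* denormal, underflow */
        else if (err & 0x020)
            return "FPE_FLTRES";        /* inexact result (precision) */
        return "spurious";
    }

    int main(void)
    {
        /* x87: control word with divide-by-zero unmasked, status word with ZE set */
        unsigned short cwd = 0x037b, swd = 0x0004;
        unsigned short err = swd & ~cwd;            /* same expression as the handler */
        printf("x87  err=%#06x -> %s\n", (unsigned)err, classify(err));

        /* SSE: MXCSR with overflow unmasked (bit 10 clear) and the OE flag (bit 3) set */
        unsigned short mxcsr = 0x1b88;
        err = ~(mxcsr >> 7) & mxcsr;                /* mask bits sit 7 above the flag bits */
        printf("SIMD err=%#06x -> %s\n", (unsigned)err, classify(err));
        return 0;
    }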
562 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) 561 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
563 { 562 {
564 exception_enter(regs); 563 exception_enter(regs);
565 math_error(regs, error_code, X86_TRAP_MF); 564 math_error(regs, error_code, X86_TRAP_MF);
566 exception_exit(regs); 565 exception_exit(regs);
567 } 566 }
568 567
569 dotraplinkage void 568 dotraplinkage void
570 do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 569 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
571 { 570 {
572 exception_enter(regs); 571 exception_enter(regs);
573 math_error(regs, error_code, X86_TRAP_XF); 572 math_error(regs, error_code, X86_TRAP_XF);
574 exception_exit(regs); 573 exception_exit(regs);
575 } 574 }
576 575
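These two one-line wrappers show the pattern this commit centralizes: every trap handler brackets its body with exception_enter()/exception_exit() so context tracking sees the CPU leave and re-enter user mode. Below is a minimal userspace model of that bookkeeping only; struct fake_regs and the flag-flipping user_enter()/user_exit() stubs are placeholders, not the kernel's implementation.

    /*
     * Minimal userspace model of the exception_enter()/exception_exit()
     * pattern. Here user_enter()/user_exit() just flip a flag; in the kernel
     * they drive RCU and CPU-time accounting for adaptive-tickless CPUs.
     */
    #include <stdio.h>
    #include <stdbool.h>

    struct fake_regs { bool user_mode; };       /* stand-in for struct pt_regs */

    static bool in_user_context = true;         /* per-CPU context-tracking state */

    static void user_exit(void)  { in_user_context = false; }
    static void user_enter(void) { in_user_context = true;  }

    static void exception_enter(struct fake_regs *regs)
    {
        (void)regs;
        user_exit();                            /* now running kernel code */
    }

    static void exception_exit(struct fake_regs *regs)
    {
        if (regs->user_mode)                    /* resume the interrupted context */
            user_enter();
    }

    static void do_some_trap(struct fake_regs *regs)
    {
        exception_enter(regs);
        printf("handling trap, in_user_context=%d\n", in_user_context);  /* 0 */
        exception_exit(regs);
    }

    int main(void)
    {
        struct fake_regs from_user = { .user_mode = true };

        do_some_trap(&from_user);
        printf("back from trap, in_user_context=%d\n", in_user_context); /* 1 */
        return 0;
    }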
577 dotraplinkage void 576 dotraplinkage void
578 do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) 577 do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
579 { 578 {
580 conditional_sti(regs); 579 conditional_sti(regs);
581 #if 0 580 #if 0
582 /* No need to warn about this any longer. */ 581 /* No need to warn about this any longer. */
583 pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); 582 pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
584 #endif 583 #endif
585 } 584 }
586 585
587 asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) 586 asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
588 { 587 {
589 } 588 }
590 589
591 asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) 590 asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
592 { 591 {
593 } 592 }
594 593
595 /* 594 /*
596 * 'math_state_restore()' saves the current math information in the 595 * 'math_state_restore()' saves the current math information in the
597 * old math state array, and gets the new ones from the current task 596 * old math state array, and gets the new ones from the current task
598 * 597 *
599 * Careful.. There are problems with IBM-designed IRQ13 behaviour. 598 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
600 * Don't touch unless you *really* know how it works. 599 * Don't touch unless you *really* know how it works.
601 * 600 *
602 * Must be called with kernel preemption disabled (e.g. with local 601 * Must be called with kernel preemption disabled (e.g. with local
603 * interrupts disabled, as in the case of do_device_not_available). 602 * interrupts disabled, as in the case of do_device_not_available).
604 */ 603 */
605 void math_state_restore(void) 604 void math_state_restore(void)
606 { 605 {
607 struct task_struct *tsk = current; 606 struct task_struct *tsk = current;
608 607
609 if (!tsk_used_math(tsk)) { 608 if (!tsk_used_math(tsk)) {
610 local_irq_enable(); 609 local_irq_enable();
611 /* 610 /*
612 * does a slab alloc which can sleep 611 * does a slab alloc which can sleep
613 */ 612 */
614 if (init_fpu(tsk)) { 613 if (init_fpu(tsk)) {
615 /* 614 /*
616 * ran out of memory! 615 * ran out of memory!
617 */ 616 */
618 do_group_exit(SIGKILL); 617 do_group_exit(SIGKILL);
619 return; 618 return;
620 } 619 }
621 local_irq_disable(); 620 local_irq_disable();
622 } 621 }
623 622
624 __thread_fpu_begin(tsk); 623 __thread_fpu_begin(tsk);
625 624
626 /* 625 /*
627 * Paranoid restore. send a SIGSEGV if we fail to restore the state. 626 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
628 */ 627 */
629 if (unlikely(restore_fpu_checking(tsk))) { 628 if (unlikely(restore_fpu_checking(tsk))) {
630 drop_init_fpu(tsk); 629 drop_init_fpu(tsk);
631 force_sig(SIGSEGV, tsk); 630 force_sig(SIGSEGV, tsk);
632 return; 631 return;
633 } 632 }
634 633
635 tsk->fpu_counter++; 634 tsk->fpu_counter++;
636 } 635 }
637 EXPORT_SYMBOL_GPL(math_state_restore); 636 EXPORT_SYMBOL_GPL(math_state_restore);
638 637
639 dotraplinkage void __kprobes 638 dotraplinkage void __kprobes
640 do_device_not_available(struct pt_regs *regs, long error_code) 639 do_device_not_available(struct pt_regs *regs, long error_code)
641 { 640 {
642 exception_enter(regs); 641 exception_enter(regs);
643 BUG_ON(use_eager_fpu()); 642 BUG_ON(use_eager_fpu());
644 643
645 #ifdef CONFIG_MATH_EMULATION 644 #ifdef CONFIG_MATH_EMULATION
646 if (read_cr0() & X86_CR0_EM) { 645 if (read_cr0() & X86_CR0_EM) {
647 struct math_emu_info info = { }; 646 struct math_emu_info info = { };
648 647
649 conditional_sti(regs); 648 conditional_sti(regs);
650 649
651 info.regs = regs; 650 info.regs = regs;
652 math_emulate(&info); 651 math_emulate(&info);
653 exception_exit(regs); 652 exception_exit(regs);
654 return; 653 return;
655 } 654 }
656 #endif 655 #endif
657 math_state_restore(); /* interrupts still off */ 656 math_state_restore(); /* interrupts still off */
658 #ifdef CONFIG_X86_32 657 #ifdef CONFIG_X86_32
659 conditional_sti(regs); 658 conditional_sti(regs);
660 #endif 659 #endif
661 exception_exit(regs); 660 exception_exit(regs);
662 } 661 }
663 662
664 #ifdef CONFIG_X86_32 663 #ifdef CONFIG_X86_32
665 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) 664 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
666 { 665 {
667 siginfo_t info; 666 siginfo_t info;
668 667
669 exception_enter(regs); 668 exception_enter(regs);
670 local_irq_enable(); 669 local_irq_enable();
671 670
672 info.si_signo = SIGILL; 671 info.si_signo = SIGILL;
673 info.si_errno = 0; 672 info.si_errno = 0;
674 info.si_code = ILL_BADSTK; 673 info.si_code = ILL_BADSTK;
675 info.si_addr = NULL; 674 info.si_addr = NULL;
676 if (notify_die(DIE_TRAP, "iret exception", regs, error_code, 675 if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
677 X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { 676 X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
678 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, 677 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
679 &info); 678 &info);
680 } 679 }
681 exception_exit(regs); 680 exception_exit(regs);
682 } 681 }
683 #endif 682 #endif
684 683
685 /* Set of traps needed for early debugging. */ 684 /* Set of traps needed for early debugging. */
686 void __init early_trap_init(void) 685 void __init early_trap_init(void)
687 { 686 {
688 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); 687 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
689 /* int3 can be called from all */ 688 /* int3 can be called from all */
690 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); 689 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
691 #ifdef CONFIG_X86_32 690 #ifdef CONFIG_X86_32
692 set_intr_gate(X86_TRAP_PF, &page_fault); 691 set_intr_gate(X86_TRAP_PF, &page_fault);
693 #endif 692 #endif
694 load_idt(&idt_descr); 693 load_idt(&idt_descr);
695 } 694 }
696 695
697 void __init early_trap_pf_init(void) 696 void __init early_trap_pf_init(void)
698 { 697 {
699 #ifdef CONFIG_X86_64 698 #ifdef CONFIG_X86_64
700 set_intr_gate(X86_TRAP_PF, &page_fault); 699 set_intr_gate(X86_TRAP_PF, &page_fault);
701 #endif 700 #endif
702 } 701 }
703 702
704 void __init trap_init(void) 703 void __init trap_init(void)
705 { 704 {
706 int i; 705 int i;
707 706
708 #ifdef CONFIG_EISA 707 #ifdef CONFIG_EISA
709 void __iomem *p = early_ioremap(0x0FFFD9, 4); 708 void __iomem *p = early_ioremap(0x0FFFD9, 4);
710 709
711 if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) 710 if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
712 EISA_bus = 1; 711 EISA_bus = 1;
713 early_iounmap(p, 4); 712 early_iounmap(p, 4);
714 #endif 713 #endif
715 714
716 set_intr_gate(X86_TRAP_DE, &divide_error); 715 set_intr_gate(X86_TRAP_DE, &divide_error);
717 set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); 716 set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
718 /* int4 can be called from all */ 717 /* int4 can be called from all */
719 set_system_intr_gate(X86_TRAP_OF, &overflow); 718 set_system_intr_gate(X86_TRAP_OF, &overflow);
720 set_intr_gate(X86_TRAP_BR, &bounds); 719 set_intr_gate(X86_TRAP_BR, &bounds);
721 set_intr_gate(X86_TRAP_UD, &invalid_op); 720 set_intr_gate(X86_TRAP_UD, &invalid_op);
722 set_intr_gate(X86_TRAP_NM, &device_not_available); 721 set_intr_gate(X86_TRAP_NM, &device_not_available);
723 #ifdef CONFIG_X86_32 722 #ifdef CONFIG_X86_32
724 set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS); 723 set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
725 #else 724 #else
726 set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); 725 set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
727 #endif 726 #endif
728 set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun); 727 set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);
729 set_intr_gate(X86_TRAP_TS, &invalid_TSS); 728 set_intr_gate(X86_TRAP_TS, &invalid_TSS);
730 set_intr_gate(X86_TRAP_NP, &segment_not_present); 729 set_intr_gate(X86_TRAP_NP, &segment_not_present);
731 set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); 730 set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
732 set_intr_gate(X86_TRAP_GP, &general_protection); 731 set_intr_gate(X86_TRAP_GP, &general_protection);
733 set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug); 732 set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);
734 set_intr_gate(X86_TRAP_MF, &coprocessor_error); 733 set_intr_gate(X86_TRAP_MF, &coprocessor_error);
735 set_intr_gate(X86_TRAP_AC, &alignment_check); 734 set_intr_gate(X86_TRAP_AC, &alignment_check);
736 #ifdef CONFIG_X86_MCE 735 #ifdef CONFIG_X86_MCE
737 set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK); 736 set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
738 #endif 737 #endif
739 set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error); 738 set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error);
740 739
741 /* Reserve all the builtin and the syscall vector: */ 740 /* Reserve all the builtin and the syscall vector: */
742 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) 741 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
743 set_bit(i, used_vectors); 742 set_bit(i, used_vectors);
744 743
745 #ifdef CONFIG_IA32_EMULATION 744 #ifdef CONFIG_IA32_EMULATION
746 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 745 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
747 set_bit(IA32_SYSCALL_VECTOR, used_vectors); 746 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
748 #endif 747 #endif
749 748
750 #ifdef CONFIG_X86_32 749 #ifdef CONFIG_X86_32
751 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 750 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
752 set_bit(SYSCALL_VECTOR, used_vectors); 751 set_bit(SYSCALL_VECTOR, used_vectors);
753 #endif 752 #endif
754 753
755 /* 754 /*
756 * Should be a barrier for any external CPU state: 755 * Should be a barrier for any external CPU state:
757 */ 756 */
758 cpu_init(); 757 cpu_init();
759 758
760 x86_init.irqs.trap_init(); 759 x86_init.irqs.trap_init();
761 760
762 #ifdef CONFIG_X86_64 761 #ifdef CONFIG_X86_64
763 memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); 762 memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
764 set_nmi_gate(X86_TRAP_DB, &debug); 763 set_nmi_gate(X86_TRAP_DB, &debug);
765 set_nmi_gate(X86_TRAP_BP, &int3); 764 set_nmi_gate(X86_TRAP_BP, &int3);
766 #endif 765 #endif
767 } 766 }
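trap_init() above boils down to two pieces of bookkeeping: point each CPU-trap vector at its handler and mark the vector as reserved in used_vectors. The toy model below keeps both in plain C (a function-pointer table plus a hand-rolled bitmap); the sizes, names and handler signature are illustrative, and set_intr_gate()/set_bit() are the real kernel helpers this merely stands in for.

    /*
     * Toy model of the vector bookkeeping in trap_init(): a dispatch table
     * indexed by vector number plus a "used_vectors" bitmap. The sizes and
     * the handler signature are illustrative, not the kernel's.
     */
    #include <stdio.h>

    #define NR_VECTORS            256
    #define FIRST_EXTERNAL_VECTOR 0x20     /* vectors below this are CPU traps */
    #define BITS_PER_LONG         (8 * sizeof(unsigned long))

    typedef void (*trap_handler_t)(long error_code);

    static trap_handler_t idt_model[NR_VECTORS];
    static unsigned long  used_vectors_model[NR_VECTORS / BITS_PER_LONG];

    static void set_bit_model(int nr, unsigned long *bitmap)
    {
        bitmap[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
    }

    static void set_gate_model(int vector, trap_handler_t handler)
    {
        idt_model[vector] = handler;       /* what set_intr_gate() conceptually does */
    }

    static void divide_error_model(long error_code)
    {
        printf("divide error, error_code=%ld\n", error_code);
    }

    int main(void)
    {
        int i;

        set_gate_model(0 /* X86_TRAP_DE */, divide_error_model);

        /* Reserve all the builtin vectors, as trap_init() does */
        for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
            set_bit_model(i, used_vectors_model);

        if (idt_model[0])
            idt_model[0](0);
        return 0;
    }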
arch/x86/mm/fault.c
1 /* 1 /*
2 * Copyright (C) 1995 Linus Torvalds 2 * Copyright (C) 1995 Linus Torvalds
3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. 3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar 4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
5 */ 5 */
6 #include <linux/magic.h> /* STACK_END_MAGIC */ 6 #include <linux/magic.h> /* STACK_END_MAGIC */
7 #include <linux/sched.h> /* test_thread_flag(), ... */ 7 #include <linux/sched.h> /* test_thread_flag(), ... */
8 #include <linux/kdebug.h> /* oops_begin/end, ... */ 8 #include <linux/kdebug.h> /* oops_begin/end, ... */
9 #include <linux/module.h> /* search_exception_table */ 9 #include <linux/module.h> /* search_exception_table */
10 #include <linux/bootmem.h> /* max_low_pfn */ 10 #include <linux/bootmem.h> /* max_low_pfn */
11 #include <linux/kprobes.h> /* __kprobes, ... */ 11 #include <linux/kprobes.h> /* __kprobes, ... */
12 #include <linux/mmiotrace.h> /* kmmio_handler, ... */ 12 #include <linux/mmiotrace.h> /* kmmio_handler, ... */
13 #include <linux/perf_event.h> /* perf_sw_event */ 13 #include <linux/perf_event.h> /* perf_sw_event */
14 #include <linux/hugetlb.h> /* hstate_index_to_shift */ 14 #include <linux/hugetlb.h> /* hstate_index_to_shift */
15 #include <linux/prefetch.h> /* prefetchw */ 15 #include <linux/prefetch.h> /* prefetchw */
16 #include <linux/context_tracking.h> /* exception_enter(), ... */
16 17
17 #include <asm/traps.h> /* dotraplinkage, ... */ 18 #include <asm/traps.h> /* dotraplinkage, ... */
18 #include <asm/pgalloc.h> /* pgd_*(), ... */ 19 #include <asm/pgalloc.h> /* pgd_*(), ... */
19 #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ 20 #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
20 #include <asm/fixmap.h> /* VSYSCALL_START */ 21 #include <asm/fixmap.h> /* VSYSCALL_START */
21 #include <asm/context_tracking.h> /* exception_enter(), ... */
22 22
23 /* 23 /*
24 * Page fault error code bits: 24 * Page fault error code bits:
25 * 25 *
26 * bit 0 == 0: no page found 1: protection fault 26 * bit 0 == 0: no page found 1: protection fault
27 * bit 1 == 0: read access 1: write access 27 * bit 1 == 0: read access 1: write access
28 * bit 2 == 0: kernel-mode access 1: user-mode access 28 * bit 2 == 0: kernel-mode access 1: user-mode access
29 * bit 3 == 1: use of reserved bit detected 29 * bit 3 == 1: use of reserved bit detected
30 * bit 4 == 1: fault was an instruction fetch 30 * bit 4 == 1: fault was an instruction fetch
31 */ 31 */
32 enum x86_pf_error_code { 32 enum x86_pf_error_code {
33 33
34 PF_PROT = 1 << 0, 34 PF_PROT = 1 << 0,
35 PF_WRITE = 1 << 1, 35 PF_WRITE = 1 << 1,
36 PF_USER = 1 << 2, 36 PF_USER = 1 << 2,
37 PF_RSVD = 1 << 3, 37 PF_RSVD = 1 << 3,
38 PF_INSTR = 1 << 4, 38 PF_INSTR = 1 << 4,
39 }; 39 };
40 40
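A small decoder makes the bit layout above concrete: error code 0x6, for instance, reads as a user-mode write to a not-present page, and 0x15 as a user-mode instruction fetch that hit a present (NX-protected) page. The program below reuses the same PF_* values; the example codes fed to it are illustrative.

    /*
     * Decoder for the page-fault error code bits documented above. The PF_*
     * values match the enum; the example codes are illustrative.
     */
    #include <stdio.h>

    #define PF_PROT  (1 << 0)   /* 0: page not present, 1: protection fault */
    #define PF_WRITE (1 << 1)   /* 0: read access,      1: write access */
    #define PF_USER  (1 << 2)   /* 0: kernel mode,      1: user mode */
    #define PF_RSVD  (1 << 3)   /* reserved bit set in a paging entry */
    #define PF_INSTR (1 << 4)   /* fault was an instruction fetch */

    static void decode_pf_error(unsigned long error_code)
    {
        printf("%#04lx: %s %s access to a %s page%s%s\n", error_code,
               (error_code & PF_USER)  ? "user-mode"   : "kernel-mode",
               (error_code & PF_WRITE) ? "write"       : "read",
               (error_code & PF_PROT)  ? "present"     : "not-present",
               (error_code & PF_INSTR) ? ", instruction fetch" : "",
               (error_code & PF_RSVD)  ? ", reserved bit set"  : "");
    }

    int main(void)
    {
        decode_pf_error(0x6);    /* user-mode write to a not-present page */
        decode_pf_error(0x15);   /* user-mode NX instruction fetch, e.g. vsyscall emulation */
        return 0;
    }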
41 /* 41 /*
42 * Returns 0 if mmiotrace is disabled, or if the fault is not 42 * Returns 0 if mmiotrace is disabled, or if the fault is not
43 * handled by mmiotrace: 43 * handled by mmiotrace:
44 */ 44 */
45 static inline int __kprobes 45 static inline int __kprobes
46 kmmio_fault(struct pt_regs *regs, unsigned long addr) 46 kmmio_fault(struct pt_regs *regs, unsigned long addr)
47 { 47 {
48 if (unlikely(is_kmmio_active())) 48 if (unlikely(is_kmmio_active()))
49 if (kmmio_handler(regs, addr) == 1) 49 if (kmmio_handler(regs, addr) == 1)
50 return -1; 50 return -1;
51 return 0; 51 return 0;
52 } 52 }
53 53
54 static inline int __kprobes notify_page_fault(struct pt_regs *regs) 54 static inline int __kprobes notify_page_fault(struct pt_regs *regs)
55 { 55 {
56 int ret = 0; 56 int ret = 0;
57 57
58 /* kprobe_running() needs smp_processor_id() */ 58 /* kprobe_running() needs smp_processor_id() */
59 if (kprobes_built_in() && !user_mode_vm(regs)) { 59 if (kprobes_built_in() && !user_mode_vm(regs)) {
60 preempt_disable(); 60 preempt_disable();
61 if (kprobe_running() && kprobe_fault_handler(regs, 14)) 61 if (kprobe_running() && kprobe_fault_handler(regs, 14))
62 ret = 1; 62 ret = 1;
63 preempt_enable(); 63 preempt_enable();
64 } 64 }
65 65
66 return ret; 66 return ret;
67 } 67 }
68 68
69 /* 69 /*
70 * Prefetch quirks: 70 * Prefetch quirks:
71 * 71 *
72 * 32-bit mode: 72 * 32-bit mode:
73 * 73 *
74 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. 74 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
75 * Check that here and ignore it. 75 * Check that here and ignore it.
76 * 76 *
77 * 64-bit mode: 77 * 64-bit mode:
78 * 78 *
79 * Sometimes the CPU reports invalid exceptions on prefetch. 79 * Sometimes the CPU reports invalid exceptions on prefetch.
80 * Check that here and ignore it. 80 * Check that here and ignore it.
81 * 81 *
82 * Opcode checker based on code by Richard Brunner. 82 * Opcode checker based on code by Richard Brunner.
83 */ 83 */
84 static inline int 84 static inline int
85 check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, 85 check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
86 unsigned char opcode, int *prefetch) 86 unsigned char opcode, int *prefetch)
87 { 87 {
88 unsigned char instr_hi = opcode & 0xf0; 88 unsigned char instr_hi = opcode & 0xf0;
89 unsigned char instr_lo = opcode & 0x0f; 89 unsigned char instr_lo = opcode & 0x0f;
90 90
91 switch (instr_hi) { 91 switch (instr_hi) {
92 case 0x20: 92 case 0x20:
93 case 0x30: 93 case 0x30:
94 /* 94 /*
95 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. 95 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
96 * In X86_64 long mode, the CPU will signal invalid 96 * In X86_64 long mode, the CPU will signal invalid
97 * opcode if some of these prefixes are present so 97 * opcode if some of these prefixes are present so
98 * X86_64 will never get here anyway 98 * X86_64 will never get here anyway
99 */ 99 */
100 return ((instr_lo & 7) == 0x6); 100 return ((instr_lo & 7) == 0x6);
101 #ifdef CONFIG_X86_64 101 #ifdef CONFIG_X86_64
102 case 0x40: 102 case 0x40:
103 /* 103 /*
104 * In AMD64 long mode 0x40..0x4F are valid REX prefixes 104 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
105 * Need to figure out under what instruction mode the 105 * Need to figure out under what instruction mode the
106 * instruction was issued. Could check the LDT for lm, 106 * instruction was issued. Could check the LDT for lm,
107 * but for now it's good enough to assume that long 107 * but for now it's good enough to assume that long
108 * mode only uses well known segments or kernel. 108 * mode only uses well known segments or kernel.
109 */ 109 */
110 return (!user_mode(regs) || user_64bit_mode(regs)); 110 return (!user_mode(regs) || user_64bit_mode(regs));
111 #endif 111 #endif
112 case 0x60: 112 case 0x60:
113 /* 0x64 thru 0x67 are valid prefixes in all modes. */ 113 /* 0x64 thru 0x67 are valid prefixes in all modes. */
114 return (instr_lo & 0xC) == 0x4; 114 return (instr_lo & 0xC) == 0x4;
115 case 0xF0: 115 case 0xF0:
116 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ 116 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
117 return !instr_lo || (instr_lo>>1) == 1; 117 return !instr_lo || (instr_lo>>1) == 1;
118 case 0x00: 118 case 0x00:
119 /* Prefetch instruction is 0x0F0D or 0x0F18 */ 119 /* Prefetch instruction is 0x0F0D or 0x0F18 */
120 if (probe_kernel_address(instr, opcode)) 120 if (probe_kernel_address(instr, opcode))
121 return 0; 121 return 0;
122 122
123 *prefetch = (instr_lo == 0xF) && 123 *prefetch = (instr_lo == 0xF) &&
124 (opcode == 0x0D || opcode == 0x18); 124 (opcode == 0x0D || opcode == 0x18);
125 return 0; 125 return 0;
126 default: 126 default:
127 return 0; 127 return 0;
128 } 128 }
129 } 129 }
130 130
131 static int 131 static int
132 is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) 132 is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
133 { 133 {
134 unsigned char *max_instr; 134 unsigned char *max_instr;
135 unsigned char *instr; 135 unsigned char *instr;
136 int prefetch = 0; 136 int prefetch = 0;
137 137
138 /* 138 /*
139 * If it was an exec (instruction fetch) fault on NX page, then 139 * If it was an exec (instruction fetch) fault on NX page, then
140 * do not ignore the fault: 140 * do not ignore the fault:
141 */ 141 */
142 if (error_code & PF_INSTR) 142 if (error_code & PF_INSTR)
143 return 0; 143 return 0;
144 144
145 instr = (void *)convert_ip_to_linear(current, regs); 145 instr = (void *)convert_ip_to_linear(current, regs);
146 max_instr = instr + 15; 146 max_instr = instr + 15;
147 147
148 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) 148 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
149 return 0; 149 return 0;
150 150
151 while (instr < max_instr) { 151 while (instr < max_instr) {
152 unsigned char opcode; 152 unsigned char opcode;
153 153
154 if (probe_kernel_address(instr, opcode)) 154 if (probe_kernel_address(instr, opcode))
155 break; 155 break;
156 156
157 instr++; 157 instr++;
158 158
159 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) 159 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
160 break; 160 break;
161 } 161 }
162 return prefetch; 162 return prefetch;
163 } 163 }
164 164
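is_prefetch() walks at most 15 opcode bytes, treating known prefix bytes as "keep scanning" and stopping as soon as it can classify the instruction; the AMD prefetch encodings it looks for are 0x0F 0x0D and 0x0F 0x18. The simplified userspace rendering below runs the same scan over a byte buffer, but omits the 64-bit REX-prefix case and the fault-safe probe_kernel_address() reads.

    /*
     * Simplified userspace version of the is_prefetch() opcode scan: prefix
     * bytes keep the scan going, 0x0F 0x0D / 0x0F 0x18 classify as a prefetch.
     * The 64-bit REX-prefix case and the fault-safe memory reads are omitted.
     */
    #include <stdio.h>
    #include <stddef.h>

    static int check_prefetch_opcode(const unsigned char *next, unsigned char opcode,
                                     int *prefetch)
    {
        unsigned char instr_hi = opcode & 0xf0;
        unsigned char instr_lo = opcode & 0x0f;

        switch (instr_hi) {
        case 0x20:
        case 0x30:
            /* 0x26, 0x2E, 0x36, 0x3E: segment-override prefixes, keep scanning */
            return (instr_lo & 7) == 0x6;
        case 0x60:
            /* 0x64..0x67: segment/size prefixes */
            return (instr_lo & 0xC) == 0x4;
        case 0xF0:
            /* 0xF0, 0xF2, 0xF3: lock/rep prefixes */
            return !instr_lo || (instr_lo >> 1) == 1;
        case 0x00:
            /* 0x0F 0x0D (prefetchw) or 0x0F 0x18 (prefetch hints) */
            *prefetch = (instr_lo == 0xF) && (*next == 0x0D || *next == 0x18);
            return 0;
        default:
            return 0;
        }
    }

    static int is_prefetch(const unsigned char *instr, size_t len)
    {
        const unsigned char *max_instr = instr + (len < 15 ? len : 15);
        int prefetch = 0;

        while (instr < max_instr) {
            unsigned char opcode = *instr++;

            if (!check_prefetch_opcode(instr, opcode, &prefetch))
                break;
        }
        return prefetch;
    }

    int main(void)
    {
        /* ds-prefixed "prefetchnta (%eax)": 3E 0F 18 00 */
        unsigned char insn[] = { 0x3E, 0x0F, 0x18, 0x00 };

        printf("prefetch? %d\n", is_prefetch(insn, sizeof(insn)));   /* 1 */
        return 0;
    }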
165 static void 165 static void
166 force_sig_info_fault(int si_signo, int si_code, unsigned long address, 166 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
167 struct task_struct *tsk, int fault) 167 struct task_struct *tsk, int fault)
168 { 168 {
169 unsigned lsb = 0; 169 unsigned lsb = 0;
170 siginfo_t info; 170 siginfo_t info;
171 171
172 info.si_signo = si_signo; 172 info.si_signo = si_signo;
173 info.si_errno = 0; 173 info.si_errno = 0;
174 info.si_code = si_code; 174 info.si_code = si_code;
175 info.si_addr = (void __user *)address; 175 info.si_addr = (void __user *)address;
176 if (fault & VM_FAULT_HWPOISON_LARGE) 176 if (fault & VM_FAULT_HWPOISON_LARGE)
177 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); 177 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
178 if (fault & VM_FAULT_HWPOISON) 178 if (fault & VM_FAULT_HWPOISON)
179 lsb = PAGE_SHIFT; 179 lsb = PAGE_SHIFT;
180 info.si_addr_lsb = lsb; 180 info.si_addr_lsb = lsb;
181 181
182 force_sig_info(si_signo, &info, tsk); 182 force_sig_info(si_signo, &info, tsk);
183 } 183 }
184 184
185 DEFINE_SPINLOCK(pgd_lock); 185 DEFINE_SPINLOCK(pgd_lock);
186 LIST_HEAD(pgd_list); 186 LIST_HEAD(pgd_list);
187 187
188 #ifdef CONFIG_X86_32 188 #ifdef CONFIG_X86_32
189 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) 189 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
190 { 190 {
191 unsigned index = pgd_index(address); 191 unsigned index = pgd_index(address);
192 pgd_t *pgd_k; 192 pgd_t *pgd_k;
193 pud_t *pud, *pud_k; 193 pud_t *pud, *pud_k;
194 pmd_t *pmd, *pmd_k; 194 pmd_t *pmd, *pmd_k;
195 195
196 pgd += index; 196 pgd += index;
197 pgd_k = init_mm.pgd + index; 197 pgd_k = init_mm.pgd + index;
198 198
199 if (!pgd_present(*pgd_k)) 199 if (!pgd_present(*pgd_k))
200 return NULL; 200 return NULL;
201 201
202 /* 202 /*
203 * set_pgd(pgd, *pgd_k); here would be useless on PAE 203 * set_pgd(pgd, *pgd_k); here would be useless on PAE
204 * and redundant with the set_pmd() on non-PAE. As would 204 * and redundant with the set_pmd() on non-PAE. As would
205 * set_pud. 205 * set_pud.
206 */ 206 */
207 pud = pud_offset(pgd, address); 207 pud = pud_offset(pgd, address);
208 pud_k = pud_offset(pgd_k, address); 208 pud_k = pud_offset(pgd_k, address);
209 if (!pud_present(*pud_k)) 209 if (!pud_present(*pud_k))
210 return NULL; 210 return NULL;
211 211
212 pmd = pmd_offset(pud, address); 212 pmd = pmd_offset(pud, address);
213 pmd_k = pmd_offset(pud_k, address); 213 pmd_k = pmd_offset(pud_k, address);
214 if (!pmd_present(*pmd_k)) 214 if (!pmd_present(*pmd_k))
215 return NULL; 215 return NULL;
216 216
217 if (!pmd_present(*pmd)) 217 if (!pmd_present(*pmd))
218 set_pmd(pmd, *pmd_k); 218 set_pmd(pmd, *pmd_k);
219 else 219 else
220 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); 220 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
221 221
222 return pmd_k; 222 return pmd_k;
223 } 223 }
224 224
225 void vmalloc_sync_all(void) 225 void vmalloc_sync_all(void)
226 { 226 {
227 unsigned long address; 227 unsigned long address;
228 228
229 if (SHARED_KERNEL_PMD) 229 if (SHARED_KERNEL_PMD)
230 return; 230 return;
231 231
232 for (address = VMALLOC_START & PMD_MASK; 232 for (address = VMALLOC_START & PMD_MASK;
233 address >= TASK_SIZE && address < FIXADDR_TOP; 233 address >= TASK_SIZE && address < FIXADDR_TOP;
234 address += PMD_SIZE) { 234 address += PMD_SIZE) {
235 struct page *page; 235 struct page *page;
236 236
237 spin_lock(&pgd_lock); 237 spin_lock(&pgd_lock);
238 list_for_each_entry(page, &pgd_list, lru) { 238 list_for_each_entry(page, &pgd_list, lru) {
239 spinlock_t *pgt_lock; 239 spinlock_t *pgt_lock;
240 pmd_t *ret; 240 pmd_t *ret;
241 241
242 /* the pgt_lock only for Xen */ 242 /* the pgt_lock only for Xen */
243 pgt_lock = &pgd_page_get_mm(page)->page_table_lock; 243 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
244 244
245 spin_lock(pgt_lock); 245 spin_lock(pgt_lock);
246 ret = vmalloc_sync_one(page_address(page), address); 246 ret = vmalloc_sync_one(page_address(page), address);
247 spin_unlock(pgt_lock); 247 spin_unlock(pgt_lock);
248 248
249 if (!ret) 249 if (!ret)
250 break; 250 break;
251 } 251 }
252 spin_unlock(&pgd_lock); 252 spin_unlock(&pgd_lock);
253 } 253 }
254 } 254 }
255 255
256 /* 256 /*
257 * 32-bit: 257 * 32-bit:
258 * 258 *
259 * Handle a fault on the vmalloc or module mapping area 259 * Handle a fault on the vmalloc or module mapping area
260 */ 260 */
261 static noinline __kprobes int vmalloc_fault(unsigned long address) 261 static noinline __kprobes int vmalloc_fault(unsigned long address)
262 { 262 {
263 unsigned long pgd_paddr; 263 unsigned long pgd_paddr;
264 pmd_t *pmd_k; 264 pmd_t *pmd_k;
265 pte_t *pte_k; 265 pte_t *pte_k;
266 266
267 /* Make sure we are in vmalloc area: */ 267 /* Make sure we are in vmalloc area: */
268 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 268 if (!(address >= VMALLOC_START && address < VMALLOC_END))
269 return -1; 269 return -1;
270 270
271 WARN_ON_ONCE(in_nmi()); 271 WARN_ON_ONCE(in_nmi());
272 272
273 /* 273 /*
274 * Synchronize this task's top level page-table 274 * Synchronize this task's top level page-table
275 * with the 'reference' page table. 275 * with the 'reference' page table.
276 * 276 *
277 * Do _not_ use "current" here. We might be inside 277 * Do _not_ use "current" here. We might be inside
278 * an interrupt in the middle of a task switch.. 278 * an interrupt in the middle of a task switch..
279 */ 279 */
280 pgd_paddr = read_cr3(); 280 pgd_paddr = read_cr3();
281 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); 281 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
282 if (!pmd_k) 282 if (!pmd_k)
283 return -1; 283 return -1;
284 284
285 pte_k = pte_offset_kernel(pmd_k, address); 285 pte_k = pte_offset_kernel(pmd_k, address);
286 if (!pte_present(*pte_k)) 286 if (!pte_present(*pte_k))
287 return -1; 287 return -1;
288 288
289 return 0; 289 return 0;
290 } 290 }
291 291
292 /* 292 /*
293 * Did it hit the DOS screen memory VA from vm86 mode? 293 * Did it hit the DOS screen memory VA from vm86 mode?
294 */ 294 */
295 static inline void 295 static inline void
296 check_v8086_mode(struct pt_regs *regs, unsigned long address, 296 check_v8086_mode(struct pt_regs *regs, unsigned long address,
297 struct task_struct *tsk) 297 struct task_struct *tsk)
298 { 298 {
299 unsigned long bit; 299 unsigned long bit;
300 300
301 if (!v8086_mode(regs)) 301 if (!v8086_mode(regs))
302 return; 302 return;
303 303
304 bit = (address - 0xA0000) >> PAGE_SHIFT; 304 bit = (address - 0xA0000) >> PAGE_SHIFT;
305 if (bit < 32) 305 if (bit < 32)
306 tsk->thread.screen_bitmap |= 1 << bit; 306 tsk->thread.screen_bitmap |= 1 << bit;
307 } 307 }
308 308
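check_v8086_mode() records which 4 KiB page of the legacy VGA window at 0xA0000 a vm86 task faulted on, one bit per page and only for the first 32 pages. A short worked computation (the fault address is invented): a fault at 0xA3000 lands on bit 3.

    /*
     * Worked example of the screen_bitmap computation in check_v8086_mode():
     * one bit per 4 KiB page above 0xA0000, only the first 32 pages tracked.
     */
    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        unsigned long address = 0xA3000;                     /* faulting VGA address */
        unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
        unsigned long screen_bitmap = 0;

        if (bit < 32)
            screen_bitmap |= 1UL << bit;

        printf("bit=%lu bitmap=%#lx\n", bit, screen_bitmap); /* bit=3 bitmap=0x8 */
        return 0;
    }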
309 static bool low_pfn(unsigned long pfn) 309 static bool low_pfn(unsigned long pfn)
310 { 310 {
311 return pfn < max_low_pfn; 311 return pfn < max_low_pfn;
312 } 312 }
313 313
314 static void dump_pagetable(unsigned long address) 314 static void dump_pagetable(unsigned long address)
315 { 315 {
316 pgd_t *base = __va(read_cr3()); 316 pgd_t *base = __va(read_cr3());
317 pgd_t *pgd = &base[pgd_index(address)]; 317 pgd_t *pgd = &base[pgd_index(address)];
318 pmd_t *pmd; 318 pmd_t *pmd;
319 pte_t *pte; 319 pte_t *pte;
320 320
321 #ifdef CONFIG_X86_PAE 321 #ifdef CONFIG_X86_PAE
322 printk("*pdpt = %016Lx ", pgd_val(*pgd)); 322 printk("*pdpt = %016Lx ", pgd_val(*pgd));
323 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) 323 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
324 goto out; 324 goto out;
325 #endif 325 #endif
326 pmd = pmd_offset(pud_offset(pgd, address), address); 326 pmd = pmd_offset(pud_offset(pgd, address), address);
327 printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); 327 printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
328 328
329 /* 329 /*
330 * We must not directly access the pte in the highpte 330 * We must not directly access the pte in the highpte
331 * case if the page table is located in highmem. 331 * case if the page table is located in highmem.
332 * And let's rather not kmap-atomic the pte, just in case 332 * And let's rather not kmap-atomic the pte, just in case
333 * it's allocated already: 333 * it's allocated already:
334 */ 334 */
335 if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) 335 if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
336 goto out; 336 goto out;
337 337
338 pte = pte_offset_kernel(pmd, address); 338 pte = pte_offset_kernel(pmd, address);
339 printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); 339 printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
340 out: 340 out:
341 printk("\n"); 341 printk("\n");
342 } 342 }
343 343
344 #else /* CONFIG_X86_64: */ 344 #else /* CONFIG_X86_64: */
345 345
346 void vmalloc_sync_all(void) 346 void vmalloc_sync_all(void)
347 { 347 {
348 sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); 348 sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
349 } 349 }
350 350
351 /* 351 /*
352 * 64-bit: 352 * 64-bit:
353 * 353 *
354 * Handle a fault on the vmalloc area 354 * Handle a fault on the vmalloc area
355 * 355 *
356 * This assumes no large pages in there. 356 * This assumes no large pages in there.
357 */ 357 */
358 static noinline __kprobes int vmalloc_fault(unsigned long address) 358 static noinline __kprobes int vmalloc_fault(unsigned long address)
359 { 359 {
360 pgd_t *pgd, *pgd_ref; 360 pgd_t *pgd, *pgd_ref;
361 pud_t *pud, *pud_ref; 361 pud_t *pud, *pud_ref;
362 pmd_t *pmd, *pmd_ref; 362 pmd_t *pmd, *pmd_ref;
363 pte_t *pte, *pte_ref; 363 pte_t *pte, *pte_ref;
364 364
365 /* Make sure we are in vmalloc area: */ 365 /* Make sure we are in vmalloc area: */
366 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 366 if (!(address >= VMALLOC_START && address < VMALLOC_END))
367 return -1; 367 return -1;
368 368
369 WARN_ON_ONCE(in_nmi()); 369 WARN_ON_ONCE(in_nmi());
370 370
371 /* 371 /*
372 * Copy kernel mappings over when needed. This can also 372 * Copy kernel mappings over when needed. This can also
373 * happen within a race in page table update. In the latter 373 * happen within a race in page table update. In the latter
374 * case just flush: 374 * case just flush:
375 */ 375 */
376 pgd = pgd_offset(current->active_mm, address); 376 pgd = pgd_offset(current->active_mm, address);
377 pgd_ref = pgd_offset_k(address); 377 pgd_ref = pgd_offset_k(address);
378 if (pgd_none(*pgd_ref)) 378 if (pgd_none(*pgd_ref))
379 return -1; 379 return -1;
380 380
381 if (pgd_none(*pgd)) 381 if (pgd_none(*pgd))
382 set_pgd(pgd, *pgd_ref); 382 set_pgd(pgd, *pgd_ref);
383 else 383 else
384 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); 384 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
385 385
386 /* 386 /*
387 * Below here mismatches are bugs because these lower tables 387 * Below here mismatches are bugs because these lower tables
388 * are shared: 388 * are shared:
389 */ 389 */
390 390
391 pud = pud_offset(pgd, address); 391 pud = pud_offset(pgd, address);
392 pud_ref = pud_offset(pgd_ref, address); 392 pud_ref = pud_offset(pgd_ref, address);
393 if (pud_none(*pud_ref)) 393 if (pud_none(*pud_ref))
394 return -1; 394 return -1;
395 395
396 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) 396 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
397 BUG(); 397 BUG();
398 398
399 pmd = pmd_offset(pud, address); 399 pmd = pmd_offset(pud, address);
400 pmd_ref = pmd_offset(pud_ref, address); 400 pmd_ref = pmd_offset(pud_ref, address);
401 if (pmd_none(*pmd_ref)) 401 if (pmd_none(*pmd_ref))
402 return -1; 402 return -1;
403 403
404 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) 404 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
405 BUG(); 405 BUG();
406 406
407 pte_ref = pte_offset_kernel(pmd_ref, address); 407 pte_ref = pte_offset_kernel(pmd_ref, address);
408 if (!pte_present(*pte_ref)) 408 if (!pte_present(*pte_ref))
409 return -1; 409 return -1;
410 410
411 pte = pte_offset_kernel(pmd, address); 411 pte = pte_offset_kernel(pmd, address);
412 412
413 /* 413 /*
414 * Don't use pte_page here, because the mappings can point 414 * Don't use pte_page here, because the mappings can point
415 * outside mem_map, and the NUMA hash lookup cannot handle 415 * outside mem_map, and the NUMA hash lookup cannot handle
416 * that: 416 * that:
417 */ 417 */
418 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) 418 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
419 BUG(); 419 BUG();
420 420
421 return 0; 421 return 0;
422 } 422 }
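The 64-bit path only ever copies a missing top-level entry from the reference kernel page table and then insists that the lower levels already agree. The toy model below captures just that copy-on-fault step, with two flat arrays standing in for init_mm.pgd and the task's PGD; the entry values and sizes are invented.

    /*
     * Toy model of the lazy top-level sync done by the 64-bit vmalloc_fault():
     * copy a missing PGD entry from the reference (kernel) table into the
     * current task's table; anything below that level must already agree.
     */
    #include <stdio.h>

    #define PTRS_PER_PGD 512

    static unsigned long reference_pgd[PTRS_PER_PGD];  /* stands in for init_mm.pgd */
    static unsigned long task_pgd[PTRS_PER_PGD];       /* stands in for the task's PGD */

    static int model_vmalloc_fault(unsigned int index)
    {
        if (!reference_pgd[index])
            return -1;                                 /* genuinely bad access */
        if (!task_pgd[index])
            task_pgd[index] = reference_pgd[index];    /* copy kernel mapping over */
        return 0;
    }

    int main(void)
    {
        int ret;

        reference_pgd[300] = 0x1234;                   /* pretend a vmalloc area lives here */
        ret = model_vmalloc_fault(300);
        printf("fault handled: %d, task entry now %#lx\n", ret, task_pgd[300]);
        return 0;
    }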
423 423
424 #ifdef CONFIG_CPU_SUP_AMD 424 #ifdef CONFIG_CPU_SUP_AMD
425 static const char errata93_warning[] = 425 static const char errata93_warning[] =
426 KERN_ERR 426 KERN_ERR
427 "******* Your BIOS seems to not contain a fix for K8 errata #93\n" 427 "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
428 "******* Working around it, but it may cause SEGVs or burn power.\n" 428 "******* Working around it, but it may cause SEGVs or burn power.\n"
429 "******* Please consider a BIOS update.\n" 429 "******* Please consider a BIOS update.\n"
430 "******* Disabling USB legacy in the BIOS may also help.\n"; 430 "******* Disabling USB legacy in the BIOS may also help.\n";
431 #endif 431 #endif
432 432
433 /* 433 /*
434 * No vm86 mode in 64-bit mode: 434 * No vm86 mode in 64-bit mode:
435 */ 435 */
436 static inline void 436 static inline void
437 check_v8086_mode(struct pt_regs *regs, unsigned long address, 437 check_v8086_mode(struct pt_regs *regs, unsigned long address,
438 struct task_struct *tsk) 438 struct task_struct *tsk)
439 { 439 {
440 } 440 }
441 441
442 static int bad_address(void *p) 442 static int bad_address(void *p)
443 { 443 {
444 unsigned long dummy; 444 unsigned long dummy;
445 445
446 return probe_kernel_address((unsigned long *)p, dummy); 446 return probe_kernel_address((unsigned long *)p, dummy);
447 } 447 }
448 448
449 static void dump_pagetable(unsigned long address) 449 static void dump_pagetable(unsigned long address)
450 { 450 {
451 pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); 451 pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
452 pgd_t *pgd = base + pgd_index(address); 452 pgd_t *pgd = base + pgd_index(address);
453 pud_t *pud; 453 pud_t *pud;
454 pmd_t *pmd; 454 pmd_t *pmd;
455 pte_t *pte; 455 pte_t *pte;
456 456
457 if (bad_address(pgd)) 457 if (bad_address(pgd))
458 goto bad; 458 goto bad;
459 459
460 printk("PGD %lx ", pgd_val(*pgd)); 460 printk("PGD %lx ", pgd_val(*pgd));
461 461
462 if (!pgd_present(*pgd)) 462 if (!pgd_present(*pgd))
463 goto out; 463 goto out;
464 464
465 pud = pud_offset(pgd, address); 465 pud = pud_offset(pgd, address);
466 if (bad_address(pud)) 466 if (bad_address(pud))
467 goto bad; 467 goto bad;
468 468
469 printk("PUD %lx ", pud_val(*pud)); 469 printk("PUD %lx ", pud_val(*pud));
470 if (!pud_present(*pud) || pud_large(*pud)) 470 if (!pud_present(*pud) || pud_large(*pud))
471 goto out; 471 goto out;
472 472
473 pmd = pmd_offset(pud, address); 473 pmd = pmd_offset(pud, address);
474 if (bad_address(pmd)) 474 if (bad_address(pmd))
475 goto bad; 475 goto bad;
476 476
477 printk("PMD %lx ", pmd_val(*pmd)); 477 printk("PMD %lx ", pmd_val(*pmd));
478 if (!pmd_present(*pmd) || pmd_large(*pmd)) 478 if (!pmd_present(*pmd) || pmd_large(*pmd))
479 goto out; 479 goto out;
480 480
481 pte = pte_offset_kernel(pmd, address); 481 pte = pte_offset_kernel(pmd, address);
482 if (bad_address(pte)) 482 if (bad_address(pte))
483 goto bad; 483 goto bad;
484 484
485 printk("PTE %lx", pte_val(*pte)); 485 printk("PTE %lx", pte_val(*pte));
486 out: 486 out:
487 printk("\n"); 487 printk("\n");
488 return; 488 return;
489 bad: 489 bad:
490 printk("BAD\n"); 490 printk("BAD\n");
491 } 491 }
492 492
493 #endif /* CONFIG_X86_64 */ 493 #endif /* CONFIG_X86_64 */
494 494
495 /* 495 /*
496 * Workaround for K8 erratum #93 & buggy BIOS. 496 * Workaround for K8 erratum #93 & buggy BIOS.
497 * 497 *
498 * BIOS SMM functions are required to use a specific workaround 498 * BIOS SMM functions are required to use a specific workaround
499 * to avoid corruption of the 64bit RIP register on C stepping K8. 499 * to avoid corruption of the 64bit RIP register on C stepping K8.
500 * 500 *
501 * A lot of BIOS that didn't get tested properly miss this. 501 * A lot of BIOS that didn't get tested properly miss this.
502 * 502 *
503 * The OS sees this as a page fault with the upper 32bits of RIP cleared. 503 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
504 * Try to work around it here. 504 * Try to work around it here.
505 * 505 *
506 * Note we only handle faults in kernel here. 506 * Note we only handle faults in kernel here.
507 * Does nothing on 32-bit. 507 * Does nothing on 32-bit.
508 */ 508 */
509 static int is_errata93(struct pt_regs *regs, unsigned long address) 509 static int is_errata93(struct pt_regs *regs, unsigned long address)
510 { 510 {
511 #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) 511 #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
512 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD 512 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
513 || boot_cpu_data.x86 != 0xf) 513 || boot_cpu_data.x86 != 0xf)
514 return 0; 514 return 0;
515 515
516 if (address != regs->ip) 516 if (address != regs->ip)
517 return 0; 517 return 0;
518 518
519 if ((address >> 32) != 0) 519 if ((address >> 32) != 0)
520 return 0; 520 return 0;
521 521
522 address |= 0xffffffffUL << 32; 522 address |= 0xffffffffUL << 32;
523 if ((address >= (u64)_stext && address <= (u64)_etext) || 523 if ((address >= (u64)_stext && address <= (u64)_etext) ||
524 (address >= MODULES_VADDR && address <= MODULES_END)) { 524 (address >= MODULES_VADDR && address <= MODULES_END)) {
525 printk_once(errata93_warning); 525 printk_once(errata93_warning);
526 regs->ip = address; 526 regs->ip = address;
527 return 1; 527 return 1;
528 } 528 }
529 #endif 529 #endif
530 return 0; 530 return 0;
531 } 531 }
532 532
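is_errata93() repairs a RIP whose upper 32 bits were lost by a buggy K8 BIOS: a kernel-text address such as 0xffffffff8105a3c0 (an invented example) shows up as the 32-bit fault address 0x8105a3c0, and ORing 0xffffffff back into the top half recovers it before it is re-checked against the kernel text and module ranges.

    /*
     * Worked example of the errata #93 fix-up: reconstruct a kernel RIP whose
     * upper 32 bits were lost. The sample address is invented for illustration.
     */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t rip     = 0xffffffff8105a3c0ULL;   /* where the kernel really was */
        uint64_t address = rip & 0xffffffffULL;     /* what the buggy BIOS left in RIP */

        if ((address >> 32) == 0) {
            address |= 0xffffffffULL << 32;         /* same fix-up as is_errata93() */
            printf("recovered %#llx (matches: %d)\n",
                   (unsigned long long)address, address == rip);
        }
        return 0;
    }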
533 /* 533 /*
534 * Work around K8 erratum #100 K8 in compat mode occasionally jumps 534 * Work around K8 erratum #100 K8 in compat mode occasionally jumps
535 * to illegal addresses >4GB. 535 * to illegal addresses >4GB.
536 * 536 *
537 * We catch this in the page fault handler because these addresses 537 * We catch this in the page fault handler because these addresses
538 * are not reachable. Just detect this case and return. Any code 538 * are not reachable. Just detect this case and return. Any code
539 * segment in LDT is compatibility mode. 539 * segment in LDT is compatibility mode.
540 */ 540 */
541 static int is_errata100(struct pt_regs *regs, unsigned long address) 541 static int is_errata100(struct pt_regs *regs, unsigned long address)
542 { 542 {
543 #ifdef CONFIG_X86_64 543 #ifdef CONFIG_X86_64
544 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) 544 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
545 return 1; 545 return 1;
546 #endif 546 #endif
547 return 0; 547 return 0;
548 } 548 }
549 549
550 static int is_f00f_bug(struct pt_regs *regs, unsigned long address) 550 static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
551 { 551 {
552 #ifdef CONFIG_X86_F00F_BUG 552 #ifdef CONFIG_X86_F00F_BUG
553 unsigned long nr; 553 unsigned long nr;
554 554
555 /* 555 /*
556 * Pentium F0 0F C7 C8 bug workaround: 556 * Pentium F0 0F C7 C8 bug workaround:
557 */ 557 */
558 if (boot_cpu_data.f00f_bug) { 558 if (boot_cpu_data.f00f_bug) {
559 nr = (address - idt_descr.address) >> 3; 559 nr = (address - idt_descr.address) >> 3;
560 560
561 if (nr == 6) { 561 if (nr == 6) {
562 do_invalid_op(regs, 0); 562 do_invalid_op(regs, 0);
563 return 1; 563 return 1;
564 } 564 }
565 } 565 }
566 #endif 566 #endif
567 return 0; 567 return 0;
568 } 568 }
569 569
570 static const char nx_warning[] = KERN_CRIT 570 static const char nx_warning[] = KERN_CRIT
571 "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; 571 "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
572 572
573 static void 573 static void
574 show_fault_oops(struct pt_regs *regs, unsigned long error_code, 574 show_fault_oops(struct pt_regs *regs, unsigned long error_code,
575 unsigned long address) 575 unsigned long address)
576 { 576 {
577 if (!oops_may_print()) 577 if (!oops_may_print())
578 return; 578 return;
579 579
580 if (error_code & PF_INSTR) { 580 if (error_code & PF_INSTR) {
581 unsigned int level; 581 unsigned int level;
582 582
583 pte_t *pte = lookup_address(address, &level); 583 pte_t *pte = lookup_address(address, &level);
584 584
585 if (pte && pte_present(*pte) && !pte_exec(*pte)) 585 if (pte && pte_present(*pte) && !pte_exec(*pte))
586 printk(nx_warning, from_kuid(&init_user_ns, current_uid())); 586 printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
587 } 587 }
588 588
589 printk(KERN_ALERT "BUG: unable to handle kernel "); 589 printk(KERN_ALERT "BUG: unable to handle kernel ");
590 if (address < PAGE_SIZE) 590 if (address < PAGE_SIZE)
591 printk(KERN_CONT "NULL pointer dereference"); 591 printk(KERN_CONT "NULL pointer dereference");
592 else 592 else
593 printk(KERN_CONT "paging request"); 593 printk(KERN_CONT "paging request");
594 594
595 printk(KERN_CONT " at %p\n", (void *) address); 595 printk(KERN_CONT " at %p\n", (void *) address);
596 printk(KERN_ALERT "IP:"); 596 printk(KERN_ALERT "IP:");
597 printk_address(regs->ip, 1); 597 printk_address(regs->ip, 1);
598 598
599 dump_pagetable(address); 599 dump_pagetable(address);
600 } 600 }
601 601
602 static noinline void 602 static noinline void
603 pgtable_bad(struct pt_regs *regs, unsigned long error_code, 603 pgtable_bad(struct pt_regs *regs, unsigned long error_code,
604 unsigned long address) 604 unsigned long address)
605 { 605 {
606 struct task_struct *tsk; 606 struct task_struct *tsk;
607 unsigned long flags; 607 unsigned long flags;
608 int sig; 608 int sig;
609 609
610 flags = oops_begin(); 610 flags = oops_begin();
611 tsk = current; 611 tsk = current;
612 sig = SIGKILL; 612 sig = SIGKILL;
613 613
614 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 614 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
615 tsk->comm, address); 615 tsk->comm, address);
616 dump_pagetable(address); 616 dump_pagetable(address);
617 617
618 tsk->thread.cr2 = address; 618 tsk->thread.cr2 = address;
619 tsk->thread.trap_nr = X86_TRAP_PF; 619 tsk->thread.trap_nr = X86_TRAP_PF;
620 tsk->thread.error_code = error_code; 620 tsk->thread.error_code = error_code;
621 621
622 if (__die("Bad pagetable", regs, error_code)) 622 if (__die("Bad pagetable", regs, error_code))
623 sig = 0; 623 sig = 0;
624 624
625 oops_end(flags, regs, sig); 625 oops_end(flags, regs, sig);
626 } 626 }
627 627
628 static noinline void 628 static noinline void
629 no_context(struct pt_regs *regs, unsigned long error_code, 629 no_context(struct pt_regs *regs, unsigned long error_code,
630 unsigned long address, int signal, int si_code) 630 unsigned long address, int signal, int si_code)
631 { 631 {
632 struct task_struct *tsk = current; 632 struct task_struct *tsk = current;
633 unsigned long *stackend; 633 unsigned long *stackend;
634 unsigned long flags; 634 unsigned long flags;
635 int sig; 635 int sig;
636 636
637 /* Are we prepared to handle this kernel fault? */ 637 /* Are we prepared to handle this kernel fault? */
638 if (fixup_exception(regs)) { 638 if (fixup_exception(regs)) {
639 if (current_thread_info()->sig_on_uaccess_error && signal) { 639 if (current_thread_info()->sig_on_uaccess_error && signal) {
640 tsk->thread.trap_nr = X86_TRAP_PF; 640 tsk->thread.trap_nr = X86_TRAP_PF;
641 tsk->thread.error_code = error_code | PF_USER; 641 tsk->thread.error_code = error_code | PF_USER;
642 tsk->thread.cr2 = address; 642 tsk->thread.cr2 = address;
643 643
644 /* XXX: hwpoison faults will set the wrong code. */ 644 /* XXX: hwpoison faults will set the wrong code. */
645 force_sig_info_fault(signal, si_code, address, tsk, 0); 645 force_sig_info_fault(signal, si_code, address, tsk, 0);
646 } 646 }
647 return; 647 return;
648 } 648 }
649 649
650 /* 650 /*
651 * 32-bit: 651 * 32-bit:
652 * 652 *
653 * Valid to do another page fault here, because if this fault 653 * Valid to do another page fault here, because if this fault
654 * had been triggered by is_prefetch fixup_exception would have 654 * had been triggered by is_prefetch fixup_exception would have
655 * handled it. 655 * handled it.
656 * 656 *
657 * 64-bit: 657 * 64-bit:
658 * 658 *
659 * Hall of shame of CPU/BIOS bugs. 659 * Hall of shame of CPU/BIOS bugs.
660 */ 660 */
661 if (is_prefetch(regs, error_code, address)) 661 if (is_prefetch(regs, error_code, address))
662 return; 662 return;
663 663
664 if (is_errata93(regs, address)) 664 if (is_errata93(regs, address))
665 return; 665 return;
666 666
667 /* 667 /*
668 * Oops. The kernel tried to access some bad page. We'll have to 668 * Oops. The kernel tried to access some bad page. We'll have to
669 * terminate things with extreme prejudice: 669 * terminate things with extreme prejudice:
670 */ 670 */
671 flags = oops_begin(); 671 flags = oops_begin();
672 672
673 show_fault_oops(regs, error_code, address); 673 show_fault_oops(regs, error_code, address);
674 674
675 stackend = end_of_stack(tsk); 675 stackend = end_of_stack(tsk);
676 if (tsk != &init_task && *stackend != STACK_END_MAGIC) 676 if (tsk != &init_task && *stackend != STACK_END_MAGIC)
677 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); 677 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
678 678
679 tsk->thread.cr2 = address; 679 tsk->thread.cr2 = address;
680 tsk->thread.trap_nr = X86_TRAP_PF; 680 tsk->thread.trap_nr = X86_TRAP_PF;
681 tsk->thread.error_code = error_code; 681 tsk->thread.error_code = error_code;
682 682
683 sig = SIGKILL; 683 sig = SIGKILL;
684 if (__die("Oops", regs, error_code)) 684 if (__die("Oops", regs, error_code))
685 sig = 0; 685 sig = 0;
686 686
687 /* Executive summary in case the body of the oops scrolled away */ 687 /* Executive summary in case the body of the oops scrolled away */
688 printk(KERN_DEFAULT "CR2: %016lx\n", address); 688 printk(KERN_DEFAULT "CR2: %016lx\n", address);
689 689
690 oops_end(flags, regs, sig); 690 oops_end(flags, regs, sig);
691 } 691 }
692 692
693 /* 693 /*
694 * Print out info about fatal segfaults, if the show_unhandled_signals 694 * Print out info about fatal segfaults, if the show_unhandled_signals
695 * sysctl is set: 695 * sysctl is set:
696 */ 696 */
697 static inline void 697 static inline void
698 show_signal_msg(struct pt_regs *regs, unsigned long error_code, 698 show_signal_msg(struct pt_regs *regs, unsigned long error_code,
699 unsigned long address, struct task_struct *tsk) 699 unsigned long address, struct task_struct *tsk)
700 { 700 {
701 if (!unhandled_signal(tsk, SIGSEGV)) 701 if (!unhandled_signal(tsk, SIGSEGV))
702 return; 702 return;
703 703
704 if (!printk_ratelimit()) 704 if (!printk_ratelimit())
705 return; 705 return;
706 706
707 printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", 707 printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
708 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, 708 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
709 tsk->comm, task_pid_nr(tsk), address, 709 tsk->comm, task_pid_nr(tsk), address,
710 (void *)regs->ip, (void *)regs->sp, error_code); 710 (void *)regs->ip, (void *)regs->sp, error_code);
711 711
712 print_vma_addr(KERN_CONT " in ", regs->ip); 712 print_vma_addr(KERN_CONT " in ", regs->ip);
713 713
714 printk(KERN_CONT "\n"); 714 printk(KERN_CONT "\n");
715 } 715 }
716 716
717 static void 717 static void
718 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, 718 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
719 unsigned long address, int si_code) 719 unsigned long address, int si_code)
720 { 720 {
721 struct task_struct *tsk = current; 721 struct task_struct *tsk = current;
722 722
723 /* User mode accesses just cause a SIGSEGV */ 723 /* User mode accesses just cause a SIGSEGV */
724 if (error_code & PF_USER) { 724 if (error_code & PF_USER) {
725 /* 725 /*
726 * It's possible to have interrupts off here: 726 * It's possible to have interrupts off here:
727 */ 727 */
728 local_irq_enable(); 728 local_irq_enable();
729 729
730 /* 730 /*
731 * Valid to do another page fault here because this one came 731 * Valid to do another page fault here because this one came
732 * from user space: 732 * from user space:
733 */ 733 */
734 if (is_prefetch(regs, error_code, address)) 734 if (is_prefetch(regs, error_code, address))
735 return; 735 return;
736 736
737 if (is_errata100(regs, address)) 737 if (is_errata100(regs, address))
738 return; 738 return;
739 739
740 #ifdef CONFIG_X86_64 740 #ifdef CONFIG_X86_64
741 /* 741 /*
742 * Instruction fetch faults in the vsyscall page might need 742 * Instruction fetch faults in the vsyscall page might need
743 * emulation. 743 * emulation.
744 */ 744 */
745 if (unlikely((error_code & PF_INSTR) && 745 if (unlikely((error_code & PF_INSTR) &&
746 ((address & ~0xfff) == VSYSCALL_START))) { 746 ((address & ~0xfff) == VSYSCALL_START))) {
747 if (emulate_vsyscall(regs, address)) 747 if (emulate_vsyscall(regs, address))
748 return; 748 return;
749 } 749 }
750 #endif 750 #endif
751 /* Kernel addresses are always protection faults: */ 751 /* Kernel addresses are always protection faults: */
752 if (address >= TASK_SIZE) 752 if (address >= TASK_SIZE)
753 error_code |= PF_PROT; 753 error_code |= PF_PROT;
754 754
755 if (likely(show_unhandled_signals)) 755 if (likely(show_unhandled_signals))
756 show_signal_msg(regs, error_code, address, tsk); 756 show_signal_msg(regs, error_code, address, tsk);
757 757
758 tsk->thread.cr2 = address; 758 tsk->thread.cr2 = address;
759 tsk->thread.error_code = error_code; 759 tsk->thread.error_code = error_code;
760 tsk->thread.trap_nr = X86_TRAP_PF; 760 tsk->thread.trap_nr = X86_TRAP_PF;
761 761
762 force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); 762 force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
763 763
764 return; 764 return;
765 } 765 }
766 766
767 if (is_f00f_bug(regs, address)) 767 if (is_f00f_bug(regs, address))
768 return; 768 return;
769 769
770 no_context(regs, error_code, address, SIGSEGV, si_code); 770 no_context(regs, error_code, address, SIGSEGV, si_code);
771 } 771 }
772 772
773 static noinline void 773 static noinline void
774 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, 774 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
775 unsigned long address) 775 unsigned long address)
776 { 776 {
777 __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); 777 __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
778 } 778 }
779 779
780 static void 780 static void
781 __bad_area(struct pt_regs *regs, unsigned long error_code, 781 __bad_area(struct pt_regs *regs, unsigned long error_code,
782 unsigned long address, int si_code) 782 unsigned long address, int si_code)
783 { 783 {
784 struct mm_struct *mm = current->mm; 784 struct mm_struct *mm = current->mm;
785 785
786 /* 786 /*
787 * Something tried to access memory that isn't in our memory map.. 787 * Something tried to access memory that isn't in our memory map..
788 * Fix it, but check if it's kernel or user first.. 788 * Fix it, but check if it's kernel or user first..
789 */ 789 */
790 up_read(&mm->mmap_sem); 790 up_read(&mm->mmap_sem);
791 791
792 __bad_area_nosemaphore(regs, error_code, address, si_code); 792 __bad_area_nosemaphore(regs, error_code, address, si_code);
793 } 793 }
794 794
795 static noinline void 795 static noinline void
796 bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) 796 bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
797 { 797 {
798 __bad_area(regs, error_code, address, SEGV_MAPERR); 798 __bad_area(regs, error_code, address, SEGV_MAPERR);
799 } 799 }
800 800
801 static noinline void 801 static noinline void
802 bad_area_access_error(struct pt_regs *regs, unsigned long error_code, 802 bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
803 unsigned long address) 803 unsigned long address)
804 { 804 {
805 __bad_area(regs, error_code, address, SEGV_ACCERR); 805 __bad_area(regs, error_code, address, SEGV_ACCERR);
806 } 806 }
807 807
808 static void 808 static void
809 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, 809 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
810 unsigned int fault) 810 unsigned int fault)
811 { 811 {
812 struct task_struct *tsk = current; 812 struct task_struct *tsk = current;
813 struct mm_struct *mm = tsk->mm; 813 struct mm_struct *mm = tsk->mm;
814 int code = BUS_ADRERR; 814 int code = BUS_ADRERR;
815 815
816 up_read(&mm->mmap_sem); 816 up_read(&mm->mmap_sem);
817 817
818 /* Kernel mode? Handle exceptions or die: */ 818 /* Kernel mode? Handle exceptions or die: */
819 if (!(error_code & PF_USER)) { 819 if (!(error_code & PF_USER)) {
820 no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); 820 no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
821 return; 821 return;
822 } 822 }
823 823
824 /* User-space => ok to do another page fault: */ 824 /* User-space => ok to do another page fault: */
825 if (is_prefetch(regs, error_code, address)) 825 if (is_prefetch(regs, error_code, address))
826 return; 826 return;
827 827
828 tsk->thread.cr2 = address; 828 tsk->thread.cr2 = address;
829 tsk->thread.error_code = error_code; 829 tsk->thread.error_code = error_code;
830 tsk->thread.trap_nr = X86_TRAP_PF; 830 tsk->thread.trap_nr = X86_TRAP_PF;
831 831
832 #ifdef CONFIG_MEMORY_FAILURE 832 #ifdef CONFIG_MEMORY_FAILURE
833 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { 833 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
834 printk(KERN_ERR 834 printk(KERN_ERR
835 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", 835 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
836 tsk->comm, tsk->pid, address); 836 tsk->comm, tsk->pid, address);
837 code = BUS_MCEERR_AR; 837 code = BUS_MCEERR_AR;
838 } 838 }
839 #endif 839 #endif
840 force_sig_info_fault(SIGBUS, code, address, tsk, fault); 840 force_sig_info_fault(SIGBUS, code, address, tsk, fault);
841 } 841 }
842 842
843 static noinline int 843 static noinline int
844 mm_fault_error(struct pt_regs *regs, unsigned long error_code, 844 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
845 unsigned long address, unsigned int fault) 845 unsigned long address, unsigned int fault)
846 { 846 {
847 /* 847 /*
848 * Pagefault was interrupted by SIGKILL. We have no reason to 848 * Pagefault was interrupted by SIGKILL. We have no reason to
849 * continue pagefault. 849 * continue pagefault.
850 */ 850 */
851 if (fatal_signal_pending(current)) { 851 if (fatal_signal_pending(current)) {
852 if (!(fault & VM_FAULT_RETRY)) 852 if (!(fault & VM_FAULT_RETRY))
853 up_read(&current->mm->mmap_sem); 853 up_read(&current->mm->mmap_sem);
854 if (!(error_code & PF_USER)) 854 if (!(error_code & PF_USER))
855 no_context(regs, error_code, address, 0, 0); 855 no_context(regs, error_code, address, 0, 0);
856 return 1; 856 return 1;
857 } 857 }
858 if (!(fault & VM_FAULT_ERROR)) 858 if (!(fault & VM_FAULT_ERROR))
859 return 0; 859 return 0;
860 860
861 if (fault & VM_FAULT_OOM) { 861 if (fault & VM_FAULT_OOM) {
862 /* Kernel mode? Handle exceptions or die: */ 862 /* Kernel mode? Handle exceptions or die: */
863 if (!(error_code & PF_USER)) { 863 if (!(error_code & PF_USER)) {
864 up_read(&current->mm->mmap_sem); 864 up_read(&current->mm->mmap_sem);
865 no_context(regs, error_code, address, 865 no_context(regs, error_code, address,
866 SIGSEGV, SEGV_MAPERR); 866 SIGSEGV, SEGV_MAPERR);
867 return 1; 867 return 1;
868 } 868 }
869 869
870 up_read(&current->mm->mmap_sem); 870 up_read(&current->mm->mmap_sem);
871 871
872 /* 872 /*
873 * We ran out of memory, call the OOM killer, and return the 873 * We ran out of memory, call the OOM killer, and return the
874 * userspace (which will retry the fault, or kill us if we got 874 * userspace (which will retry the fault, or kill us if we got
875 * oom-killed): 875 * oom-killed):
876 */ 876 */
877 pagefault_out_of_memory(); 877 pagefault_out_of_memory();
878 } else { 878 } else {
879 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| 879 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
880 VM_FAULT_HWPOISON_LARGE)) 880 VM_FAULT_HWPOISON_LARGE))
881 do_sigbus(regs, error_code, address, fault); 881 do_sigbus(regs, error_code, address, fault);
882 else 882 else
883 BUG(); 883 BUG();
884 } 884 }
885 return 1; 885 return 1;
886 } 886 }
887 887
888 static int spurious_fault_check(unsigned long error_code, pte_t *pte) 888 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
889 { 889 {
890 if ((error_code & PF_WRITE) && !pte_write(*pte)) 890 if ((error_code & PF_WRITE) && !pte_write(*pte))
891 return 0; 891 return 0;
892 892
893 if ((error_code & PF_INSTR) && !pte_exec(*pte)) 893 if ((error_code & PF_INSTR) && !pte_exec(*pte))
894 return 0; 894 return 0;
895 895
896 return 1; 896 return 1;
897 } 897 }
898 898
899 /* 899 /*
900 * Handle a spurious fault caused by a stale TLB entry. 900 * Handle a spurious fault caused by a stale TLB entry.
901 * 901 *
902 * This allows us to lazily refresh the TLB when increasing the 902 * This allows us to lazily refresh the TLB when increasing the
903 * permissions of a kernel page (RO -> RW or NX -> X). Doing it 903 * permissions of a kernel page (RO -> RW or NX -> X). Doing it
904 * eagerly is very expensive since that implies doing a full 904 * eagerly is very expensive since that implies doing a full
905 * cross-processor TLB flush, even if no stale TLB entries exist 905 * cross-processor TLB flush, even if no stale TLB entries exist
906 * on other processors. 906 * on other processors.
907 * 907 *
908 * There are no security implications to leaving a stale TLB when 908 * There are no security implications to leaving a stale TLB when
909 * increasing the permissions on a page. 909 * increasing the permissions on a page.
910 */ 910 */
911 static noinline __kprobes int 911 static noinline __kprobes int
912 spurious_fault(unsigned long error_code, unsigned long address) 912 spurious_fault(unsigned long error_code, unsigned long address)
913 { 913 {
914 pgd_t *pgd; 914 pgd_t *pgd;
915 pud_t *pud; 915 pud_t *pud;
916 pmd_t *pmd; 916 pmd_t *pmd;
917 pte_t *pte; 917 pte_t *pte;
918 int ret; 918 int ret;
919 919
920 /* Reserved-bit violation or user access to kernel space? */ 920 /* Reserved-bit violation or user access to kernel space? */
921 if (error_code & (PF_USER | PF_RSVD)) 921 if (error_code & (PF_USER | PF_RSVD))
922 return 0; 922 return 0;
923 923
924 pgd = init_mm.pgd + pgd_index(address); 924 pgd = init_mm.pgd + pgd_index(address);
925 if (!pgd_present(*pgd)) 925 if (!pgd_present(*pgd))
926 return 0; 926 return 0;
927 927
928 pud = pud_offset(pgd, address); 928 pud = pud_offset(pgd, address);
929 if (!pud_present(*pud)) 929 if (!pud_present(*pud))
930 return 0; 930 return 0;
931 931
932 if (pud_large(*pud)) 932 if (pud_large(*pud))
933 return spurious_fault_check(error_code, (pte_t *) pud); 933 return spurious_fault_check(error_code, (pte_t *) pud);
934 934
935 pmd = pmd_offset(pud, address); 935 pmd = pmd_offset(pud, address);
936 if (!pmd_present(*pmd)) 936 if (!pmd_present(*pmd))
937 return 0; 937 return 0;
938 938
939 if (pmd_large(*pmd)) 939 if (pmd_large(*pmd))
940 return spurious_fault_check(error_code, (pte_t *) pmd); 940 return spurious_fault_check(error_code, (pte_t *) pmd);
941 941
942 pte = pte_offset_kernel(pmd, address); 942 pte = pte_offset_kernel(pmd, address);
943 if (!pte_present(*pte)) 943 if (!pte_present(*pte))
944 return 0; 944 return 0;
945 945
946 ret = spurious_fault_check(error_code, pte); 946 ret = spurious_fault_check(error_code, pte);
947 if (!ret) 947 if (!ret)
948 return 0; 948 return 0;
949 949
950 /* 950 /*
951 * Make sure we have permissions in PMD. 951 * Make sure we have permissions in PMD.
952 * If not, then there's a bug in the page tables: 952 * If not, then there's a bug in the page tables:
953 */ 953 */
954 ret = spurious_fault_check(error_code, (pte_t *) pmd); 954 ret = spurious_fault_check(error_code, (pte_t *) pmd);
955 WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); 955 WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
956 956
957 return ret; 957 return ret;
958 } 958 }
959 959
960 int show_unhandled_signals = 1; 960 int show_unhandled_signals = 1;
961 961
962 static inline int 962 static inline int
963 access_error(unsigned long error_code, struct vm_area_struct *vma) 963 access_error(unsigned long error_code, struct vm_area_struct *vma)
964 { 964 {
965 if (error_code & PF_WRITE) { 965 if (error_code & PF_WRITE) {
966 /* write, present and write, not present: */ 966 /* write, present and write, not present: */
967 if (unlikely(!(vma->vm_flags & VM_WRITE))) 967 if (unlikely(!(vma->vm_flags & VM_WRITE)))
968 return 1; 968 return 1;
969 return 0; 969 return 0;
970 } 970 }
971 971
972 /* read, present: */ 972 /* read, present: */
973 if (unlikely(error_code & PF_PROT)) 973 if (unlikely(error_code & PF_PROT))
974 return 1; 974 return 1;
975 975
976 /* read, not present: */ 976 /* read, not present: */
977 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) 977 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
978 return 1; 978 return 1;
979 979
980 return 0; 980 return 0;
981 } 981 }
982 982
983 static int fault_in_kernel_space(unsigned long address) 983 static int fault_in_kernel_space(unsigned long address)
984 { 984 {
985 return address >= TASK_SIZE_MAX; 985 return address >= TASK_SIZE_MAX;
986 } 986 }
987 987
988 static inline bool smap_violation(int error_code, struct pt_regs *regs) 988 static inline bool smap_violation(int error_code, struct pt_regs *regs)
989 { 989 {
990 if (error_code & PF_USER) 990 if (error_code & PF_USER)
991 return false; 991 return false;
992 992
993 if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) 993 if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC))
994 return false; 994 return false;
995 995
996 return true; 996 return true;
997 } 997 }
998 998
999 /* 999 /*
1000 * This routine handles page faults. It determines the address, 1000 * This routine handles page faults. It determines the address,
1001 * and the problem, and then passes it off to one of the appropriate 1001 * and the problem, and then passes it off to one of the appropriate
1002 * routines. 1002 * routines.
1003 */ 1003 */
1004 static void __kprobes 1004 static void __kprobes
1005 __do_page_fault(struct pt_regs *regs, unsigned long error_code) 1005 __do_page_fault(struct pt_regs *regs, unsigned long error_code)
1006 { 1006 {
1007 struct vm_area_struct *vma; 1007 struct vm_area_struct *vma;
1008 struct task_struct *tsk; 1008 struct task_struct *tsk;
1009 unsigned long address; 1009 unsigned long address;
1010 struct mm_struct *mm; 1010 struct mm_struct *mm;
1011 int fault; 1011 int fault;
1012 int write = error_code & PF_WRITE; 1012 int write = error_code & PF_WRITE;
1013 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 1013 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
1014 (write ? FAULT_FLAG_WRITE : 0); 1014 (write ? FAULT_FLAG_WRITE : 0);
1015 1015
1016 tsk = current; 1016 tsk = current;
1017 mm = tsk->mm; 1017 mm = tsk->mm;
1018 1018
1019 /* Get the faulting address: */ 1019 /* Get the faulting address: */
1020 address = read_cr2(); 1020 address = read_cr2();
1021 1021
1022 /* 1022 /*
1023 * Detect and handle instructions that would cause a page fault for 1023 * Detect and handle instructions that would cause a page fault for
1024 * both a tracked kernel page and a userspace page. 1024 * both a tracked kernel page and a userspace page.
1025 */ 1025 */
1026 if (kmemcheck_active(regs)) 1026 if (kmemcheck_active(regs))
1027 kmemcheck_hide(regs); 1027 kmemcheck_hide(regs);
1028 prefetchw(&mm->mmap_sem); 1028 prefetchw(&mm->mmap_sem);
1029 1029
1030 if (unlikely(kmmio_fault(regs, address))) 1030 if (unlikely(kmmio_fault(regs, address)))
1031 return; 1031 return;
1032 1032
1033 /* 1033 /*
1034 * We fault-in kernel-space virtual memory on-demand. The 1034 * We fault-in kernel-space virtual memory on-demand. The
1035 * 'reference' page table is init_mm.pgd. 1035 * 'reference' page table is init_mm.pgd.
1036 * 1036 *
1037 * NOTE! We MUST NOT take any locks for this case. We may 1037 * NOTE! We MUST NOT take any locks for this case. We may
1038 * be in an interrupt or a critical region, and should 1038 * be in an interrupt or a critical region, and should
1039 * only copy the information from the master page table, 1039 * only copy the information from the master page table,
1040 * nothing more. 1040 * nothing more.
1041 * 1041 *
1042 * This verifies that the fault happens in kernel space 1042 * This verifies that the fault happens in kernel space
1043 * (error_code & 4) == 0, and that the fault was not a 1043 * (error_code & 4) == 0, and that the fault was not a
1044 * protection error (error_code & 9) == 0. 1044 * protection error (error_code & 9) == 0.
1045 */ 1045 */
1046 if (unlikely(fault_in_kernel_space(address))) { 1046 if (unlikely(fault_in_kernel_space(address))) {
1047 if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { 1047 if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
1048 if (vmalloc_fault(address) >= 0) 1048 if (vmalloc_fault(address) >= 0)
1049 return; 1049 return;
1050 1050
1051 if (kmemcheck_fault(regs, address, error_code)) 1051 if (kmemcheck_fault(regs, address, error_code))
1052 return; 1052 return;
1053 } 1053 }
1054 1054
1055 /* Can handle a stale RO->RW TLB: */ 1055 /* Can handle a stale RO->RW TLB: */
1056 if (spurious_fault(error_code, address)) 1056 if (spurious_fault(error_code, address))
1057 return; 1057 return;
1058 1058
1059 /* kprobes don't want to hook the spurious faults: */ 1059 /* kprobes don't want to hook the spurious faults: */
1060 if (notify_page_fault(regs)) 1060 if (notify_page_fault(regs))
1061 return; 1061 return;
1062 /* 1062 /*
1063 * Don't take the mm semaphore here. If we fixup a prefetch 1063 * Don't take the mm semaphore here. If we fixup a prefetch
1064 * fault we could otherwise deadlock: 1064 * fault we could otherwise deadlock:
1065 */ 1065 */
1066 bad_area_nosemaphore(regs, error_code, address); 1066 bad_area_nosemaphore(regs, error_code, address);
1067 1067
1068 return; 1068 return;
1069 } 1069 }
1070 1070
1071 /* kprobes don't want to hook the spurious faults: */ 1071 /* kprobes don't want to hook the spurious faults: */
1072 if (unlikely(notify_page_fault(regs))) 1072 if (unlikely(notify_page_fault(regs)))
1073 return; 1073 return;
1074 /* 1074 /*
1075 * It's safe to allow irq's after cr2 has been saved and the 1075 * It's safe to allow irq's after cr2 has been saved and the
1076 * vmalloc fault has been handled. 1076 * vmalloc fault has been handled.
1077 * 1077 *
1078 * User-mode registers count as a user access even for any 1078 * User-mode registers count as a user access even for any
1079 * potential system fault or CPU buglet: 1079 * potential system fault or CPU buglet:
1080 */ 1080 */
1081 if (user_mode_vm(regs)) { 1081 if (user_mode_vm(regs)) {
1082 local_irq_enable(); 1082 local_irq_enable();
1083 error_code |= PF_USER; 1083 error_code |= PF_USER;
1084 } else { 1084 } else {
1085 if (regs->flags & X86_EFLAGS_IF) 1085 if (regs->flags & X86_EFLAGS_IF)
1086 local_irq_enable(); 1086 local_irq_enable();
1087 } 1087 }
1088 1088
1089 if (unlikely(error_code & PF_RSVD)) 1089 if (unlikely(error_code & PF_RSVD))
1090 pgtable_bad(regs, error_code, address); 1090 pgtable_bad(regs, error_code, address);
1091 1091
1092 if (static_cpu_has(X86_FEATURE_SMAP)) { 1092 if (static_cpu_has(X86_FEATURE_SMAP)) {
1093 if (unlikely(smap_violation(error_code, regs))) { 1093 if (unlikely(smap_violation(error_code, regs))) {
1094 bad_area_nosemaphore(regs, error_code, address); 1094 bad_area_nosemaphore(regs, error_code, address);
1095 return; 1095 return;
1096 } 1096 }
1097 } 1097 }
1098 1098
1099 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); 1099 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
1100 1100
1101 /* 1101 /*
1102 * If we're in an interrupt, have no user context or are running 1102 * If we're in an interrupt, have no user context or are running
1103 * in an atomic region then we must not take the fault: 1103 * in an atomic region then we must not take the fault:
1104 */ 1104 */
1105 if (unlikely(in_atomic() || !mm)) { 1105 if (unlikely(in_atomic() || !mm)) {
1106 bad_area_nosemaphore(regs, error_code, address); 1106 bad_area_nosemaphore(regs, error_code, address);
1107 return; 1107 return;
1108 } 1108 }
1109 1109
1110 /* 1110 /*
1111 * When running in the kernel we expect faults to occur only to 1111 * When running in the kernel we expect faults to occur only to
1112 * addresses in user space. All other faults represent errors in 1112 * addresses in user space. All other faults represent errors in
1113 * the kernel and should generate an OOPS. Unfortunately, in the 1113 * the kernel and should generate an OOPS. Unfortunately, in the
1114 * case of an erroneous fault occurring in a code path which already 1114 * case of an erroneous fault occurring in a code path which already
1115 * holds mmap_sem we will deadlock attempting to validate the fault 1115 * holds mmap_sem we will deadlock attempting to validate the fault
1116 * against the address space. Luckily the kernel only validly 1116 * against the address space. Luckily the kernel only validly
1117 * references user space from well defined areas of code, which are 1117 * references user space from well defined areas of code, which are
1118 * listed in the exceptions table. 1118 * listed in the exceptions table.
1119 * 1119 *
1120 * As the vast majority of faults will be valid we will only perform 1120 * As the vast majority of faults will be valid we will only perform
1121 * the source reference check when there is a possibility of a 1121 * the source reference check when there is a possibility of a
1122 * deadlock. Attempt to lock the address space, if we cannot we then 1122 * deadlock. Attempt to lock the address space, if we cannot we then
1123 * validate the source. If this is invalid we can skip the address 1123 * validate the source. If this is invalid we can skip the address
1124 * space check, thus avoiding the deadlock: 1124 * space check, thus avoiding the deadlock:
1125 */ 1125 */
1126 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 1126 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
1127 if ((error_code & PF_USER) == 0 && 1127 if ((error_code & PF_USER) == 0 &&
1128 !search_exception_tables(regs->ip)) { 1128 !search_exception_tables(regs->ip)) {
1129 bad_area_nosemaphore(regs, error_code, address); 1129 bad_area_nosemaphore(regs, error_code, address);
1130 return; 1130 return;
1131 } 1131 }
1132 retry: 1132 retry:
1133 down_read(&mm->mmap_sem); 1133 down_read(&mm->mmap_sem);
1134 } else { 1134 } else {
1135 /* 1135 /*
1136 * The above down_read_trylock() might have succeeded in 1136 * The above down_read_trylock() might have succeeded in
1137 * which case we'll have missed the might_sleep() from 1137 * which case we'll have missed the might_sleep() from
1138 * down_read(): 1138 * down_read():
1139 */ 1139 */
1140 might_sleep(); 1140 might_sleep();
1141 } 1141 }
1142 1142
1143 vma = find_vma(mm, address); 1143 vma = find_vma(mm, address);
1144 if (unlikely(!vma)) { 1144 if (unlikely(!vma)) {
1145 bad_area(regs, error_code, address); 1145 bad_area(regs, error_code, address);
1146 return; 1146 return;
1147 } 1147 }
1148 if (likely(vma->vm_start <= address)) 1148 if (likely(vma->vm_start <= address))
1149 goto good_area; 1149 goto good_area;
1150 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { 1150 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
1151 bad_area(regs, error_code, address); 1151 bad_area(regs, error_code, address);
1152 return; 1152 return;
1153 } 1153 }
1154 if (error_code & PF_USER) { 1154 if (error_code & PF_USER) {
1155 /* 1155 /*
1156 * Accessing the stack below %sp is always a bug. 1156 * Accessing the stack below %sp is always a bug.
1157 * The large cushion allows instructions like enter 1157 * The large cushion allows instructions like enter
1158 * and pusha to work. ("enter $65535, $31" pushes 1158 * and pusha to work. ("enter $65535, $31" pushes
1159 * 32 pointers and then decrements %sp by 65535.) 1159 * 32 pointers and then decrements %sp by 65535.)
1160 */ 1160 */
1161 if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { 1161 if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
1162 bad_area(regs, error_code, address); 1162 bad_area(regs, error_code, address);
1163 return; 1163 return;
1164 } 1164 }
1165 } 1165 }
1166 if (unlikely(expand_stack(vma, address))) { 1166 if (unlikely(expand_stack(vma, address))) {
1167 bad_area(regs, error_code, address); 1167 bad_area(regs, error_code, address);
1168 return; 1168 return;
1169 } 1169 }
1170 1170
1171 /* 1171 /*
1172 * Ok, we have a good vm_area for this memory access, so 1172 * Ok, we have a good vm_area for this memory access, so
1173 * we can handle it.. 1173 * we can handle it..
1174 */ 1174 */
1175 good_area: 1175 good_area:
1176 if (unlikely(access_error(error_code, vma))) { 1176 if (unlikely(access_error(error_code, vma))) {
1177 bad_area_access_error(regs, error_code, address); 1177 bad_area_access_error(regs, error_code, address);
1178 return; 1178 return;
1179 } 1179 }
1180 1180
1181 /* 1181 /*
1182 * If for any reason at all we couldn't handle the fault, 1182 * If for any reason at all we couldn't handle the fault,
1183 * make sure we exit gracefully rather than endlessly redo 1183 * make sure we exit gracefully rather than endlessly redo
1184 * the fault: 1184 * the fault:
1185 */ 1185 */
1186 fault = handle_mm_fault(mm, vma, address, flags); 1186 fault = handle_mm_fault(mm, vma, address, flags);
1187 1187
1188 if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { 1188 if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
1189 if (mm_fault_error(regs, error_code, address, fault)) 1189 if (mm_fault_error(regs, error_code, address, fault))
1190 return; 1190 return;
1191 } 1191 }
1192 1192
1193 /* 1193 /*
1194 * Major/minor page fault accounting is only done on the 1194 * Major/minor page fault accounting is only done on the
1195 * initial attempt. If we go through a retry, it is extremely 1195 * initial attempt. If we go through a retry, it is extremely
1196 * likely that the page will be found in page cache at that point. 1196 * likely that the page will be found in page cache at that point.
1197 */ 1197 */
1198 if (flags & FAULT_FLAG_ALLOW_RETRY) { 1198 if (flags & FAULT_FLAG_ALLOW_RETRY) {
1199 if (fault & VM_FAULT_MAJOR) { 1199 if (fault & VM_FAULT_MAJOR) {
1200 tsk->maj_flt++; 1200 tsk->maj_flt++;
1201 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 1201 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
1202 regs, address); 1202 regs, address);
1203 } else { 1203 } else {
1204 tsk->min_flt++; 1204 tsk->min_flt++;
1205 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 1205 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
1206 regs, address); 1206 regs, address);
1207 } 1207 }
1208 if (fault & VM_FAULT_RETRY) { 1208 if (fault & VM_FAULT_RETRY) {
1209 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk 1209 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
1210 * of starvation. */ 1210 * of starvation. */
1211 flags &= ~FAULT_FLAG_ALLOW_RETRY; 1211 flags &= ~FAULT_FLAG_ALLOW_RETRY;
1212 flags |= FAULT_FLAG_TRIED; 1212 flags |= FAULT_FLAG_TRIED;
1213 goto retry; 1213 goto retry;
1214 } 1214 }
1215 } 1215 }
1216 1216
1217 check_v8086_mode(regs, address, tsk); 1217 check_v8086_mode(regs, address, tsk);
1218 1218
1219 up_read(&mm->mmap_sem); 1219 up_read(&mm->mmap_sem);
1220 } 1220 }
1221 1221
1222 dotraplinkage void __kprobes 1222 dotraplinkage void __kprobes
1223 do_page_fault(struct pt_regs *regs, unsigned long error_code) 1223 do_page_fault(struct pt_regs *regs, unsigned long error_code)
1224 { 1224 {
1225 exception_enter(regs); 1225 exception_enter(regs);
1226 __do_page_fault(regs, error_code); 1226 __do_page_fault(regs, error_code);
1227 exception_exit(regs); 1227 exception_exit(regs);
1228 } 1228 }
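
The only change to the fault handler itself is the wrapper above: do_page_fault() brackets __do_page_fault() with exception_enter()/exception_exit(), which the generic include/linux/context_tracking.h shown next now provides. As a rough sketch of the pattern an exception handler follows with these helpers (the handler and helper names below are hypothetical, not part of this commit):

    #include <linux/context_tracking.h>

    /*
     * Hypothetical trap handler illustrating the generic pattern;
     * handle_example_trap() stands in for the arch-specific work.
     */
    void do_example_trap(struct pt_regs *regs, unsigned long error_code)
    {
    	exception_enter(regs);		/* user_exit(): leave user-mode tracking (ignored when tracking is inactive) */
    	handle_example_trap(regs, error_code);
    	exception_exit(regs);		/* user_enter() again, but only if the trap interrupted user mode */
    }
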
include/linux/context_tracking.h
1 #ifndef _LINUX_CONTEXT_TRACKING_H 1 #ifndef _LINUX_CONTEXT_TRACKING_H
2 #define _LINUX_CONTEXT_TRACKING_H 2 #define _LINUX_CONTEXT_TRACKING_H
3 3
4 #ifdef CONFIG_CONTEXT_TRACKING
5 #include <linux/sched.h> 4 #include <linux/sched.h>
6 #include <linux/percpu.h> 5 #include <linux/percpu.h>
6 #include <asm/ptrace.h>
7 7
8 #ifdef CONFIG_CONTEXT_TRACKING
8 struct context_tracking { 9 struct context_tracking {
9 /* 10 /*
10 * When active is false, probes are unset in order 11 * When active is false, probes are unset in order
11 * to minimize overhead: TIF flags are cleared 12 * to minimize overhead: TIF flags are cleared
12 * and calls to user_enter/exit are ignored. This 13 * and calls to user_enter/exit are ignored. This
13 * may be further optimized using static keys. 14 * may be further optimized using static keys.
14 */ 15 */
15 bool active; 16 bool active;
16 enum { 17 enum {
17 IN_KERNEL = 0, 18 IN_KERNEL = 0,
18 IN_USER, 19 IN_USER,
19 } state; 20 } state;
20 }; 21 };
21 22
22 DECLARE_PER_CPU(struct context_tracking, context_tracking); 23 DECLARE_PER_CPU(struct context_tracking, context_tracking);
23 24
24 static inline bool context_tracking_in_user(void) 25 static inline bool context_tracking_in_user(void)
25 { 26 {
26 return __this_cpu_read(context_tracking.state) == IN_USER; 27 return __this_cpu_read(context_tracking.state) == IN_USER;
27 } 28 }
28 29
29 static inline bool context_tracking_active(void) 30 static inline bool context_tracking_active(void)
30 { 31 {
31 return __this_cpu_read(context_tracking.active); 32 return __this_cpu_read(context_tracking.active);
32 } 33 }
33 34
34 extern void user_enter(void); 35 extern void user_enter(void);
35 extern void user_exit(void); 36 extern void user_exit(void);
37
38 static inline void exception_enter(struct pt_regs *regs)
39 {
40 user_exit();
41 }
42
43 static inline void exception_exit(struct pt_regs *regs)
44 {
45 if (user_mode(regs))
46 user_enter();
47 }
48
36 extern void context_tracking_task_switch(struct task_struct *prev, 49 extern void context_tracking_task_switch(struct task_struct *prev,
37 struct task_struct *next); 50 struct task_struct *next);
38 #else 51 #else
39 static inline bool context_tracking_in_user(void) { return false; } 52 static inline bool context_tracking_in_user(void) { return false; }
40 static inline void user_enter(void) { } 53 static inline void user_enter(void) { }
41 static inline void user_exit(void) { } 54 static inline void user_exit(void) { }
55 static inline void exception_enter(struct pt_regs *regs) { }
56 static inline void exception_exit(struct pt_regs *regs) { }
42 static inline void context_tracking_task_switch(struct task_struct *prev, 57 static inline void context_tracking_task_switch(struct task_struct *prev,
43 struct task_struct *next) { } 58 struct task_struct *next) { }
44 #endif /* !CONFIG_CONTEXT_TRACKING */ 59 #endif /* !CONFIG_CONTEXT_TRACKING */
45 60
46 #endif 61 #endif
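
To make the contract of the two new helpers concrete, here is a small standalone model of the state transitions they drive. It is only an illustration of the header above: user_enter()/user_exit() are reduced to stubs that flip a toy copy of the per-CPU state (the real ones are declared extern above and defined elsewhere in the kernel), and struct pt_regs/user_mode() are mocked.

    #include <stdio.h>
    #include <stdbool.h>

    enum ct_state { IN_KERNEL = 0, IN_USER };

    static enum ct_state state = IN_KERNEL;	/* toy stand-in for the per-CPU context_tracking.state */
    static bool active = true;			/* toy stand-in for context_tracking.active */

    struct pt_regs { bool user; };		/* mock: only records whether the exception came from user mode */
    static bool user_mode(struct pt_regs *regs) { return regs->user; }

    static void user_enter(void) { if (active) state = IN_USER; }
    static void user_exit(void)  { if (active) state = IN_KERNEL; }

    static void exception_enter(struct pt_regs *regs)
    {
    	(void)regs;			/* regs is unused on entry, exactly as in the header above */
    	user_exit();
    }

    static void exception_exit(struct pt_regs *regs)
    {
    	if (user_mode(regs))		/* resume user tracking only when returning to user context */
    		user_enter();
    }

    int main(void)
    {
    	struct pt_regs from_user = { .user = true };
    	struct pt_regs from_kernel = { .user = false };

    	/* Exception raised while running in user space. */
    	state = IN_USER;
    	exception_enter(&from_user);
    	printf("handling user-mode fault: %s\n", state == IN_KERNEL ? "IN_KERNEL" : "IN_USER");
    	exception_exit(&from_user);
    	printf("after exit:               %s\n", state == IN_USER ? "IN_USER" : "IN_KERNEL");

    	/* Exception raised while already in the kernel: state is left alone on exit. */
    	state = IN_KERNEL;
    	exception_enter(&from_kernel);
    	exception_exit(&from_kernel);
    	printf("after kernel-mode fault:  %s\n", state == IN_USER ? "IN_USER" : "IN_KERNEL");
    	return 0;
    }

Note the asymmetry visible in the header and mirrored here: entry always calls user_exit(), while exit consults user_mode(regs), so an exception taken in kernel mode leaves the tracking state in IN_KERNEL on the way out.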