Commit df79efde82952edc653fa6eb1338a82b87aa0585

Authored by Ravikiran G Thirumalai
Committed by Linus Torvalds
1 parent 05b3cbd8bb

[PATCH] x86_64: Node local pda take 2 -- cpu_pda preparation

Helper patch that converts cpu_pda users to the accessor macros (cpu_pda(i))
instead of indexing the cpu_pda[] array directly.

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 9 changed files with 21 additions and 20 deletions
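
For reference, a self-contained sketch of the access pattern this preparation step establishes. The cpu_pda() definition below is illustrative only; the real accessor lives in the x86_64 pda.h header, which is outside this excerpt. Once callers go through the macro, the backing storage (a static array today, node-local allocations in the follow-up patch) can change without another tree-wide sweep.

/* Illustrative stand-in -- not the kernel's pda.h. */
#include <stdio.h>

#define NR_CPUS 4

struct x8664_pda {
	unsigned int __nmi_count;
	unsigned int apic_timer_irqs;
};

/* Storage renamed to _cpu_pda so any remaining cpu_pda[i] use fails to build. */
static struct x8664_pda _cpu_pda[NR_CPUS];

/* Accessor macro: always yields a pointer to CPU i's PDA. */
#define cpu_pda(i) (&_cpu_pda[i])

int main(void)
{
	int j;

	cpu_pda(1)->__nmi_count = 42;		/* new style: pointer access */

	for (j = 0; j < NR_CPUS; j++)		/* old style was cpu_pda[j].__nmi_count */
		printf("CPU%d NMI: %u\n", j, cpu_pda(j)->__nmi_count);
	return 0;
}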

arch/x86_64/kernel/irq.c
1 /* 1 /*
2 * linux/arch/x86_64/kernel/irq.c 2 * linux/arch/x86_64/kernel/irq.c
3 * 3 *
4 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar 4 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
5 * 5 *
6 * This file contains the lowest level x86_64-specific interrupt 6 * This file contains the lowest level x86_64-specific interrupt
7 * entry and irq statistics code. All the remaining irq logic is 7 * entry and irq statistics code. All the remaining irq logic is
8 * done by the generic kernel/irq/ code and in the 8 * done by the generic kernel/irq/ code and in the
9 * x86_64-specific irq controller code. (e.g. i8259.c and 9 * x86_64-specific irq controller code. (e.g. i8259.c and
10 * io_apic.c.) 10 * io_apic.c.)
11 */ 11 */
12 12
13 #include <linux/kernel_stat.h> 13 #include <linux/kernel_stat.h>
14 #include <linux/interrupt.h> 14 #include <linux/interrupt.h>
15 #include <linux/seq_file.h> 15 #include <linux/seq_file.h>
16 #include <linux/module.h> 16 #include <linux/module.h>
17 #include <linux/delay.h> 17 #include <linux/delay.h>
18 #include <asm/uaccess.h> 18 #include <asm/uaccess.h>
19 #include <asm/io_apic.h> 19 #include <asm/io_apic.h>
20 #include <asm/idle.h> 20 #include <asm/idle.h>
21 21
22 atomic_t irq_err_count; 22 atomic_t irq_err_count;
23 #ifdef CONFIG_X86_IO_APIC 23 #ifdef CONFIG_X86_IO_APIC
24 #ifdef APIC_MISMATCH_DEBUG 24 #ifdef APIC_MISMATCH_DEBUG
25 atomic_t irq_mis_count; 25 atomic_t irq_mis_count;
26 #endif 26 #endif
27 #endif 27 #endif
28 28
29 /* 29 /*
30 * Generic, controller-independent functions: 30 * Generic, controller-independent functions:
31 */ 31 */
32 32
33 int show_interrupts(struct seq_file *p, void *v) 33 int show_interrupts(struct seq_file *p, void *v)
34 { 34 {
35 int i = *(loff_t *) v, j; 35 int i = *(loff_t *) v, j;
36 struct irqaction * action; 36 struct irqaction * action;
37 unsigned long flags; 37 unsigned long flags;
38 38
39 if (i == 0) { 39 if (i == 0) {
40 seq_printf(p, " "); 40 seq_printf(p, " ");
41 for (j=0; j<NR_CPUS; j++) 41 for (j=0; j<NR_CPUS; j++)
42 if (cpu_online(j)) 42 if (cpu_online(j))
43 seq_printf(p, "CPU%d ",j); 43 seq_printf(p, "CPU%d ",j);
44 seq_putc(p, '\n'); 44 seq_putc(p, '\n');
45 } 45 }
46 46
47 if (i < NR_IRQS) { 47 if (i < NR_IRQS) {
48 spin_lock_irqsave(&irq_desc[i].lock, flags); 48 spin_lock_irqsave(&irq_desc[i].lock, flags);
49 action = irq_desc[i].action; 49 action = irq_desc[i].action;
50 if (!action) 50 if (!action)
51 goto skip; 51 goto skip;
52 seq_printf(p, "%3d: ",i); 52 seq_printf(p, "%3d: ",i);
53 #ifndef CONFIG_SMP 53 #ifndef CONFIG_SMP
54 seq_printf(p, "%10u ", kstat_irqs(i)); 54 seq_printf(p, "%10u ", kstat_irqs(i));
55 #else 55 #else
56 for (j=0; j<NR_CPUS; j++) 56 for (j=0; j<NR_CPUS; j++)
57 if (cpu_online(j)) 57 if (cpu_online(j))
58 seq_printf(p, "%10u ", 58 seq_printf(p, "%10u ",
59 kstat_cpu(j).irqs[i]); 59 kstat_cpu(j).irqs[i]);
60 #endif 60 #endif
61 seq_printf(p, " %14s", irq_desc[i].handler->typename); 61 seq_printf(p, " %14s", irq_desc[i].handler->typename);
62 62
63 seq_printf(p, " %s", action->name); 63 seq_printf(p, " %s", action->name);
64 for (action=action->next; action; action = action->next) 64 for (action=action->next; action; action = action->next)
65 seq_printf(p, ", %s", action->name); 65 seq_printf(p, ", %s", action->name);
66 seq_putc(p, '\n'); 66 seq_putc(p, '\n');
67 skip: 67 skip:
68 spin_unlock_irqrestore(&irq_desc[i].lock, flags); 68 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
69 } else if (i == NR_IRQS) { 69 } else if (i == NR_IRQS) {
70 seq_printf(p, "NMI: "); 70 seq_printf(p, "NMI: ");
71 for (j = 0; j < NR_CPUS; j++) 71 for (j = 0; j < NR_CPUS; j++)
72 if (cpu_online(j)) 72 if (cpu_online(j))
73 seq_printf(p, "%10u ", cpu_pda[j].__nmi_count); 73 seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
74 seq_putc(p, '\n'); 74 seq_putc(p, '\n');
75 #ifdef CONFIG_X86_LOCAL_APIC 75 #ifdef CONFIG_X86_LOCAL_APIC
76 seq_printf(p, "LOC: "); 76 seq_printf(p, "LOC: ");
77 for (j = 0; j < NR_CPUS; j++) 77 for (j = 0; j < NR_CPUS; j++)
78 if (cpu_online(j)) 78 if (cpu_online(j))
79 seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs); 79 seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
80 seq_putc(p, '\n'); 80 seq_putc(p, '\n');
81 #endif 81 #endif
82 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); 82 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
83 #ifdef CONFIG_X86_IO_APIC 83 #ifdef CONFIG_X86_IO_APIC
84 #ifdef APIC_MISMATCH_DEBUG 84 #ifdef APIC_MISMATCH_DEBUG
85 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); 85 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
86 #endif 86 #endif
87 #endif 87 #endif
88 } 88 }
89 return 0; 89 return 0;
90 } 90 }
91 91
92 /* 92 /*
93 * do_IRQ handles all normal device IRQ's (the special 93 * do_IRQ handles all normal device IRQ's (the special
94 * SMP cross-CPU interrupts have their own specific 94 * SMP cross-CPU interrupts have their own specific
95 * handlers). 95 * handlers).
96 */ 96 */
97 asmlinkage unsigned int do_IRQ(struct pt_regs *regs) 97 asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
98 { 98 {
99 /* high bits used in ret_from_ code */ 99 /* high bits used in ret_from_ code */
100 unsigned irq = regs->orig_rax & 0xff; 100 unsigned irq = regs->orig_rax & 0xff;
101 101
102 exit_idle(); 102 exit_idle();
103 irq_enter(); 103 irq_enter();
104 104
105 __do_IRQ(irq, regs); 105 __do_IRQ(irq, regs);
106 irq_exit(); 106 irq_exit();
107 107
108 return 1; 108 return 1;
109 } 109 }
110 110
111 #ifdef CONFIG_HOTPLUG_CPU 111 #ifdef CONFIG_HOTPLUG_CPU
112 void fixup_irqs(cpumask_t map) 112 void fixup_irqs(cpumask_t map)
113 { 113 {
114 unsigned int irq; 114 unsigned int irq;
115 static int warned; 115 static int warned;
116 116
117 for (irq = 0; irq < NR_IRQS; irq++) { 117 for (irq = 0; irq < NR_IRQS; irq++) {
118 cpumask_t mask; 118 cpumask_t mask;
119 if (irq == 2) 119 if (irq == 2)
120 continue; 120 continue;
121 121
122 cpus_and(mask, irq_affinity[irq], map); 122 cpus_and(mask, irq_affinity[irq], map);
123 if (any_online_cpu(mask) == NR_CPUS) { 123 if (any_online_cpu(mask) == NR_CPUS) {
124 printk("Breaking affinity for irq %i\n", irq); 124 printk("Breaking affinity for irq %i\n", irq);
125 mask = map; 125 mask = map;
126 } 126 }
127 if (irq_desc[irq].handler->set_affinity) 127 if (irq_desc[irq].handler->set_affinity)
128 irq_desc[irq].handler->set_affinity(irq, mask); 128 irq_desc[irq].handler->set_affinity(irq, mask);
129 else if (irq_desc[irq].action && !(warned++)) 129 else if (irq_desc[irq].action && !(warned++))
130 printk("Cannot set affinity for irq %i\n", irq); 130 printk("Cannot set affinity for irq %i\n", irq);
131 } 131 }
132 132
133 /* That doesn't seem sufficient. Give it 1ms. */ 133 /* That doesn't seem sufficient. Give it 1ms. */
134 local_irq_enable(); 134 local_irq_enable();
135 mdelay(1); 135 mdelay(1);
136 local_irq_disable(); 136 local_irq_disable();
137 } 137 }
138 #endif 138 #endif
139 139
140 extern void call_softirq(void); 140 extern void call_softirq(void);
141 141
142 asmlinkage void do_softirq(void) 142 asmlinkage void do_softirq(void)
143 { 143 {
144 __u32 pending; 144 __u32 pending;
145 unsigned long flags; 145 unsigned long flags;
146 146
147 if (in_interrupt()) 147 if (in_interrupt())
148 return; 148 return;
149 149
150 local_irq_save(flags); 150 local_irq_save(flags);
151 pending = local_softirq_pending(); 151 pending = local_softirq_pending();
152 /* Switch to interrupt stack */ 152 /* Switch to interrupt stack */
153 if (pending) 153 if (pending)
154 call_softirq(); 154 call_softirq();
155 local_irq_restore(flags); 155 local_irq_restore(flags);
156 } 156 }
157 EXPORT_SYMBOL(do_softirq); 157 EXPORT_SYMBOL(do_softirq);
158 158
arch/x86_64/kernel/nmi.c
1 /* 1 /*
2 * linux/arch/x86_64/nmi.c 2 * linux/arch/x86_64/nmi.c
3 * 3 *
4 * NMI watchdog support on APIC systems 4 * NMI watchdog support on APIC systems
5 * 5 *
6 * Started by Ingo Molnar <mingo@redhat.com> 6 * Started by Ingo Molnar <mingo@redhat.com>
7 * 7 *
8 * Fixes: 8 * Fixes:
9 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. 9 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
10 * Mikael Pettersson : Power Management for local APIC NMI watchdog. 10 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
11 * Pavel Machek and 11 * Pavel Machek and
12 * Mikael Pettersson : PM converted to driver model. Disable/enable API. 12 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
13 */ 13 */
14 14
15 #include <linux/config.h> 15 #include <linux/config.h>
16 #include <linux/mm.h> 16 #include <linux/mm.h>
17 #include <linux/delay.h> 17 #include <linux/delay.h>
18 #include <linux/bootmem.h> 18 #include <linux/bootmem.h>
19 #include <linux/smp_lock.h> 19 #include <linux/smp_lock.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/mc146818rtc.h> 21 #include <linux/mc146818rtc.h>
22 #include <linux/kernel_stat.h> 22 #include <linux/kernel_stat.h>
23 #include <linux/module.h> 23 #include <linux/module.h>
24 #include <linux/sysdev.h> 24 #include <linux/sysdev.h>
25 #include <linux/nmi.h> 25 #include <linux/nmi.h>
26 #include <linux/sysctl.h> 26 #include <linux/sysctl.h>
27 27
28 #include <asm/smp.h> 28 #include <asm/smp.h>
29 #include <asm/mtrr.h> 29 #include <asm/mtrr.h>
30 #include <asm/mpspec.h> 30 #include <asm/mpspec.h>
31 #include <asm/nmi.h> 31 #include <asm/nmi.h>
32 #include <asm/msr.h> 32 #include <asm/msr.h>
33 #include <asm/proto.h> 33 #include <asm/proto.h>
34 #include <asm/kdebug.h> 34 #include <asm/kdebug.h>
35 #include <asm/local.h> 35 #include <asm/local.h>
36 36
37 /* 37 /*
38 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: 38 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
39 * - it may be reserved by some other driver, or not 39 * - it may be reserved by some other driver, or not
40 * - when not reserved by some other driver, it may be used for 40 * - when not reserved by some other driver, it may be used for
41 * the NMI watchdog, or not 41 * the NMI watchdog, or not
42 * 42 *
43 * This is maintained separately from nmi_active because the NMI 43 * This is maintained separately from nmi_active because the NMI
44 * watchdog may also be driven from the I/O APIC timer. 44 * watchdog may also be driven from the I/O APIC timer.
45 */ 45 */
46 static DEFINE_SPINLOCK(lapic_nmi_owner_lock); 46 static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
47 static unsigned int lapic_nmi_owner; 47 static unsigned int lapic_nmi_owner;
48 #define LAPIC_NMI_WATCHDOG (1<<0) 48 #define LAPIC_NMI_WATCHDOG (1<<0)
49 #define LAPIC_NMI_RESERVED (1<<1) 49 #define LAPIC_NMI_RESERVED (1<<1)
50 50
51 /* nmi_active: 51 /* nmi_active:
52 * +1: the lapic NMI watchdog is active, but can be disabled 52 * +1: the lapic NMI watchdog is active, but can be disabled
53 * 0: the lapic NMI watchdog has not been set up, and cannot 53 * 0: the lapic NMI watchdog has not been set up, and cannot
54 * be enabled 54 * be enabled
55 * -1: the lapic NMI watchdog is disabled, but can be enabled 55 * -1: the lapic NMI watchdog is disabled, but can be enabled
56 */ 56 */
57 int nmi_active; /* oprofile uses this */ 57 int nmi_active; /* oprofile uses this */
58 int panic_on_timeout; 58 int panic_on_timeout;
59 59
60 unsigned int nmi_watchdog = NMI_DEFAULT; 60 unsigned int nmi_watchdog = NMI_DEFAULT;
61 static unsigned int nmi_hz = HZ; 61 static unsigned int nmi_hz = HZ;
62 static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ 62 static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
63 static unsigned int nmi_p4_cccr_val; 63 static unsigned int nmi_p4_cccr_val;
64 64
65 /* Note that these events don't tick when the CPU idles. This means 65 /* Note that these events don't tick when the CPU idles. This means
66 the frequency varies with CPU load. */ 66 the frequency varies with CPU load. */
67 67
68 #define K7_EVNTSEL_ENABLE (1 << 22) 68 #define K7_EVNTSEL_ENABLE (1 << 22)
69 #define K7_EVNTSEL_INT (1 << 20) 69 #define K7_EVNTSEL_INT (1 << 20)
70 #define K7_EVNTSEL_OS (1 << 17) 70 #define K7_EVNTSEL_OS (1 << 17)
71 #define K7_EVNTSEL_USR (1 << 16) 71 #define K7_EVNTSEL_USR (1 << 16)
72 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 72 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
73 #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 73 #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
74 74
75 #define MSR_P4_MISC_ENABLE 0x1A0 75 #define MSR_P4_MISC_ENABLE 0x1A0
76 #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) 76 #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
77 #define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) 77 #define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
78 #define MSR_P4_PERFCTR0 0x300 78 #define MSR_P4_PERFCTR0 0x300
79 #define MSR_P4_CCCR0 0x360 79 #define MSR_P4_CCCR0 0x360
80 #define P4_ESCR_EVENT_SELECT(N) ((N)<<25) 80 #define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
81 #define P4_ESCR_OS (1<<3) 81 #define P4_ESCR_OS (1<<3)
82 #define P4_ESCR_USR (1<<2) 82 #define P4_ESCR_USR (1<<2)
83 #define P4_CCCR_OVF_PMI0 (1<<26) 83 #define P4_CCCR_OVF_PMI0 (1<<26)
84 #define P4_CCCR_OVF_PMI1 (1<<27) 84 #define P4_CCCR_OVF_PMI1 (1<<27)
85 #define P4_CCCR_THRESHOLD(N) ((N)<<20) 85 #define P4_CCCR_THRESHOLD(N) ((N)<<20)
86 #define P4_CCCR_COMPLEMENT (1<<19) 86 #define P4_CCCR_COMPLEMENT (1<<19)
87 #define P4_CCCR_COMPARE (1<<18) 87 #define P4_CCCR_COMPARE (1<<18)
88 #define P4_CCCR_REQUIRED (3<<16) 88 #define P4_CCCR_REQUIRED (3<<16)
89 #define P4_CCCR_ESCR_SELECT(N) ((N)<<13) 89 #define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
90 #define P4_CCCR_ENABLE (1<<12) 90 #define P4_CCCR_ENABLE (1<<12)
91 /* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 91 /* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
92 CRU_ESCR0 (with any non-null event selector) through a complemented 92 CRU_ESCR0 (with any non-null event selector) through a complemented
93 max threshold. [IA32-Vol3, Section 14.9.9] */ 93 max threshold. [IA32-Vol3, Section 14.9.9] */
94 #define MSR_P4_IQ_COUNTER0 0x30C 94 #define MSR_P4_IQ_COUNTER0 0x30C
95 #define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) 95 #define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
96 #define P4_NMI_IQ_CCCR0 \ 96 #define P4_NMI_IQ_CCCR0 \
97 (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ 97 (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
98 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) 98 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
99 99
100 static __cpuinit inline int nmi_known_cpu(void) 100 static __cpuinit inline int nmi_known_cpu(void)
101 { 101 {
102 switch (boot_cpu_data.x86_vendor) { 102 switch (boot_cpu_data.x86_vendor) {
103 case X86_VENDOR_AMD: 103 case X86_VENDOR_AMD:
104 return boot_cpu_data.x86 == 15; 104 return boot_cpu_data.x86 == 15;
105 case X86_VENDOR_INTEL: 105 case X86_VENDOR_INTEL:
106 return boot_cpu_data.x86 == 15; 106 return boot_cpu_data.x86 == 15;
107 } 107 }
108 return 0; 108 return 0;
109 } 109 }
110 110
111 /* Run after command line and cpu_init init, but before all other checks */ 111 /* Run after command line and cpu_init init, but before all other checks */
112 void __cpuinit nmi_watchdog_default(void) 112 void __cpuinit nmi_watchdog_default(void)
113 { 113 {
114 if (nmi_watchdog != NMI_DEFAULT) 114 if (nmi_watchdog != NMI_DEFAULT)
115 return; 115 return;
116 if (nmi_known_cpu()) 116 if (nmi_known_cpu())
117 nmi_watchdog = NMI_LOCAL_APIC; 117 nmi_watchdog = NMI_LOCAL_APIC;
118 else 118 else
119 nmi_watchdog = NMI_IO_APIC; 119 nmi_watchdog = NMI_IO_APIC;
120 } 120 }
121 121
122 #ifdef CONFIG_SMP 122 #ifdef CONFIG_SMP
123 /* The performance counters used by NMI_LOCAL_APIC don't trigger when 123 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
124 * the CPU is idle. To make sure the NMI watchdog really ticks on all 124 * the CPU is idle. To make sure the NMI watchdog really ticks on all
125 * CPUs during the test make them busy. 125 * CPUs during the test make them busy.
126 */ 126 */
127 static __init void nmi_cpu_busy(void *data) 127 static __init void nmi_cpu_busy(void *data)
128 { 128 {
129 volatile int *endflag = data; 129 volatile int *endflag = data;
130 local_irq_enable(); 130 local_irq_enable();
131 /* Intentionally don't use cpu_relax here. This is 131 /* Intentionally don't use cpu_relax here. This is
132 to make sure that the performance counter really ticks, 132 to make sure that the performance counter really ticks,
133 even if there is a simulator or similar that catches the 133 even if there is a simulator or similar that catches the
134 pause instruction. On a real HT machine this is fine because 134 pause instruction. On a real HT machine this is fine because
135 all other CPUs are busy with "useless" delay loops and don't 135 all other CPUs are busy with "useless" delay loops and don't
136 care if they get somewhat less cycles. */ 136 care if they get somewhat less cycles. */
137 while (*endflag == 0) 137 while (*endflag == 0)
138 barrier(); 138 barrier();
139 } 139 }
140 #endif 140 #endif
141 141
142 int __init check_nmi_watchdog (void) 142 int __init check_nmi_watchdog (void)
143 { 143 {
144 volatile int endflag = 0; 144 volatile int endflag = 0;
145 int *counts; 145 int *counts;
146 int cpu; 146 int cpu;
147 147
148 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); 148 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
149 if (!counts) 149 if (!counts)
150 return -1; 150 return -1;
151 151
152 printk(KERN_INFO "testing NMI watchdog ... "); 152 printk(KERN_INFO "testing NMI watchdog ... ");
153 153
154 if (nmi_watchdog == NMI_LOCAL_APIC) 154 if (nmi_watchdog == NMI_LOCAL_APIC)
155 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); 155 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
156 156
157 for (cpu = 0; cpu < NR_CPUS; cpu++) 157 for (cpu = 0; cpu < NR_CPUS; cpu++)
158 counts[cpu] = cpu_pda[cpu].__nmi_count; 158 counts[cpu] = cpu_pda(cpu)->__nmi_count;
159 local_irq_enable(); 159 local_irq_enable();
160 mdelay((10*1000)/nmi_hz); // wait 10 ticks 160 mdelay((10*1000)/nmi_hz); // wait 10 ticks
161 161
162 for (cpu = 0; cpu < NR_CPUS; cpu++) { 162 for (cpu = 0; cpu < NR_CPUS; cpu++) {
163 if (!cpu_online(cpu)) 163 if (!cpu_online(cpu))
164 continue; 164 continue;
165 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { 165 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
166 endflag = 1; 166 endflag = 1;
167 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", 167 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
168 cpu, 168 cpu,
169 counts[cpu], 169 counts[cpu],
170 cpu_pda[cpu].__nmi_count); 170 cpu_pda(cpu)->__nmi_count);
171 nmi_active = 0; 171 nmi_active = 0;
172 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; 172 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
173 nmi_perfctr_msr = 0; 173 nmi_perfctr_msr = 0;
174 kfree(counts); 174 kfree(counts);
175 return -1; 175 return -1;
176 } 176 }
177 } 177 }
178 endflag = 1; 178 endflag = 1;
179 printk("OK.\n"); 179 printk("OK.\n");
180 180
181 /* now that we know it works we can reduce NMI frequency to 181 /* now that we know it works we can reduce NMI frequency to
182 something more reasonable; makes a difference in some configs */ 182 something more reasonable; makes a difference in some configs */
183 if (nmi_watchdog == NMI_LOCAL_APIC) 183 if (nmi_watchdog == NMI_LOCAL_APIC)
184 nmi_hz = 1; 184 nmi_hz = 1;
185 185
186 kfree(counts); 186 kfree(counts);
187 return 0; 187 return 0;
188 } 188 }
189 189
190 int __init setup_nmi_watchdog(char *str) 190 int __init setup_nmi_watchdog(char *str)
191 { 191 {
192 int nmi; 192 int nmi;
193 193
194 if (!strncmp(str,"panic",5)) { 194 if (!strncmp(str,"panic",5)) {
195 panic_on_timeout = 1; 195 panic_on_timeout = 1;
196 str = strchr(str, ','); 196 str = strchr(str, ',');
197 if (!str) 197 if (!str)
198 return 1; 198 return 1;
199 ++str; 199 ++str;
200 } 200 }
201 201
202 get_option(&str, &nmi); 202 get_option(&str, &nmi);
203 203
204 if (nmi >= NMI_INVALID) 204 if (nmi >= NMI_INVALID)
205 return 0; 205 return 0;
206 nmi_watchdog = nmi; 206 nmi_watchdog = nmi;
207 return 1; 207 return 1;
208 } 208 }
209 209
210 __setup("nmi_watchdog=", setup_nmi_watchdog); 210 __setup("nmi_watchdog=", setup_nmi_watchdog);
211 211
212 static void disable_lapic_nmi_watchdog(void) 212 static void disable_lapic_nmi_watchdog(void)
213 { 213 {
214 if (nmi_active <= 0) 214 if (nmi_active <= 0)
215 return; 215 return;
216 switch (boot_cpu_data.x86_vendor) { 216 switch (boot_cpu_data.x86_vendor) {
217 case X86_VENDOR_AMD: 217 case X86_VENDOR_AMD:
218 wrmsr(MSR_K7_EVNTSEL0, 0, 0); 218 wrmsr(MSR_K7_EVNTSEL0, 0, 0);
219 break; 219 break;
220 case X86_VENDOR_INTEL: 220 case X86_VENDOR_INTEL:
221 if (boot_cpu_data.x86 == 15) { 221 if (boot_cpu_data.x86 == 15) {
222 wrmsr(MSR_P4_IQ_CCCR0, 0, 0); 222 wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
223 wrmsr(MSR_P4_CRU_ESCR0, 0, 0); 223 wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
224 } 224 }
225 break; 225 break;
226 } 226 }
227 nmi_active = -1; 227 nmi_active = -1;
228 /* tell do_nmi() and others that we're not active any more */ 228 /* tell do_nmi() and others that we're not active any more */
229 nmi_watchdog = 0; 229 nmi_watchdog = 0;
230 } 230 }
231 231
232 static void enable_lapic_nmi_watchdog(void) 232 static void enable_lapic_nmi_watchdog(void)
233 { 233 {
234 if (nmi_active < 0) { 234 if (nmi_active < 0) {
235 nmi_watchdog = NMI_LOCAL_APIC; 235 nmi_watchdog = NMI_LOCAL_APIC;
236 setup_apic_nmi_watchdog(); 236 setup_apic_nmi_watchdog();
237 } 237 }
238 } 238 }
239 239
240 int reserve_lapic_nmi(void) 240 int reserve_lapic_nmi(void)
241 { 241 {
242 unsigned int old_owner; 242 unsigned int old_owner;
243 243
244 spin_lock(&lapic_nmi_owner_lock); 244 spin_lock(&lapic_nmi_owner_lock);
245 old_owner = lapic_nmi_owner; 245 old_owner = lapic_nmi_owner;
246 lapic_nmi_owner |= LAPIC_NMI_RESERVED; 246 lapic_nmi_owner |= LAPIC_NMI_RESERVED;
247 spin_unlock(&lapic_nmi_owner_lock); 247 spin_unlock(&lapic_nmi_owner_lock);
248 if (old_owner & LAPIC_NMI_RESERVED) 248 if (old_owner & LAPIC_NMI_RESERVED)
249 return -EBUSY; 249 return -EBUSY;
250 if (old_owner & LAPIC_NMI_WATCHDOG) 250 if (old_owner & LAPIC_NMI_WATCHDOG)
251 disable_lapic_nmi_watchdog(); 251 disable_lapic_nmi_watchdog();
252 return 0; 252 return 0;
253 } 253 }
254 254
255 void release_lapic_nmi(void) 255 void release_lapic_nmi(void)
256 { 256 {
257 unsigned int new_owner; 257 unsigned int new_owner;
258 258
259 spin_lock(&lapic_nmi_owner_lock); 259 spin_lock(&lapic_nmi_owner_lock);
260 new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; 260 new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
261 lapic_nmi_owner = new_owner; 261 lapic_nmi_owner = new_owner;
262 spin_unlock(&lapic_nmi_owner_lock); 262 spin_unlock(&lapic_nmi_owner_lock);
263 if (new_owner & LAPIC_NMI_WATCHDOG) 263 if (new_owner & LAPIC_NMI_WATCHDOG)
264 enable_lapic_nmi_watchdog(); 264 enable_lapic_nmi_watchdog();
265 } 265 }
266 266
267 void disable_timer_nmi_watchdog(void) 267 void disable_timer_nmi_watchdog(void)
268 { 268 {
269 if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) 269 if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
270 return; 270 return;
271 271
272 disable_irq(0); 272 disable_irq(0);
273 unset_nmi_callback(); 273 unset_nmi_callback();
274 nmi_active = -1; 274 nmi_active = -1;
275 nmi_watchdog = NMI_NONE; 275 nmi_watchdog = NMI_NONE;
276 } 276 }
277 277
278 void enable_timer_nmi_watchdog(void) 278 void enable_timer_nmi_watchdog(void)
279 { 279 {
280 if (nmi_active < 0) { 280 if (nmi_active < 0) {
281 nmi_watchdog = NMI_IO_APIC; 281 nmi_watchdog = NMI_IO_APIC;
282 touch_nmi_watchdog(); 282 touch_nmi_watchdog();
283 nmi_active = 1; 283 nmi_active = 1;
284 enable_irq(0); 284 enable_irq(0);
285 } 285 }
286 } 286 }
287 287
288 #ifdef CONFIG_PM 288 #ifdef CONFIG_PM
289 289
290 static int nmi_pm_active; /* nmi_active before suspend */ 290 static int nmi_pm_active; /* nmi_active before suspend */
291 291
292 static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) 292 static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
293 { 293 {
294 nmi_pm_active = nmi_active; 294 nmi_pm_active = nmi_active;
295 disable_lapic_nmi_watchdog(); 295 disable_lapic_nmi_watchdog();
296 return 0; 296 return 0;
297 } 297 }
298 298
299 static int lapic_nmi_resume(struct sys_device *dev) 299 static int lapic_nmi_resume(struct sys_device *dev)
300 { 300 {
301 if (nmi_pm_active > 0) 301 if (nmi_pm_active > 0)
302 enable_lapic_nmi_watchdog(); 302 enable_lapic_nmi_watchdog();
303 return 0; 303 return 0;
304 } 304 }
305 305
306 static struct sysdev_class nmi_sysclass = { 306 static struct sysdev_class nmi_sysclass = {
307 set_kset_name("lapic_nmi"), 307 set_kset_name("lapic_nmi"),
308 .resume = lapic_nmi_resume, 308 .resume = lapic_nmi_resume,
309 .suspend = lapic_nmi_suspend, 309 .suspend = lapic_nmi_suspend,
310 }; 310 };
311 311
312 static struct sys_device device_lapic_nmi = { 312 static struct sys_device device_lapic_nmi = {
313 .id = 0, 313 .id = 0,
314 .cls = &nmi_sysclass, 314 .cls = &nmi_sysclass,
315 }; 315 };
316 316
317 static int __init init_lapic_nmi_sysfs(void) 317 static int __init init_lapic_nmi_sysfs(void)
318 { 318 {
319 int error; 319 int error;
320 320
321 if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) 321 if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC)
322 return 0; 322 return 0;
323 323
324 error = sysdev_class_register(&nmi_sysclass); 324 error = sysdev_class_register(&nmi_sysclass);
325 if (!error) 325 if (!error)
326 error = sysdev_register(&device_lapic_nmi); 326 error = sysdev_register(&device_lapic_nmi);
327 return error; 327 return error;
328 } 328 }
329 /* must come after the local APIC's device_initcall() */ 329 /* must come after the local APIC's device_initcall() */
330 late_initcall(init_lapic_nmi_sysfs); 330 late_initcall(init_lapic_nmi_sysfs);
331 331
332 #endif /* CONFIG_PM */ 332 #endif /* CONFIG_PM */
333 333
334 /* 334 /*
335 * Activate the NMI watchdog via the local APIC. 335 * Activate the NMI watchdog via the local APIC.
336 * Original code written by Keith Owens. 336 * Original code written by Keith Owens.
337 */ 337 */
338 338
339 static void clear_msr_range(unsigned int base, unsigned int n) 339 static void clear_msr_range(unsigned int base, unsigned int n)
340 { 340 {
341 unsigned int i; 341 unsigned int i;
342 342
343 for(i = 0; i < n; ++i) 343 for(i = 0; i < n; ++i)
344 wrmsr(base+i, 0, 0); 344 wrmsr(base+i, 0, 0);
345 } 345 }
346 346
347 static void setup_k7_watchdog(void) 347 static void setup_k7_watchdog(void)
348 { 348 {
349 int i; 349 int i;
350 unsigned int evntsel; 350 unsigned int evntsel;
351 351
352 nmi_perfctr_msr = MSR_K7_PERFCTR0; 352 nmi_perfctr_msr = MSR_K7_PERFCTR0;
353 353
354 for(i = 0; i < 4; ++i) { 354 for(i = 0; i < 4; ++i) {
355 /* Simulator may not support it */ 355 /* Simulator may not support it */
356 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) { 356 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) {
357 nmi_perfctr_msr = 0; 357 nmi_perfctr_msr = 0;
358 return; 358 return;
359 } 359 }
360 wrmsrl(MSR_K7_PERFCTR0+i, 0UL); 360 wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
361 } 361 }
362 362
363 evntsel = K7_EVNTSEL_INT 363 evntsel = K7_EVNTSEL_INT
364 | K7_EVNTSEL_OS 364 | K7_EVNTSEL_OS
365 | K7_EVNTSEL_USR 365 | K7_EVNTSEL_USR
366 | K7_NMI_EVENT; 366 | K7_NMI_EVENT;
367 367
368 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 368 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
369 wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); 369 wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz));
370 apic_write(APIC_LVTPC, APIC_DM_NMI); 370 apic_write(APIC_LVTPC, APIC_DM_NMI);
371 evntsel |= K7_EVNTSEL_ENABLE; 371 evntsel |= K7_EVNTSEL_ENABLE;
372 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 372 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
373 } 373 }
374 374
375 375
376 static int setup_p4_watchdog(void) 376 static int setup_p4_watchdog(void)
377 { 377 {
378 unsigned int misc_enable, dummy; 378 unsigned int misc_enable, dummy;
379 379
380 rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); 380 rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
381 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) 381 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
382 return 0; 382 return 0;
383 383
384 nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; 384 nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
385 nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; 385 nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
386 #ifdef CONFIG_SMP 386 #ifdef CONFIG_SMP
387 if (smp_num_siblings == 2) 387 if (smp_num_siblings == 2)
388 nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; 388 nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
389 #endif 389 #endif
390 390
391 if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) 391 if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
392 clear_msr_range(0x3F1, 2); 392 clear_msr_range(0x3F1, 2);
393 /* MSR 0x3F0 seems to have a default value of 0xFC00, but current 393 /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
394 docs doesn't fully define it, so leave it alone for now. */ 394 docs doesn't fully define it, so leave it alone for now. */
395 if (boot_cpu_data.x86_model >= 0x3) { 395 if (boot_cpu_data.x86_model >= 0x3) {
396 /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ 396 /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
397 clear_msr_range(0x3A0, 26); 397 clear_msr_range(0x3A0, 26);
398 clear_msr_range(0x3BC, 3); 398 clear_msr_range(0x3BC, 3);
399 } else { 399 } else {
400 clear_msr_range(0x3A0, 31); 400 clear_msr_range(0x3A0, 31);
401 } 401 }
402 clear_msr_range(0x3C0, 6); 402 clear_msr_range(0x3C0, 6);
403 clear_msr_range(0x3C8, 6); 403 clear_msr_range(0x3C8, 6);
404 clear_msr_range(0x3E0, 2); 404 clear_msr_range(0x3E0, 2);
405 clear_msr_range(MSR_P4_CCCR0, 18); 405 clear_msr_range(MSR_P4_CCCR0, 18);
406 clear_msr_range(MSR_P4_PERFCTR0, 18); 406 clear_msr_range(MSR_P4_PERFCTR0, 18);
407 407
408 wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); 408 wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
409 wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); 409 wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
410 Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz * 1000UL / nmi_hz)); 410 Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz * 1000UL / nmi_hz));
411 wrmsrl(MSR_P4_IQ_COUNTER0, -((u64)cpu_khz * 1000 / nmi_hz)); 411 wrmsrl(MSR_P4_IQ_COUNTER0, -((u64)cpu_khz * 1000 / nmi_hz));
412 apic_write(APIC_LVTPC, APIC_DM_NMI); 412 apic_write(APIC_LVTPC, APIC_DM_NMI);
413 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); 413 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
414 return 1; 414 return 1;
415 } 415 }
416 416
417 void setup_apic_nmi_watchdog(void) 417 void setup_apic_nmi_watchdog(void)
418 { 418 {
419 switch (boot_cpu_data.x86_vendor) { 419 switch (boot_cpu_data.x86_vendor) {
420 case X86_VENDOR_AMD: 420 case X86_VENDOR_AMD:
421 if (boot_cpu_data.x86 != 15) 421 if (boot_cpu_data.x86 != 15)
422 return; 422 return;
423 if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) 423 if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
424 return; 424 return;
425 setup_k7_watchdog(); 425 setup_k7_watchdog();
426 break; 426 break;
427 case X86_VENDOR_INTEL: 427 case X86_VENDOR_INTEL:
428 if (boot_cpu_data.x86 != 15) 428 if (boot_cpu_data.x86 != 15)
429 return; 429 return;
430 if (!setup_p4_watchdog()) 430 if (!setup_p4_watchdog())
431 return; 431 return;
432 break; 432 break;
433 433
434 default: 434 default:
435 return; 435 return;
436 } 436 }
437 lapic_nmi_owner = LAPIC_NMI_WATCHDOG; 437 lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
438 nmi_active = 1; 438 nmi_active = 1;
439 } 439 }
440 440
441 /* 441 /*
442 * the best way to detect whether a CPU has a 'hard lockup' problem 442 * the best way to detect whether a CPU has a 'hard lockup' problem
443 * is to check it's local APIC timer IRQ counts. If they are not 443 * is to check it's local APIC timer IRQ counts. If they are not
444 * changing then that CPU has some problem. 444 * changing then that CPU has some problem.
445 * 445 *
446 * as these watchdog NMI IRQs are generated on every CPU, we only 446 * as these watchdog NMI IRQs are generated on every CPU, we only
447 * have to check the current processor. 447 * have to check the current processor.
448 */ 448 */
449 449
450 static DEFINE_PER_CPU(unsigned, last_irq_sum); 450 static DEFINE_PER_CPU(unsigned, last_irq_sum);
451 static DEFINE_PER_CPU(local_t, alert_counter); 451 static DEFINE_PER_CPU(local_t, alert_counter);
452 static DEFINE_PER_CPU(int, nmi_touch); 452 static DEFINE_PER_CPU(int, nmi_touch);
453 453
454 void touch_nmi_watchdog (void) 454 void touch_nmi_watchdog (void)
455 { 455 {
456 int i; 456 int i;
457 457
458 /* 458 /*
459 * Tell other CPUs to reset their alert counters. We cannot 459 * Tell other CPUs to reset their alert counters. We cannot
460 * do it ourselves because the alert count increase is not 460 * do it ourselves because the alert count increase is not
461 * atomic. 461 * atomic.
462 */ 462 */
463 for (i = 0; i < NR_CPUS; i++) 463 for (i = 0; i < NR_CPUS; i++)
464 per_cpu(nmi_touch, i) = 1; 464 per_cpu(nmi_touch, i) = 1;
465 465
466 touch_softlockup_watchdog(); 466 touch_softlockup_watchdog();
467 } 467 }
468 468
469 void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) 469 void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
470 { 470 {
471 int sum; 471 int sum;
472 int touched = 0; 472 int touched = 0;
473 473
474 sum = read_pda(apic_timer_irqs); 474 sum = read_pda(apic_timer_irqs);
475 if (__get_cpu_var(nmi_touch)) { 475 if (__get_cpu_var(nmi_touch)) {
476 __get_cpu_var(nmi_touch) = 0; 476 __get_cpu_var(nmi_touch) = 0;
477 touched = 1; 477 touched = 1;
478 } 478 }
479 if (!touched && __get_cpu_var(last_irq_sum) == sum) { 479 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
480 /* 480 /*
481 * Ayiee, looks like this CPU is stuck ... 481 * Ayiee, looks like this CPU is stuck ...
482 * wait a few IRQs (5 seconds) before doing the oops ... 482 * wait a few IRQs (5 seconds) before doing the oops ...
483 */ 483 */
484 local_inc(&__get_cpu_var(alert_counter)); 484 local_inc(&__get_cpu_var(alert_counter));
485 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { 485 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) {
486 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 486 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
487 == NOTIFY_STOP) { 487 == NOTIFY_STOP) {
488 local_set(&__get_cpu_var(alert_counter), 0); 488 local_set(&__get_cpu_var(alert_counter), 0);
489 return; 489 return;
490 } 490 }
491 die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs); 491 die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs);
492 } 492 }
493 } else { 493 } else {
494 __get_cpu_var(last_irq_sum) = sum; 494 __get_cpu_var(last_irq_sum) = sum;
495 local_set(&__get_cpu_var(alert_counter), 0); 495 local_set(&__get_cpu_var(alert_counter), 0);
496 } 496 }
497 if (nmi_perfctr_msr) { 497 if (nmi_perfctr_msr) {
498 if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { 498 if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
499 /* 499 /*
500 * P4 quirks: 500 * P4 quirks:
501 * - An overflown perfctr will assert its interrupt 501 * - An overflown perfctr will assert its interrupt
502 * until the OVF flag in its CCCR is cleared. 502 * until the OVF flag in its CCCR is cleared.
503 * - LVTPC is masked on interrupt and must be 503 * - LVTPC is masked on interrupt and must be
504 * unmasked by the LVTPC handler. 504 * unmasked by the LVTPC handler.
505 */ 505 */
506 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); 506 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
507 apic_write(APIC_LVTPC, APIC_DM_NMI); 507 apic_write(APIC_LVTPC, APIC_DM_NMI);
508 } 508 }
509 wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); 509 wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
510 } 510 }
511 } 511 }
512 512
513 static int dummy_nmi_callback(struct pt_regs * regs, int cpu) 513 static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
514 { 514 {
515 return 0; 515 return 0;
516 } 516 }
517 517
518 static nmi_callback_t nmi_callback = dummy_nmi_callback; 518 static nmi_callback_t nmi_callback = dummy_nmi_callback;
519 519
520 asmlinkage void do_nmi(struct pt_regs * regs, long error_code) 520 asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
521 { 521 {
522 int cpu = safe_smp_processor_id(); 522 int cpu = safe_smp_processor_id();
523 523
524 nmi_enter(); 524 nmi_enter();
525 add_pda(__nmi_count,1); 525 add_pda(__nmi_count,1);
526 if (!rcu_dereference(nmi_callback)(regs, cpu)) 526 if (!rcu_dereference(nmi_callback)(regs, cpu))
527 default_do_nmi(regs); 527 default_do_nmi(regs);
528 nmi_exit(); 528 nmi_exit();
529 } 529 }
530 530
531 void set_nmi_callback(nmi_callback_t callback) 531 void set_nmi_callback(nmi_callback_t callback)
532 { 532 {
533 rcu_assign_pointer(nmi_callback, callback); 533 rcu_assign_pointer(nmi_callback, callback);
534 } 534 }
535 535
536 void unset_nmi_callback(void) 536 void unset_nmi_callback(void)
537 { 537 {
538 nmi_callback = dummy_nmi_callback; 538 nmi_callback = dummy_nmi_callback;
539 } 539 }
540 540
541 #ifdef CONFIG_SYSCTL 541 #ifdef CONFIG_SYSCTL
542 542
543 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) 543 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
544 { 544 {
545 unsigned char reason = get_nmi_reason(); 545 unsigned char reason = get_nmi_reason();
546 char buf[64]; 546 char buf[64];
547 547
548 if (!(reason & 0xc0)) { 548 if (!(reason & 0xc0)) {
549 sprintf(buf, "NMI received for unknown reason %02x\n", reason); 549 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
550 die_nmi(buf,regs); 550 die_nmi(buf,regs);
551 } 551 }
552 return 0; 552 return 0;
553 } 553 }
554 554
555 /* 555 /*
556 * proc handler for /proc/sys/kernel/unknown_nmi_panic 556 * proc handler for /proc/sys/kernel/unknown_nmi_panic
557 */ 557 */
558 int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file, 558 int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file,
559 void __user *buffer, size_t *length, loff_t *ppos) 559 void __user *buffer, size_t *length, loff_t *ppos)
560 { 560 {
561 int old_state; 561 int old_state;
562 562
563 old_state = unknown_nmi_panic; 563 old_state = unknown_nmi_panic;
564 proc_dointvec(table, write, file, buffer, length, ppos); 564 proc_dointvec(table, write, file, buffer, length, ppos);
565 if (!!old_state == !!unknown_nmi_panic) 565 if (!!old_state == !!unknown_nmi_panic)
566 return 0; 566 return 0;
567 567
568 if (unknown_nmi_panic) { 568 if (unknown_nmi_panic) {
569 if (reserve_lapic_nmi() < 0) { 569 if (reserve_lapic_nmi() < 0) {
570 unknown_nmi_panic = 0; 570 unknown_nmi_panic = 0;
571 return -EBUSY; 571 return -EBUSY;
572 } else { 572 } else {
573 set_nmi_callback(unknown_nmi_panic_callback); 573 set_nmi_callback(unknown_nmi_panic_callback);
574 } 574 }
575 } else { 575 } else {
576 release_lapic_nmi(); 576 release_lapic_nmi();
577 unset_nmi_callback(); 577 unset_nmi_callback();
578 } 578 }
579 return 0; 579 return 0;
580 } 580 }
581 581
582 #endif 582 #endif
583 583
584 EXPORT_SYMBOL(nmi_active); 584 EXPORT_SYMBOL(nmi_active);
585 EXPORT_SYMBOL(nmi_watchdog); 585 EXPORT_SYMBOL(nmi_watchdog);
586 EXPORT_SYMBOL(reserve_lapic_nmi); 586 EXPORT_SYMBOL(reserve_lapic_nmi);
587 EXPORT_SYMBOL(release_lapic_nmi); 587 EXPORT_SYMBOL(release_lapic_nmi);
588 EXPORT_SYMBOL(disable_timer_nmi_watchdog); 588 EXPORT_SYMBOL(disable_timer_nmi_watchdog);
589 EXPORT_SYMBOL(enable_timer_nmi_watchdog); 589 EXPORT_SYMBOL(enable_timer_nmi_watchdog);
590 EXPORT_SYMBOL(touch_nmi_watchdog); 590 EXPORT_SYMBOL(touch_nmi_watchdog);
591 591
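
A note on the two access styles visible in nmi.c above: do_nmi() bumps its own counter with add_pda(__nmi_count, 1) and nmi_watchdog_tick() reads its own apic_timer_irqs with read_pda(), while check_nmi_watchdog() and show_interrupts() read other CPUs' counters through cpu_pda(cpu)->field. The model below is a hedged sketch of that split; the kernel's real read_pda()/add_pda() use %gs-relative accesses rather than the simplified macro bodies shown here.

/* Simplified model -- not the kernel's pda.h macros. */
#include <stdio.h>

#define NR_CPUS 2

struct x8664_pda { unsigned int __nmi_count; };

static struct x8664_pda _cpu_pda[NR_CPUS];
#define cpu_pda(i)		(&_cpu_pda[i])

static int current_cpu;			/* stand-in for the per-CPU %gs base */
#define read_pda(field)		(cpu_pda(current_cpu)->field)
#define add_pda(field, val)	(cpu_pda(current_cpu)->field += (val))

int main(void)
{
	int cpu;

	current_cpu = 1;
	add_pda(__nmi_count, 1);	/* local CPU, as in do_nmi() */
	printf("local count now %u\n", read_pda(__nmi_count));

	for (cpu = 0; cpu < NR_CPUS; cpu++)	/* cross-CPU, as in check_nmi_watchdog() */
		printf("CPU%d: %u\n", cpu, cpu_pda(cpu)->__nmi_count);
	return 0;
}
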
arch/x86_64/kernel/setup64.c
1 /* 1 /*
2 * X86-64 specific CPU setup. 2 * X86-64 specific CPU setup.
3 * Copyright (C) 1995 Linus Torvalds 3 * Copyright (C) 1995 Linus Torvalds
4 * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. 4 * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
5 * See setup.c for older changelog. 5 * See setup.c for older changelog.
6 * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $ 6 * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $
7 */ 7 */
8 #include <linux/config.h> 8 #include <linux/config.h>
9 #include <linux/init.h> 9 #include <linux/init.h>
10 #include <linux/kernel.h> 10 #include <linux/kernel.h>
11 #include <linux/sched.h> 11 #include <linux/sched.h>
12 #include <linux/string.h> 12 #include <linux/string.h>
13 #include <linux/bootmem.h> 13 #include <linux/bootmem.h>
14 #include <linux/bitops.h> 14 #include <linux/bitops.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <asm/bootsetup.h> 16 #include <asm/bootsetup.h>
17 #include <asm/pda.h> 17 #include <asm/pda.h>
18 #include <asm/pgtable.h> 18 #include <asm/pgtable.h>
19 #include <asm/processor.h> 19 #include <asm/processor.h>
20 #include <asm/desc.h> 20 #include <asm/desc.h>
21 #include <asm/atomic.h> 21 #include <asm/atomic.h>
22 #include <asm/mmu_context.h> 22 #include <asm/mmu_context.h>
23 #include <asm/smp.h> 23 #include <asm/smp.h>
24 #include <asm/i387.h> 24 #include <asm/i387.h>
25 #include <asm/percpu.h> 25 #include <asm/percpu.h>
26 #include <asm/proto.h> 26 #include <asm/proto.h>
27 #include <asm/sections.h> 27 #include <asm/sections.h>
28 28
29 char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; 29 char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
30 30
31 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 31 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
32 32
33 struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; 33 struct x8664_pda _cpu_pda[NR_CPUS] __cacheline_aligned;
34 34
35 struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; 35 struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table };
36 36
37 char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); 37 char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
38 38
39 unsigned long __supported_pte_mask __read_mostly = ~0UL; 39 unsigned long __supported_pte_mask __read_mostly = ~0UL;
40 static int do_not_nx __initdata = 0; 40 static int do_not_nx __initdata = 0;
41 41
42 /* noexec=on|off 42 /* noexec=on|off
43 Control non executable mappings for 64bit processes. 43 Control non executable mappings for 64bit processes.
44 44
45 on Enable(default) 45 on Enable(default)
46 off Disable 46 off Disable
47 */ 47 */
48 int __init nonx_setup(char *str) 48 int __init nonx_setup(char *str)
49 { 49 {
50 if (!strncmp(str, "on", 2)) { 50 if (!strncmp(str, "on", 2)) {
51 __supported_pte_mask |= _PAGE_NX; 51 __supported_pte_mask |= _PAGE_NX;
52 do_not_nx = 0; 52 do_not_nx = 0;
53 } else if (!strncmp(str, "off", 3)) { 53 } else if (!strncmp(str, "off", 3)) {
54 do_not_nx = 1; 54 do_not_nx = 1;
55 __supported_pte_mask &= ~_PAGE_NX; 55 __supported_pte_mask &= ~_PAGE_NX;
56 } 56 }
57 return 0; 57 return 0;
58 } 58 }
59 __setup("noexec=", nonx_setup); /* parsed early actually */ 59 __setup("noexec=", nonx_setup); /* parsed early actually */
60 60
61 int force_personality32 = READ_IMPLIES_EXEC; 61 int force_personality32 = READ_IMPLIES_EXEC;
62 62
63 /* noexec32=on|off 63 /* noexec32=on|off
64 Control non executable heap for 32bit processes. 64 Control non executable heap for 32bit processes.
65 To control the stack too use noexec=off 65 To control the stack too use noexec=off
66 66
67 on PROT_READ does not imply PROT_EXEC for 32bit processes 67 on PROT_READ does not imply PROT_EXEC for 32bit processes
68 off PROT_READ implies PROT_EXEC (default) 68 off PROT_READ implies PROT_EXEC (default)
69 */ 69 */
70 static int __init nonx32_setup(char *str) 70 static int __init nonx32_setup(char *str)
71 { 71 {
72 if (!strcmp(str, "on")) 72 if (!strcmp(str, "on"))
73 force_personality32 &= ~READ_IMPLIES_EXEC; 73 force_personality32 &= ~READ_IMPLIES_EXEC;
74 else if (!strcmp(str, "off")) 74 else if (!strcmp(str, "off"))
75 force_personality32 |= READ_IMPLIES_EXEC; 75 force_personality32 |= READ_IMPLIES_EXEC;
76 return 0; 76 return 0;
77 } 77 }
78 __setup("noexec32=", nonx32_setup); 78 __setup("noexec32=", nonx32_setup);
79 79
80 /* 80 /*
81 * Great future plan: 81 * Great future plan:
82 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 82 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
83 * Always point %gs to its beginning 83 * Always point %gs to its beginning
84 */ 84 */
85 void __init setup_per_cpu_areas(void) 85 void __init setup_per_cpu_areas(void)
86 { 86 {
87 int i; 87 int i;
88 unsigned long size; 88 unsigned long size;
89 89
90 #ifdef CONFIG_HOTPLUG_CPU 90 #ifdef CONFIG_HOTPLUG_CPU
91 prefill_possible_map(); 91 prefill_possible_map();
92 #endif 92 #endif
93 93
94 /* Copy section for each CPU (we discard the original) */ 94 /* Copy section for each CPU (we discard the original) */
95 size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); 95 size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
96 #ifdef CONFIG_MODULES 96 #ifdef CONFIG_MODULES
97 if (size < PERCPU_ENOUGH_ROOM) 97 if (size < PERCPU_ENOUGH_ROOM)
98 size = PERCPU_ENOUGH_ROOM; 98 size = PERCPU_ENOUGH_ROOM;
99 #endif 99 #endif
100 100
101 for_each_cpu_mask (i, cpu_possible_map) { 101 for_each_cpu_mask (i, cpu_possible_map) {
102 char *ptr; 102 char *ptr;
103 103
104 if (!NODE_DATA(cpu_to_node(i))) { 104 if (!NODE_DATA(cpu_to_node(i))) {
105 printk("cpu with no node %d, num_online_nodes %d\n", 105 printk("cpu with no node %d, num_online_nodes %d\n",
106 i, num_online_nodes()); 106 i, num_online_nodes());
107 ptr = alloc_bootmem(size); 107 ptr = alloc_bootmem(size);
108 } else { 108 } else {
109 ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); 109 ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
110 } 110 }
111 if (!ptr) 111 if (!ptr)
112 panic("Cannot allocate cpu data for CPU %d\n", i); 112 panic("Cannot allocate cpu data for CPU %d\n", i);
113 cpu_pda[i].data_offset = ptr - __per_cpu_start; 113 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
114 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 114 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
115 } 115 }
116 } 116 }
117 117
118 void pda_init(int cpu) 118 void pda_init(int cpu)
119 { 119 {
120 struct x8664_pda *pda = &cpu_pda[cpu]; 120 struct x8664_pda *pda = cpu_pda(cpu);
121 121
122 /* Setup up data that may be needed in __get_free_pages early */ 122 /* Setup up data that may be needed in __get_free_pages early */
123 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 123 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
124 wrmsrl(MSR_GS_BASE, cpu_pda + cpu); 124 wrmsrl(MSR_GS_BASE, pda);
125 125
126 pda->cpunumber = cpu; 126 pda->cpunumber = cpu;
127 pda->irqcount = -1; 127 pda->irqcount = -1;
128 pda->kernelstack = 128 pda->kernelstack =
129 (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; 129 (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
130 pda->active_mm = &init_mm; 130 pda->active_mm = &init_mm;
131 pda->mmu_state = 0; 131 pda->mmu_state = 0;
132 132
133 if (cpu == 0) { 133 if (cpu == 0) {
134 /* others are initialized in smpboot.c */ 134 /* others are initialized in smpboot.c */
135 pda->pcurrent = &init_task; 135 pda->pcurrent = &init_task;
136 pda->irqstackptr = boot_cpu_stack; 136 pda->irqstackptr = boot_cpu_stack;
137 } else { 137 } else {
138 pda->irqstackptr = (char *) 138 pda->irqstackptr = (char *)
139 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); 139 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
140 if (!pda->irqstackptr) 140 if (!pda->irqstackptr)
141 panic("cannot allocate irqstack for cpu %d", cpu); 141 panic("cannot allocate irqstack for cpu %d", cpu);
142 } 142 }
143 143
144 144
145 pda->irqstackptr += IRQSTACKSIZE-64; 145 pda->irqstackptr += IRQSTACKSIZE-64;
146 } 146 }
147 147
148 char boot_exception_stacks[(N_EXCEPTION_STACKS - 2) * EXCEPTION_STKSZ + DEBUG_STKSZ] 148 char boot_exception_stacks[(N_EXCEPTION_STACKS - 2) * EXCEPTION_STKSZ + DEBUG_STKSZ]
149 __attribute__((section(".bss.page_aligned"))); 149 __attribute__((section(".bss.page_aligned")));
150 150
151 /* May not be marked __init: used by software suspend */ 151 /* May not be marked __init: used by software suspend */
152 void syscall_init(void) 152 void syscall_init(void)
153 { 153 {
154 /* 154 /*
155 * LSTAR and STAR live in a bit strange symbiosis. 155 * LSTAR and STAR live in a bit strange symbiosis.
156 * They both write to the same internal register. STAR allows to set CS/DS 156 * They both write to the same internal register. STAR allows to set CS/DS
157 * but only a 32bit target. LSTAR sets the 64bit rip. 157 * but only a 32bit target. LSTAR sets the 64bit rip.
158 */ 158 */
159 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); 159 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
160 wrmsrl(MSR_LSTAR, system_call); 160 wrmsrl(MSR_LSTAR, system_call);
161 161
162 #ifdef CONFIG_IA32_EMULATION 162 #ifdef CONFIG_IA32_EMULATION
163 syscall32_cpu_init (); 163 syscall32_cpu_init ();
164 #endif 164 #endif
165 165
166 /* Flags to clear on syscall */ 166 /* Flags to clear on syscall */
167 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 167 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
168 } 168 }
169 169
170 void __cpuinit check_efer(void) 170 void __cpuinit check_efer(void)
171 { 171 {
172 unsigned long efer; 172 unsigned long efer;
173 173
174 rdmsrl(MSR_EFER, efer); 174 rdmsrl(MSR_EFER, efer);
175 if (!(efer & EFER_NX) || do_not_nx) { 175 if (!(efer & EFER_NX) || do_not_nx) {
176 __supported_pte_mask &= ~_PAGE_NX; 176 __supported_pte_mask &= ~_PAGE_NX;
177 } 177 }
178 } 178 }
179 179
180 /* 180 /*
181 * cpu_init() initializes state that is per-CPU. Some data is already 181 * cpu_init() initializes state that is per-CPU. Some data is already
182 * initialized (naturally) in the bootstrap process, such as the GDT 182 * initialized (naturally) in the bootstrap process, such as the GDT
183 * and IDT. We reload them nevertheless, this function acts as a 183 * and IDT. We reload them nevertheless, this function acts as a
184 * 'CPU state barrier', nothing should get across. 184 * 'CPU state barrier', nothing should get across.
185 * A lot of state is already set up in PDA init. 185 * A lot of state is already set up in PDA init.
186 */ 186 */
187 void __cpuinit cpu_init (void) 187 void __cpuinit cpu_init (void)
188 { 188 {
189 int cpu = stack_smp_processor_id(); 189 int cpu = stack_smp_processor_id();
190 struct tss_struct *t = &per_cpu(init_tss, cpu); 190 struct tss_struct *t = &per_cpu(init_tss, cpu);
191 unsigned long v; 191 unsigned long v;
192 char *estacks = NULL; 192 char *estacks = NULL;
193 struct task_struct *me; 193 struct task_struct *me;
194 int i; 194 int i;
195 195
196 /* CPU 0 is initialised in head64.c */ 196 /* CPU 0 is initialised in head64.c */
197 if (cpu != 0) { 197 if (cpu != 0) {
198 pda_init(cpu); 198 pda_init(cpu);
199 zap_low_mappings(cpu); 199 zap_low_mappings(cpu);
200 } else 200 } else
201 estacks = boot_exception_stacks; 201 estacks = boot_exception_stacks;
202 202
203 me = current; 203 me = current;
204 204
205 if (cpu_test_and_set(cpu, cpu_initialized)) 205 if (cpu_test_and_set(cpu, cpu_initialized))
206 panic("CPU#%d already initialized!\n", cpu); 206 panic("CPU#%d already initialized!\n", cpu);
207 207
208 printk("Initializing CPU#%d\n", cpu); 208 printk("Initializing CPU#%d\n", cpu);
209 209
210 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 210 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
211 211
212 /* 212 /*
213 * Initialize the per-CPU GDT with the boot GDT, 213 * Initialize the per-CPU GDT with the boot GDT,
214 * and set up the GDT descriptor: 214 * and set up the GDT descriptor:
215 */ 215 */
216 if (cpu) 216 if (cpu)
217 memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); 217 memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
218 218
219 cpu_gdt_descr[cpu].size = GDT_SIZE; 219 cpu_gdt_descr[cpu].size = GDT_SIZE;
220 asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); 220 asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
221 asm volatile("lidt %0" :: "m" (idt_descr)); 221 asm volatile("lidt %0" :: "m" (idt_descr));
222 222
223 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); 223 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
224 syscall_init(); 224 syscall_init();
225 225
226 wrmsrl(MSR_FS_BASE, 0); 226 wrmsrl(MSR_FS_BASE, 0);
227 wrmsrl(MSR_KERNEL_GS_BASE, 0); 227 wrmsrl(MSR_KERNEL_GS_BASE, 0);
228 barrier(); 228 barrier();
229 229
230 check_efer(); 230 check_efer();
231 231
232 /* 232 /*
233 * set up and load the per-CPU TSS 233 * set up and load the per-CPU TSS
234 */ 234 */
235 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 235 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
236 if (cpu) { 236 if (cpu) {
237 static const unsigned int order[N_EXCEPTION_STACKS] = { 237 static const unsigned int order[N_EXCEPTION_STACKS] = {
238 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, 238 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
239 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER 239 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
240 }; 240 };
241 241
242 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); 242 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
243 if (!estacks) 243 if (!estacks)
244 panic("Cannot allocate exception stack %ld %d\n", 244 panic("Cannot allocate exception stack %ld %d\n",
245 v, cpu); 245 v, cpu);
246 } 246 }
247 switch (v + 1) { 247 switch (v + 1) {
248 #if DEBUG_STKSZ > EXCEPTION_STKSZ 248 #if DEBUG_STKSZ > EXCEPTION_STKSZ
249 case DEBUG_STACK: 249 case DEBUG_STACK:
250 cpu_pda[cpu].debugstack = (unsigned long)estacks; 250 cpu_pda[cpu].debugstack = (unsigned long)estacks;
251 estacks += DEBUG_STKSZ; 251 estacks += DEBUG_STKSZ;
252 break; 252 break;
253 #endif 253 #endif
254 default: 254 default:
255 estacks += EXCEPTION_STKSZ; 255 estacks += EXCEPTION_STKSZ;
256 break; 256 break;
257 } 257 }
258 t->ist[v] = (unsigned long)estacks; 258 t->ist[v] = (unsigned long)estacks;
259 } 259 }
260 260
261 t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); 261 t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
262 /* 262 /*
263 * <= is required because the CPU will access up to 263 * <= is required because the CPU will access up to
264 * 8 bits beyond the end of the IO permission bitmap. 264 * 8 bits beyond the end of the IO permission bitmap.
265 */ 265 */
266 for (i = 0; i <= IO_BITMAP_LONGS; i++) 266 for (i = 0; i <= IO_BITMAP_LONGS; i++)
267 t->io_bitmap[i] = ~0UL; 267 t->io_bitmap[i] = ~0UL;
268 268
269 atomic_inc(&init_mm.mm_count); 269 atomic_inc(&init_mm.mm_count);
270 me->active_mm = &init_mm; 270 me->active_mm = &init_mm;
271 if (me->mm) 271 if (me->mm)
272 BUG(); 272 BUG();
273 enter_lazy_tlb(&init_mm, me); 273 enter_lazy_tlb(&init_mm, me);
274 274
275 set_tss_desc(cpu, t); 275 set_tss_desc(cpu, t);
276 load_TR_desc(); 276 load_TR_desc();
277 load_LDT(&init_mm.context); 277 load_LDT(&init_mm.context);
278 278
279 /* 279 /*
280 * Clear all 6 debug registers: 280 * Clear all 6 debug registers:
281 */ 281 */
282 282
283 set_debug(0UL, 0); 283 set_debug(0UL, 0);
284 set_debug(0UL, 1); 284 set_debug(0UL, 1);
285 set_debug(0UL, 2); 285 set_debug(0UL, 2);
286 set_debug(0UL, 3); 286 set_debug(0UL, 3);
287 set_debug(0UL, 6); 287 set_debug(0UL, 6);
288 set_debug(0UL, 7); 288 set_debug(0UL, 7);
289 289
290 fpu_init(); 290 fpu_init();
291 } 291 }
292 292
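
The setup64.c hunk is the one place that leaned on the array identity itself: pda_init() used to program MSR_GS_BASE with cpu_pda + cpu and now passes the pointer obtained from cpu_pda(cpu). Assuming the accessor expands to &_cpu_pda[i] (the header hunk is not in this excerpt), the two expressions name the same address; the rewrite simply stops assuming all PDAs sit in one contiguous array. A small check of that equivalence with stand-in definitions:

/* Stand-in definitions -- illustrative, not the kernel's. */
#include <assert.h>
#include <stdio.h>

#define NR_CPUS 4

struct x8664_pda { int cpunumber; };

static struct x8664_pda _cpu_pda[NR_CPUS];
#define cpu_pda(i) (&_cpu_pda[i])

int main(void)
{
	int cpu = 2;
	struct x8664_pda *pda = cpu_pda(cpu);

	/* old: wrmsrl(MSR_GS_BASE, cpu_pda + cpu);
	 * new: wrmsrl(MSR_GS_BASE, pda);           */
	assert(pda == _cpu_pda + cpu);
	printf("GS base for CPU%d would be %p either way\n", cpu, (void *)pda);
	return 0;
}
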
arch/x86_64/kernel/smpboot.c
1 /* 1 /*
2 * x86 SMP booting functions 2 * x86 SMP booting functions
3 * 3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> 4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> 5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs. 6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 * 7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to 8 * Much of the core SMP work is based on previous work by Thomas Radke, to
9 * whom a great many thanks are extended. 9 * whom a great many thanks are extended.
10 * 10 *
11 * Thanks to Intel for making available several different Pentium, 11 * Thanks to Intel for making available several different Pentium,
12 * Pentium Pro and Pentium-II/Xeon MP machines. 12 * Pentium Pro and Pentium-II/Xeon MP machines.
13 * Original development of Linux SMP code supported by Caldera. 13 * Original development of Linux SMP code supported by Caldera.
14 * 14 *
15 * This code is released under the GNU General Public License version 2 15 * This code is released under the GNU General Public License version 2
16 * 16 *
17 * Fixes 17 * Fixes
18 * Felix Koop : NR_CPUS used properly 18 * Felix Koop : NR_CPUS used properly
19 * Jose Renau : Handle single CPU case. 19 * Jose Renau : Handle single CPU case.
20 * Alan Cox : By repeated request 8) - Total BogoMIP report. 20 * Alan Cox : By repeated request 8) - Total BogoMIP report.
21 * Greg Wright : Fix for kernel stacks panic. 21 * Greg Wright : Fix for kernel stacks panic.
22 * Erich Boleyn : MP v1.4 and additional changes. 22 * Erich Boleyn : MP v1.4 and additional changes.
23 * Matthias Sattler : Changes for 2.1 kernel map. 23 * Matthias Sattler : Changes for 2.1 kernel map.
24 * Michel Lespinasse : Changes for 2.1 kernel map. 24 * Michel Lespinasse : Changes for 2.1 kernel map.
25 * Michael Chastain : Change trampoline.S to gnu as. 25 * Michael Chastain : Change trampoline.S to gnu as.
26 * Alan Cox : Dumb bug: 'B' step PPro's are fine 26 * Alan Cox : Dumb bug: 'B' step PPro's are fine
27 * Ingo Molnar : Added APIC timers, based on code 27 * Ingo Molnar : Added APIC timers, based on code
28 * from Jose Renau 28 * from Jose Renau
29 * Ingo Molnar : various cleanups and rewrites 29 * Ingo Molnar : various cleanups and rewrites
30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. 30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs 31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
32 * Andi Kleen : Changed for SMP boot into long mode. 32 * Andi Kleen : Changed for SMP boot into long mode.
33 * Rusty Russell : Hacked into shape for new "hotplug" boot process. 33 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
34 * Andi Kleen : Converted to new state machine. 34 * Andi Kleen : Converted to new state machine.
35 * Various cleanups. 35 * Various cleanups.
36 * Probably mostly hotplug CPU ready now. 36 * Probably mostly hotplug CPU ready now.
37 * Ashok Raj : CPU hotplug support 37 * Ashok Raj : CPU hotplug support
38 */ 38 */
39 39
40 40
41 #include <linux/config.h> 41 #include <linux/config.h>
42 #include <linux/init.h> 42 #include <linux/init.h>
43 43
44 #include <linux/mm.h> 44 #include <linux/mm.h>
45 #include <linux/kernel_stat.h> 45 #include <linux/kernel_stat.h>
46 #include <linux/smp_lock.h> 46 #include <linux/smp_lock.h>
47 #include <linux/bootmem.h> 47 #include <linux/bootmem.h>
48 #include <linux/thread_info.h> 48 #include <linux/thread_info.h>
49 #include <linux/module.h> 49 #include <linux/module.h>
50 50
51 #include <linux/delay.h> 51 #include <linux/delay.h>
52 #include <linux/mc146818rtc.h> 52 #include <linux/mc146818rtc.h>
53 #include <asm/mtrr.h> 53 #include <asm/mtrr.h>
54 #include <asm/pgalloc.h> 54 #include <asm/pgalloc.h>
55 #include <asm/desc.h> 55 #include <asm/desc.h>
56 #include <asm/kdebug.h> 56 #include <asm/kdebug.h>
57 #include <asm/tlbflush.h> 57 #include <asm/tlbflush.h>
58 #include <asm/proto.h> 58 #include <asm/proto.h>
59 #include <asm/nmi.h> 59 #include <asm/nmi.h>
60 #include <asm/irq.h> 60 #include <asm/irq.h>
61 #include <asm/hw_irq.h> 61 #include <asm/hw_irq.h>
62 62
63 /* Number of siblings per CPU package */ 63 /* Number of siblings per CPU package */
64 int smp_num_siblings = 1; 64 int smp_num_siblings = 1;
65 /* Package ID of each logical CPU */ 65 /* Package ID of each logical CPU */
66 u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; 66 u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
67 /* core ID of each logical CPU */ 67 /* core ID of each logical CPU */
68 u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; 68 u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
69 69
70 /* Bitmask of currently online CPUs */ 70 /* Bitmask of currently online CPUs */
71 cpumask_t cpu_online_map __read_mostly; 71 cpumask_t cpu_online_map __read_mostly;
72 72
73 EXPORT_SYMBOL(cpu_online_map); 73 EXPORT_SYMBOL(cpu_online_map);
74 74
75 /* 75 /*
76 * Private maps to synchronize booting between AP and BP. 76 * Private maps to synchronize booting between AP and BP.
77 * Probably not needed anymore, but it makes for easier debugging. -AK 77 * Probably not needed anymore, but it makes for easier debugging. -AK
78 */ 78 */
79 cpumask_t cpu_callin_map; 79 cpumask_t cpu_callin_map;
80 cpumask_t cpu_callout_map; 80 cpumask_t cpu_callout_map;
81 81
82 cpumask_t cpu_possible_map; 82 cpumask_t cpu_possible_map;
83 EXPORT_SYMBOL(cpu_possible_map); 83 EXPORT_SYMBOL(cpu_possible_map);
84 84
85 /* Per CPU bogomips and other parameters */ 85 /* Per CPU bogomips and other parameters */
86 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; 86 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
87 87
88 /* Set when the idlers are all forked */ 88 /* Set when the idlers are all forked */
89 int smp_threads_ready; 89 int smp_threads_ready;
90 90
91 /* representing HT siblings of each logical CPU */ 91 /* representing HT siblings of each logical CPU */
92 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; 92 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
93 93
94 /* representing HT and core siblings of each logical CPU */ 94 /* representing HT and core siblings of each logical CPU */
95 cpumask_t cpu_core_map[NR_CPUS] __read_mostly; 95 cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
96 EXPORT_SYMBOL(cpu_core_map); 96 EXPORT_SYMBOL(cpu_core_map);
97 97
98 /* 98 /*
99 * Trampoline 80x86 program as an array. 99 * Trampoline 80x86 program as an array.
100 */ 100 */
101 101
102 extern unsigned char trampoline_data[]; 102 extern unsigned char trampoline_data[];
103 extern unsigned char trampoline_end[]; 103 extern unsigned char trampoline_end[];
104 104
105 /* State of each CPU */ 105 /* State of each CPU */
106 DEFINE_PER_CPU(int, cpu_state) = { 0 }; 106 DEFINE_PER_CPU(int, cpu_state) = { 0 };
107 107
108 /* 108 /*
109 * Store all idle threads, this can be reused instead of creating 109 * Store all idle threads, this can be reused instead of creating
110 * a new thread. Also avoids complicated thread destroy functionality 110 * a new thread. Also avoids complicated thread destroy functionality
111 * for idle threads. 111 * for idle threads.
112 */ 112 */
113 struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; 113 struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
114 114
115 #define get_idle_for_cpu(x) (idle_thread_array[(x)]) 115 #define get_idle_for_cpu(x) (idle_thread_array[(x)])
116 #define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) 116 #define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
117 117
118 /* 118 /*
119 * Currently trivial. Write the real->protected mode 119 * Currently trivial. Write the real->protected mode
120 * bootstrap into the page concerned. The caller 120 * bootstrap into the page concerned. The caller
121 * has made sure it's suitably aligned. 121 * has made sure it's suitably aligned.
122 */ 122 */
123 123
124 static unsigned long __cpuinit setup_trampoline(void) 124 static unsigned long __cpuinit setup_trampoline(void)
125 { 125 {
126 void *tramp = __va(SMP_TRAMPOLINE_BASE); 126 void *tramp = __va(SMP_TRAMPOLINE_BASE);
127 memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); 127 memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
128 return virt_to_phys(tramp); 128 return virt_to_phys(tramp);
129 } 129 }
130 130
131 /* 131 /*
132 * The bootstrap kernel entry code has set these up. Save them for 132 * The bootstrap kernel entry code has set these up. Save them for
133 * a given CPU 133 * a given CPU
134 */ 134 */
135 135
136 static void __cpuinit smp_store_cpu_info(int id) 136 static void __cpuinit smp_store_cpu_info(int id)
137 { 137 {
138 struct cpuinfo_x86 *c = cpu_data + id; 138 struct cpuinfo_x86 *c = cpu_data + id;
139 139
140 *c = boot_cpu_data; 140 *c = boot_cpu_data;
141 identify_cpu(c); 141 identify_cpu(c);
142 print_cpu_info(c); 142 print_cpu_info(c);
143 } 143 }
144 144
145 /* 145 /*
146 * New Funky TSC sync algorithm borrowed from IA64. 146 * New Funky TSC sync algorithm borrowed from IA64.
147 * Main advantage is that it doesn't reset the TSCs fully and 147 * Main advantage is that it doesn't reset the TSCs fully and
148 * in general looks more robust and it works better than my earlier 148 * in general looks more robust and it works better than my earlier
149 * attempts. I believe it was written by David Mosberger. Some minor 149 * attempts. I believe it was written by David Mosberger. Some minor
150 * adjustments for x86-64 by me -AK 150 * adjustments for x86-64 by me -AK
151 * 151 *
152 * Original comment reproduced below. 152 * Original comment reproduced below.
153 * 153 *
154 * Synchronize TSC of the current (slave) CPU with the TSC of the 154 * Synchronize TSC of the current (slave) CPU with the TSC of the
155 * MASTER CPU (normally the time-keeper CPU). We use a closed loop to 155 * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
156 * eliminate the possibility of unaccounted-for errors (such as 156 * eliminate the possibility of unaccounted-for errors (such as
157 * getting a machine check in the middle of a calibration step). The 157 * getting a machine check in the middle of a calibration step). The
158 * basic idea is for the slave to ask the master what itc value it has 158 * basic idea is for the slave to ask the master what itc value it has
159 * and to read its own itc before and after the master responds. Each 159 * and to read its own itc before and after the master responds. Each
160 * iteration gives us three timestamps: 160 * iteration gives us three timestamps:
161 * 161 *
162 * slave master 162 * slave master
163 * 163 *
164 * t0 ---\ 164 * t0 ---\
165 * ---\ 165 * ---\
166 * ---> 166 * --->
167 * tm 167 * tm
168 * /--- 168 * /---
169 * /--- 169 * /---
170 * t1 <--- 170 * t1 <---
171 * 171 *
172 * 172 *
173 * The goal is to adjust the slave's TSC such that tm falls exactly 173 * The goal is to adjust the slave's TSC such that tm falls exactly
174 * half-way between t0 and t1. If we achieve this, the clocks are 174 * half-way between t0 and t1. If we achieve this, the clocks are
175 * synchronized provided the interconnect between the slave and the 175 * synchronized provided the interconnect between the slave and the
176 * master is symmetric. Even if the interconnect were asymmetric, we 176 * master is symmetric. Even if the interconnect were asymmetric, we
177 * would still know that the synchronization error is smaller than the 177 * would still know that the synchronization error is smaller than the
178 * roundtrip latency (t0 - t1). 178 * roundtrip latency (t0 - t1).
179 * 179 *
180 * When the interconnect is quiet and symmetric, this lets us 180 * When the interconnect is quiet and symmetric, this lets us
181 * synchronize the TSC to within one or two cycles. However, we can 181 * synchronize the TSC to within one or two cycles. However, we can
182 * only *guarantee* that the synchronization is accurate to within a 182 * only *guarantee* that the synchronization is accurate to within a
183 * round-trip time, which is typically in the range of several hundred 183 * round-trip time, which is typically in the range of several hundred
184 * cycles (e.g., ~500 cycles). In practice, this means that the TSCs 184 * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
185 * are usually almost perfectly synchronized, but we shouldn't assume 185 * are usually almost perfectly synchronized, but we shouldn't assume
186 * that the accuracy is much better than half a micro second or so. 186 * that the accuracy is much better than half a micro second or so.
187 * 187 *
188 * [there are other errors like the latency of RDTSC and of the 188 * [there are other errors like the latency of RDTSC and of the
189 * WRMSR. These can also account to hundreds of cycles. So it's 189 * WRMSR. These can also account to hundreds of cycles. So it's
190 * probably worse. It claims 153 cycles error on a dual Opteron, 190 * probably worse. It claims 153 cycles error on a dual Opteron,
191 * but I suspect the numbers are actually somewhat worse -AK] 191 * but I suspect the numbers are actually somewhat worse -AK]
192 */ 192 */
193 193
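/*
 * A toy illustration of the exchange described above, with made-up numbers
 * rather than real TSC reads: the slave samples t0, the master replies with
 * tm, the slave samples t1. If tm sits exactly halfway between t0 and t1 the
 * clocks agree; otherwise the signed distance of the midpoint from tm is the
 * slave's estimated offset, and its negation is the correction to apply.
 */
#include <stdio.h>

int main(void)
{
        long t0 = 1000000, t1 = 1000400;   /* slave timestamps around the exchange */
        long tm = 1000350;                 /* master timestamp, as reported back   */

        long rt    = t1 - t0;              /* round trip: bounds the residual error */
        long delta = (t0 + t1) / 2 - tm;   /* > 0: slave ahead, < 0: slave behind   */

        printf("round trip %ld cycles, slave off by about %ld cycles\n", rt, delta);
        printf("write TSC = current TSC + %ld to correct\n", -delta);
        return 0;
}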
194 #define MASTER 0 194 #define MASTER 0
195 #define SLAVE (SMP_CACHE_BYTES/8) 195 #define SLAVE (SMP_CACHE_BYTES/8)
196 196
197 /* Intentionally don't use cpu_relax() while TSC synchronization 197 /* Intentionally don't use cpu_relax() while TSC synchronization
198 because we don't want to go into funky power save modi or cause 198 because we don't want to go into funky power save modi or cause
199 hypervisors to schedule us away. Going to sleep would likely affect 199 hypervisors to schedule us away. Going to sleep would likely affect
200 latency and low latency is the primary objective here. -AK */ 200 latency and low latency is the primary objective here. -AK */
201 #define no_cpu_relax() barrier() 201 #define no_cpu_relax() barrier()
202 202
203 static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); 203 static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
204 static volatile __cpuinitdata unsigned long go[SLAVE + 1]; 204 static volatile __cpuinitdata unsigned long go[SLAVE + 1];
205 static int notscsync __cpuinitdata; 205 static int notscsync __cpuinitdata;
206 206
207 #undef DEBUG_TSC_SYNC 207 #undef DEBUG_TSC_SYNC
208 208
209 #define NUM_ROUNDS 64 /* magic value */ 209 #define NUM_ROUNDS 64 /* magic value */
210 #define NUM_ITERS 5 /* likewise */ 210 #define NUM_ITERS 5 /* likewise */
211 211
212 /* Callback on boot CPU */ 212 /* Callback on boot CPU */
213 static __cpuinit void sync_master(void *arg) 213 static __cpuinit void sync_master(void *arg)
214 { 214 {
215 unsigned long flags, i; 215 unsigned long flags, i;
216 216
217 go[MASTER] = 0; 217 go[MASTER] = 0;
218 218
219 local_irq_save(flags); 219 local_irq_save(flags);
220 { 220 {
221 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { 221 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
222 while (!go[MASTER]) 222 while (!go[MASTER])
223 no_cpu_relax(); 223 no_cpu_relax();
224 go[MASTER] = 0; 224 go[MASTER] = 0;
225 rdtscll(go[SLAVE]); 225 rdtscll(go[SLAVE]);
226 } 226 }
227 } 227 }
228 local_irq_restore(flags); 228 local_irq_restore(flags);
229 } 229 }
230 230
231 /* 231 /*
232 * Return the number of cycles by which our tsc differs from the tsc 232 * Return the number of cycles by which our tsc differs from the tsc
233 * on the master (time-keeper) CPU. A positive number indicates our 233 * on the master (time-keeper) CPU. A positive number indicates our
234 * tsc is ahead of the master, negative that it is behind. 234 * tsc is ahead of the master, negative that it is behind.
235 */ 235 */
236 static inline long 236 static inline long
237 get_delta(long *rt, long *master) 237 get_delta(long *rt, long *master)
238 { 238 {
239 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; 239 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
240 unsigned long tcenter, t0, t1, tm; 240 unsigned long tcenter, t0, t1, tm;
241 int i; 241 int i;
242 242
243 for (i = 0; i < NUM_ITERS; ++i) { 243 for (i = 0; i < NUM_ITERS; ++i) {
244 rdtscll(t0); 244 rdtscll(t0);
245 go[MASTER] = 1; 245 go[MASTER] = 1;
246 while (!(tm = go[SLAVE])) 246 while (!(tm = go[SLAVE]))
247 no_cpu_relax(); 247 no_cpu_relax();
248 go[SLAVE] = 0; 248 go[SLAVE] = 0;
249 rdtscll(t1); 249 rdtscll(t1);
250 250
251 if (t1 - t0 < best_t1 - best_t0) 251 if (t1 - t0 < best_t1 - best_t0)
252 best_t0 = t0, best_t1 = t1, best_tm = tm; 252 best_t0 = t0, best_t1 = t1, best_tm = tm;
253 } 253 }
254 254
255 *rt = best_t1 - best_t0; 255 *rt = best_t1 - best_t0;
256 *master = best_tm - best_t0; 256 *master = best_tm - best_t0;
257 257
258 /* average best_t0 and best_t1 without overflow: */ 258 /* average best_t0 and best_t1 without overflow: */
259 tcenter = (best_t0/2 + best_t1/2); 259 tcenter = (best_t0/2 + best_t1/2);
260 if (best_t0 % 2 + best_t1 % 2 == 2) 260 if (best_t0 % 2 + best_t1 % 2 == 2)
261 ++tcenter; 261 ++tcenter;
262 return tcenter - best_tm; 262 return tcenter - best_tm;
263 } 263 }
264 264
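/*
 * Why get_delta() averages as t0/2 + t1/2 plus a carry instead of (t0 + t1)/2:
 * a small demonstration (not kernel code) with values near ULONG_MAX, where
 * the naive sum wraps around before the division.
 */
#include <stdio.h>
#include <limits.h>

static unsigned long midpoint(unsigned long a, unsigned long b)
{
        unsigned long mid = a / 2 + b / 2;

        if (a % 2 + b % 2 == 2)         /* both halvings dropped a 1: add it back */
                mid++;
        return mid;
}

int main(void)
{
        unsigned long a = ULONG_MAX - 10;
        unsigned long b = ULONG_MAX - 4;

        printf("naive (a + b) / 2 = %lu (wrapped)\n", (a + b) / 2);
        printf("safe midpoint     = %lu\n", midpoint(a, b));
        return 0;
}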
265 static __cpuinit void sync_tsc(unsigned int master) 265 static __cpuinit void sync_tsc(unsigned int master)
266 { 266 {
267 int i, done = 0; 267 int i, done = 0;
268 long delta, adj, adjust_latency = 0; 268 long delta, adj, adjust_latency = 0;
269 unsigned long flags, rt, master_time_stamp, bound; 269 unsigned long flags, rt, master_time_stamp, bound;
270 #ifdef DEBUG_TSC_SYNC 270 #ifdef DEBUG_TSC_SYNC
271 static struct syncdebug { 271 static struct syncdebug {
272 long rt; /* roundtrip time */ 272 long rt; /* roundtrip time */
273 long master; /* master's timestamp */ 273 long master; /* master's timestamp */
274 long diff; /* difference between midpoint and master's timestamp */ 274 long diff; /* difference between midpoint and master's timestamp */
275 long lat; /* estimate of tsc adjustment latency */ 275 long lat; /* estimate of tsc adjustment latency */
276 } t[NUM_ROUNDS] __cpuinitdata; 276 } t[NUM_ROUNDS] __cpuinitdata;
277 #endif 277 #endif
278 278
279 printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", 279 printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n",
280 smp_processor_id(), master); 280 smp_processor_id(), master);
281 281
282 go[MASTER] = 1; 282 go[MASTER] = 1;
283 283
284 /* It is dangerous to broadcast IPI as cpus are coming up, 284 /* It is dangerous to broadcast IPI as cpus are coming up,
285 * as they may not be ready to accept them. So since 285 * as they may not be ready to accept them. So since
286 * we only need to reach the boot cpu, direct the IPI 286 * we only need to reach the boot cpu, direct the IPI
287 * to it alone and avoid the race. 287 * to it alone and avoid the race.
288 */ 288 */
289 smp_call_function_single(master, sync_master, NULL, 1, 0); 289 smp_call_function_single(master, sync_master, NULL, 1, 0);
290 290
291 while (go[MASTER]) /* wait for master to be ready */ 291 while (go[MASTER]) /* wait for master to be ready */
292 no_cpu_relax(); 292 no_cpu_relax();
293 293
294 spin_lock_irqsave(&tsc_sync_lock, flags); 294 spin_lock_irqsave(&tsc_sync_lock, flags);
295 { 295 {
296 for (i = 0; i < NUM_ROUNDS; ++i) { 296 for (i = 0; i < NUM_ROUNDS; ++i) {
297 delta = get_delta(&rt, &master_time_stamp); 297 delta = get_delta(&rt, &master_time_stamp);
298 if (delta == 0) { 298 if (delta == 0) {
299 done = 1; /* let's lock on to this... */ 299 done = 1; /* let's lock on to this... */
300 bound = rt; 300 bound = rt;
301 } 301 }
302 302
303 if (!done) { 303 if (!done) {
304 unsigned long t; 304 unsigned long t;
305 if (i > 0) { 305 if (i > 0) {
306 adjust_latency += -delta; 306 adjust_latency += -delta;
307 adj = -delta + adjust_latency/4; 307 adj = -delta + adjust_latency/4;
308 } else 308 } else
309 adj = -delta; 309 adj = -delta;
310 310
311 rdtscll(t); 311 rdtscll(t);
312 wrmsrl(MSR_IA32_TSC, t + adj); 312 wrmsrl(MSR_IA32_TSC, t + adj);
313 } 313 }
314 #ifdef DEBUG_TSC_SYNC 314 #ifdef DEBUG_TSC_SYNC
315 t[i].rt = rt; 315 t[i].rt = rt;
316 t[i].master = master_time_stamp; 316 t[i].master = master_time_stamp;
317 t[i].diff = delta; 317 t[i].diff = delta;
318 t[i].lat = adjust_latency/4; 318 t[i].lat = adjust_latency/4;
319 #endif 319 #endif
320 } 320 }
321 } 321 }
322 spin_unlock_irqrestore(&tsc_sync_lock, flags); 322 spin_unlock_irqrestore(&tsc_sync_lock, flags);
323 323
324 #ifdef DEBUG_TSC_SYNC 324 #ifdef DEBUG_TSC_SYNC
325 for (i = 0; i < NUM_ROUNDS; ++i) 325 for (i = 0; i < NUM_ROUNDS; ++i)
326 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", 326 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
327 t[i].rt, t[i].master, t[i].diff, t[i].lat); 327 t[i].rt, t[i].master, t[i].diff, t[i].lat);
328 #endif 328 #endif
329 329
330 printk(KERN_INFO 330 printk(KERN_INFO
331 "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " 331 "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
332 "maxerr %lu cycles)\n", 332 "maxerr %lu cycles)\n",
333 smp_processor_id(), master, delta, rt); 333 smp_processor_id(), master, delta, rt);
334 } 334 }
335 335
336 static void __cpuinit tsc_sync_wait(void) 336 static void __cpuinit tsc_sync_wait(void)
337 { 337 {
338 /* 338 /*
339 * When the CPU has synchronized TSCs assume the BIOS 339 * When the CPU has synchronized TSCs assume the BIOS
340 * or the hardware already synced. Otherwise we could 340 * or the hardware already synced. Otherwise we could
341 * mess up a possible perfect synchronization with a 341 * mess up a possible perfect synchronization with a
342 * not-quite-perfect algorithm. 342 * not-quite-perfect algorithm.
343 */ 343 */
344 if (notscsync || !cpu_has_tsc || !unsynchronized_tsc()) 344 if (notscsync || !cpu_has_tsc || !unsynchronized_tsc())
345 return; 345 return;
346 sync_tsc(0); 346 sync_tsc(0);
347 } 347 }
348 348
349 static __init int notscsync_setup(char *s) 349 static __init int notscsync_setup(char *s)
350 { 350 {
351 notscsync = 1; 351 notscsync = 1;
352 return 0; 352 return 0;
353 } 353 }
354 __setup("notscsync", notscsync_setup); 354 __setup("notscsync", notscsync_setup);
355 355
356 static atomic_t init_deasserted __cpuinitdata; 356 static atomic_t init_deasserted __cpuinitdata;
357 357
358 /* 358 /*
359 * Report back to the Boot Processor. 359 * Report back to the Boot Processor.
360 * Running on AP. 360 * Running on AP.
361 */ 361 */
362 void __cpuinit smp_callin(void) 362 void __cpuinit smp_callin(void)
363 { 363 {
364 int cpuid, phys_id; 364 int cpuid, phys_id;
365 unsigned long timeout; 365 unsigned long timeout;
366 366
367 /* 367 /*
368 * If woken up by an INIT in an 82489DX configuration 368 * If woken up by an INIT in an 82489DX configuration
369 * we may get here before an INIT-deassert IPI reaches 369 * we may get here before an INIT-deassert IPI reaches
370 * our local APIC. We have to wait for the IPI or we'll 370 * our local APIC. We have to wait for the IPI or we'll
371 * lock up on an APIC access. 371 * lock up on an APIC access.
372 */ 372 */
373 while (!atomic_read(&init_deasserted)) 373 while (!atomic_read(&init_deasserted))
374 cpu_relax(); 374 cpu_relax();
375 375
376 /* 376 /*
377 * (This works even if the APIC is not enabled.) 377 * (This works even if the APIC is not enabled.)
378 */ 378 */
379 phys_id = GET_APIC_ID(apic_read(APIC_ID)); 379 phys_id = GET_APIC_ID(apic_read(APIC_ID));
380 cpuid = smp_processor_id(); 380 cpuid = smp_processor_id();
381 if (cpu_isset(cpuid, cpu_callin_map)) { 381 if (cpu_isset(cpuid, cpu_callin_map)) {
382 panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", 382 panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
383 phys_id, cpuid); 383 phys_id, cpuid);
384 } 384 }
385 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); 385 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
386 386
387 /* 387 /*
388 * STARTUP IPIs are fragile beasts as they might sometimes 388 * STARTUP IPIs are fragile beasts as they might sometimes
389 * trigger some glue motherboard logic. Complete APIC bus 389 * trigger some glue motherboard logic. Complete APIC bus
390 * silence for 1 second, this overestimates the time the 390 * silence for 1 second, this overestimates the time the
391 * boot CPU is spending to send the up to 2 STARTUP IPIs 391 * boot CPU is spending to send the up to 2 STARTUP IPIs
392 * by a factor of two. This should be enough. 392 * by a factor of two. This should be enough.
393 */ 393 */
394 394
395 /* 395 /*
396 * Waiting 2s total for startup (udelay is not yet working) 396 * Waiting 2s total for startup (udelay is not yet working)
397 */ 397 */
398 timeout = jiffies + 2*HZ; 398 timeout = jiffies + 2*HZ;
399 while (time_before(jiffies, timeout)) { 399 while (time_before(jiffies, timeout)) {
400 /* 400 /*
401 * Has the boot CPU finished its STARTUP sequence? 401 * Has the boot CPU finished its STARTUP sequence?
402 */ 402 */
403 if (cpu_isset(cpuid, cpu_callout_map)) 403 if (cpu_isset(cpuid, cpu_callout_map))
404 break; 404 break;
405 cpu_relax(); 405 cpu_relax();
406 } 406 }
407 407
408 if (!time_before(jiffies, timeout)) { 408 if (!time_before(jiffies, timeout)) {
409 panic("smp_callin: CPU%d started up but did not get a callout!\n", 409 panic("smp_callin: CPU%d started up but did not get a callout!\n",
410 cpuid); 410 cpuid);
411 } 411 }
412 412
413 /* 413 /*
414 * the boot CPU has finished the init stage and is spinning 414 * the boot CPU has finished the init stage and is spinning
415 * on callin_map until we finish. We are free to set up this 415 * on callin_map until we finish. We are free to set up this
416 * CPU, first the APIC. (this is probably redundant on most 416 * CPU, first the APIC. (this is probably redundant on most
417 * boards) 417 * boards)
418 */ 418 */
419 419
420 Dprintk("CALLIN, before setup_local_APIC().\n"); 420 Dprintk("CALLIN, before setup_local_APIC().\n");
421 setup_local_APIC(); 421 setup_local_APIC();
422 422
423 /* 423 /*
424 * Get our bogomips. 424 * Get our bogomips.
425 * 425 *
426 * Need to enable IRQs because it can take longer and then 426 * Need to enable IRQs because it can take longer and then
427 * the NMI watchdog might kill us. 427 * the NMI watchdog might kill us.
428 */ 428 */
429 local_irq_enable(); 429 local_irq_enable();
430 calibrate_delay(); 430 calibrate_delay();
431 local_irq_disable(); 431 local_irq_disable();
432 Dprintk("Stack at about %p\n",&cpuid); 432 Dprintk("Stack at about %p\n",&cpuid);
433 433
434 disable_APIC_timer(); 434 disable_APIC_timer();
435 435
436 /* 436 /*
437 * Save our processor parameters 437 * Save our processor parameters
438 */ 438 */
439 smp_store_cpu_info(cpuid); 439 smp_store_cpu_info(cpuid);
440 440
441 /* 441 /*
442 * Allow the master to continue. 442 * Allow the master to continue.
443 */ 443 */
444 cpu_set(cpuid, cpu_callin_map); 444 cpu_set(cpuid, cpu_callin_map);
445 } 445 }
446 446
447 /* representing cpus for which sibling maps can be computed */ 447 /* representing cpus for which sibling maps can be computed */
448 static cpumask_t cpu_sibling_setup_map; 448 static cpumask_t cpu_sibling_setup_map;
449 449
450 static inline void set_cpu_sibling_map(int cpu) 450 static inline void set_cpu_sibling_map(int cpu)
451 { 451 {
452 int i; 452 int i;
453 struct cpuinfo_x86 *c = cpu_data; 453 struct cpuinfo_x86 *c = cpu_data;
454 454
455 cpu_set(cpu, cpu_sibling_setup_map); 455 cpu_set(cpu, cpu_sibling_setup_map);
456 456
457 if (smp_num_siblings > 1) { 457 if (smp_num_siblings > 1) {
458 for_each_cpu_mask(i, cpu_sibling_setup_map) { 458 for_each_cpu_mask(i, cpu_sibling_setup_map) {
459 if (phys_proc_id[cpu] == phys_proc_id[i] && 459 if (phys_proc_id[cpu] == phys_proc_id[i] &&
460 cpu_core_id[cpu] == cpu_core_id[i]) { 460 cpu_core_id[cpu] == cpu_core_id[i]) {
461 cpu_set(i, cpu_sibling_map[cpu]); 461 cpu_set(i, cpu_sibling_map[cpu]);
462 cpu_set(cpu, cpu_sibling_map[i]); 462 cpu_set(cpu, cpu_sibling_map[i]);
463 cpu_set(i, cpu_core_map[cpu]); 463 cpu_set(i, cpu_core_map[cpu]);
464 cpu_set(cpu, cpu_core_map[i]); 464 cpu_set(cpu, cpu_core_map[i]);
465 } 465 }
466 } 466 }
467 } else { 467 } else {
468 cpu_set(cpu, cpu_sibling_map[cpu]); 468 cpu_set(cpu, cpu_sibling_map[cpu]);
469 } 469 }
470 470
471 if (current_cpu_data.x86_max_cores == 1) { 471 if (current_cpu_data.x86_max_cores == 1) {
472 cpu_core_map[cpu] = cpu_sibling_map[cpu]; 472 cpu_core_map[cpu] = cpu_sibling_map[cpu];
473 c[cpu].booted_cores = 1; 473 c[cpu].booted_cores = 1;
474 return; 474 return;
475 } 475 }
476 476
477 for_each_cpu_mask(i, cpu_sibling_setup_map) { 477 for_each_cpu_mask(i, cpu_sibling_setup_map) {
478 if (phys_proc_id[cpu] == phys_proc_id[i]) { 478 if (phys_proc_id[cpu] == phys_proc_id[i]) {
479 cpu_set(i, cpu_core_map[cpu]); 479 cpu_set(i, cpu_core_map[cpu]);
480 cpu_set(cpu, cpu_core_map[i]); 480 cpu_set(cpu, cpu_core_map[i]);
481 /* 481 /*
482 * Does this new cpu bringup a new core? 482 * Does this new cpu bringup a new core?
483 */ 483 */
484 if (cpus_weight(cpu_sibling_map[cpu]) == 1) { 484 if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
485 /* 485 /*
486 * for each core in package, increment 486 * for each core in package, increment
487 * the booted_cores for this new cpu 487 * the booted_cores for this new cpu
488 */ 488 */
489 if (first_cpu(cpu_sibling_map[i]) == i) 489 if (first_cpu(cpu_sibling_map[i]) == i)
490 c[cpu].booted_cores++; 490 c[cpu].booted_cores++;
491 /* 491 /*
492 * increment the core count for all 492 * increment the core count for all
493 * the other cpus in this package 493 * the other cpus in this package
494 */ 494 */
495 if (i != cpu) 495 if (i != cpu)
496 c[i].booted_cores++; 496 c[i].booted_cores++;
497 } else if (i != cpu && !c[cpu].booted_cores) 497 } else if (i != cpu && !c[cpu].booted_cores)
498 c[cpu].booted_cores = c[i].booted_cores; 498 c[cpu].booted_cores = c[i].booted_cores;
499 } 499 }
500 } 500 }
501 } 501 }
502 502
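/*
 * A worked example of the sibling/core bookkeeping above, using plain bitmasks
 * instead of cpumask_t (not kernel code). Hypothetical topology: one package
 * (phys_proc_id 0) with two cores, each running two HT siblings, giving
 * logical CPUs 0..3. With this input the kernel's accounting would also end
 * up with booted_cores == 2 for every CPU in the package.
 */
#include <stdio.h>

#define NCPUS 4

int main(void)
{
        int phys_proc_id[NCPUS] = { 0, 0, 0, 0 };
        int cpu_core_id[NCPUS]  = { 0, 0, 1, 1 };
        unsigned sibling_map[NCPUS] = { 0 };
        unsigned core_map[NCPUS]    = { 0 };
        int cpu, i;

        for (cpu = 0; cpu < NCPUS; cpu++)
                for (i = 0; i < NCPUS; i++)
                        if (phys_proc_id[cpu] == phys_proc_id[i]) {
                                core_map[cpu] |= 1u << i;              /* same package */
                                if (cpu_core_id[cpu] == cpu_core_id[i])
                                        sibling_map[cpu] |= 1u << i;   /* same core */
                        }

        /* Expect: cpu0/cpu1 siblings=0x3, cpu2/cpu3 siblings=0xc, cores=0xf for all. */
        for (cpu = 0; cpu < NCPUS; cpu++)
                printf("cpu%d: siblings=%#x cores=%#x\n",
                       cpu, sibling_map[cpu], core_map[cpu]);
        return 0;
}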
503 /* 503 /*
504 * Setup code on secondary processor (after coming out of the trampoline) 504 * Setup code on secondary processor (after coming out of the trampoline)
505 */ 505 */
506 void __cpuinit start_secondary(void) 506 void __cpuinit start_secondary(void)
507 { 507 {
508 /* 508 /*
509 * Don't put anything before smp_callin(); SMP 509 * Don't put anything before smp_callin(); SMP
510 * booting is so fragile that we want to limit the 510 * booting is so fragile that we want to limit the
511 * things done here to the most necessary things. 511 * things done here to the most necessary things.
512 */ 512 */
513 cpu_init(); 513 cpu_init();
514 preempt_disable(); 514 preempt_disable();
515 smp_callin(); 515 smp_callin();
516 516
517 /* otherwise gcc will move up the smp_processor_id before the cpu_init */ 517 /* otherwise gcc will move up the smp_processor_id before the cpu_init */
518 barrier(); 518 barrier();
519 519
520 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); 520 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
521 setup_secondary_APIC_clock(); 521 setup_secondary_APIC_clock();
522 522
523 Dprintk("cpu %d: enabling apic timer\n", smp_processor_id()); 523 Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
524 524
525 if (nmi_watchdog == NMI_IO_APIC) { 525 if (nmi_watchdog == NMI_IO_APIC) {
526 disable_8259A_irq(0); 526 disable_8259A_irq(0);
527 enable_NMI_through_LVT0(NULL); 527 enable_NMI_through_LVT0(NULL);
528 enable_8259A_irq(0); 528 enable_8259A_irq(0);
529 } 529 }
530 530
531 enable_APIC_timer(); 531 enable_APIC_timer();
532 532
533 /* 533 /*
534 * The sibling maps must be set before turning the online map on for 534 * The sibling maps must be set before turning the online map on for
535 * this cpu 535 * this cpu
536 */ 536 */
537 set_cpu_sibling_map(smp_processor_id()); 537 set_cpu_sibling_map(smp_processor_id());
538 538
539 /* 539 /*
540 * Wait for TSC sync here so that nothing gets scheduled before it completes. 540 * Wait for TSC sync here so that nothing gets scheduled before it completes.
541 * We still process interrupts, which could see an inconsistent 541 * We still process interrupts, which could see an inconsistent
542 * time in that window unfortunately. 542 * time in that window unfortunately.
543 * Do this here because TSC sync has global unprotected state. 543 * Do this here because TSC sync has global unprotected state.
544 */ 544 */
545 tsc_sync_wait(); 545 tsc_sync_wait();
546 546
547 /* 547 /*
548 * We need to hold call_lock, so there is no inconsistency 548 * We need to hold call_lock, so there is no inconsistency
549 * between the time smp_call_function() determines number of 549 * between the time smp_call_function() determines number of
550 * IPI recipients, and the time when the determination is made 550 * IPI recipients, and the time when the determination is made
551 * for which cpus receive the IPI in genapic_flat.c. Holding this 551 * for which cpus receive the IPI in genapic_flat.c. Holding this
552 * lock helps us to not include this cpu in a currently in progress 552 * lock helps us to not include this cpu in a currently in progress
553 * smp_call_function(). 553 * smp_call_function().
554 */ 554 */
555 lock_ipi_call_lock(); 555 lock_ipi_call_lock();
556 556
557 /* 557 /*
558 * Allow the master to continue. 558 * Allow the master to continue.
559 */ 559 */
560 cpu_set(smp_processor_id(), cpu_online_map); 560 cpu_set(smp_processor_id(), cpu_online_map);
561 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 561 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
562 unlock_ipi_call_lock(); 562 unlock_ipi_call_lock();
563 563
564 cpu_idle(); 564 cpu_idle();
565 } 565 }
566 566
567 extern volatile unsigned long init_rsp; 567 extern volatile unsigned long init_rsp;
568 extern void (*initial_code)(void); 568 extern void (*initial_code)(void);
569 569
570 #ifdef APIC_DEBUG 570 #ifdef APIC_DEBUG
571 static void inquire_remote_apic(int apicid) 571 static void inquire_remote_apic(int apicid)
572 { 572 {
573 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 573 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
574 char *names[] = { "ID", "VERSION", "SPIV" }; 574 char *names[] = { "ID", "VERSION", "SPIV" };
575 int timeout, status; 575 int timeout, status;
576 576
577 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); 577 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
578 578
579 for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) { 579 for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
580 printk("... APIC #%d %s: ", apicid, names[i]); 580 printk("... APIC #%d %s: ", apicid, names[i]);
581 581
582 /* 582 /*
583 * Wait for idle. 583 * Wait for idle.
584 */ 584 */
585 apic_wait_icr_idle(); 585 apic_wait_icr_idle();
586 586
587 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 587 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
588 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); 588 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
589 589
590 timeout = 0; 590 timeout = 0;
591 do { 591 do {
592 udelay(100); 592 udelay(100);
593 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; 593 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
594 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); 594 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
595 595
596 switch (status) { 596 switch (status) {
597 case APIC_ICR_RR_VALID: 597 case APIC_ICR_RR_VALID:
598 status = apic_read(APIC_RRR); 598 status = apic_read(APIC_RRR);
599 printk("%08x\n", status); 599 printk("%08x\n", status);
600 break; 600 break;
601 default: 601 default:
602 printk("failed\n"); 602 printk("failed\n");
603 } 603 }
604 } 604 }
605 } 605 }
606 #endif 606 #endif
607 607
608 /* 608 /*
609 * Kick the secondary to wake up. 609 * Kick the secondary to wake up.
610 */ 610 */
611 static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) 611 static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
612 { 612 {
613 unsigned long send_status = 0, accept_status = 0; 613 unsigned long send_status = 0, accept_status = 0;
614 int maxlvt, timeout, num_starts, j; 614 int maxlvt, timeout, num_starts, j;
615 615
616 Dprintk("Asserting INIT.\n"); 616 Dprintk("Asserting INIT.\n");
617 617
618 /* 618 /*
619 * Turn INIT on target chip 619 * Turn INIT on target chip
620 */ 620 */
621 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 621 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
622 622
623 /* 623 /*
624 * Send IPI 624 * Send IPI
625 */ 625 */
626 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT 626 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
627 | APIC_DM_INIT); 627 | APIC_DM_INIT);
628 628
629 Dprintk("Waiting for send to finish...\n"); 629 Dprintk("Waiting for send to finish...\n");
630 timeout = 0; 630 timeout = 0;
631 do { 631 do {
632 Dprintk("+"); 632 Dprintk("+");
633 udelay(100); 633 udelay(100);
634 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; 634 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
635 } while (send_status && (timeout++ < 1000)); 635 } while (send_status && (timeout++ < 1000));
636 636
637 mdelay(10); 637 mdelay(10);
638 638
639 Dprintk("Deasserting INIT.\n"); 639 Dprintk("Deasserting INIT.\n");
640 640
641 /* Target chip */ 641 /* Target chip */
642 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 642 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
643 643
644 /* Send IPI */ 644 /* Send IPI */
645 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 645 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
646 646
647 Dprintk("Waiting for send to finish...\n"); 647 Dprintk("Waiting for send to finish...\n");
648 timeout = 0; 648 timeout = 0;
649 do { 649 do {
650 Dprintk("+"); 650 Dprintk("+");
651 udelay(100); 651 udelay(100);
652 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; 652 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
653 } while (send_status && (timeout++ < 1000)); 653 } while (send_status && (timeout++ < 1000));
654 654
655 mb(); 655 mb();
656 atomic_set(&init_deasserted, 1); 656 atomic_set(&init_deasserted, 1);
657 657
658 num_starts = 2; 658 num_starts = 2;
659 659
660 /* 660 /*
661 * Run STARTUP IPI loop. 661 * Run STARTUP IPI loop.
662 */ 662 */
663 Dprintk("#startup loops: %d.\n", num_starts); 663 Dprintk("#startup loops: %d.\n", num_starts);
664 664
665 maxlvt = get_maxlvt(); 665 maxlvt = get_maxlvt();
666 666
667 for (j = 1; j <= num_starts; j++) { 667 for (j = 1; j <= num_starts; j++) {
668 Dprintk("Sending STARTUP #%d.\n",j); 668 Dprintk("Sending STARTUP #%d.\n",j);
669 apic_read_around(APIC_SPIV); 669 apic_read_around(APIC_SPIV);
670 apic_write(APIC_ESR, 0); 670 apic_write(APIC_ESR, 0);
671 apic_read(APIC_ESR); 671 apic_read(APIC_ESR);
672 Dprintk("After apic_write.\n"); 672 Dprintk("After apic_write.\n");
673 673
674 /* 674 /*
675 * STARTUP IPI 675 * STARTUP IPI
676 */ 676 */
677 677
678 /* Target chip */ 678 /* Target chip */
679 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 679 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
680 680
681 /* Boot on the stack */ 681 /* Boot on the stack */
682 /* Kick the second */ 682 /* Kick the second */
683 apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12)); 683 apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12));
684 684
685 /* 685 /*
686 * Give the other CPU some time to accept the IPI. 686 * Give the other CPU some time to accept the IPI.
687 */ 687 */
688 udelay(300); 688 udelay(300);
689 689
690 Dprintk("Startup point 1.\n"); 690 Dprintk("Startup point 1.\n");
691 691
692 Dprintk("Waiting for send to finish...\n"); 692 Dprintk("Waiting for send to finish...\n");
693 timeout = 0; 693 timeout = 0;
694 do { 694 do {
695 Dprintk("+"); 695 Dprintk("+");
696 udelay(100); 696 udelay(100);
697 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; 697 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
698 } while (send_status && (timeout++ < 1000)); 698 } while (send_status && (timeout++ < 1000));
699 699
700 /* 700 /*
701 * Give the other CPU some time to accept the IPI. 701 * Give the other CPU some time to accept the IPI.
702 */ 702 */
703 udelay(200); 703 udelay(200);
704 /* 704 /*
705 * Due to the Pentium erratum 3AP. 705 * Due to the Pentium erratum 3AP.
706 */ 706 */
707 if (maxlvt > 3) { 707 if (maxlvt > 3) {
708 apic_read_around(APIC_SPIV); 708 apic_read_around(APIC_SPIV);
709 apic_write(APIC_ESR, 0); 709 apic_write(APIC_ESR, 0);
710 } 710 }
711 accept_status = (apic_read(APIC_ESR) & 0xEF); 711 accept_status = (apic_read(APIC_ESR) & 0xEF);
712 if (send_status || accept_status) 712 if (send_status || accept_status)
713 break; 713 break;
714 } 714 }
715 Dprintk("After Startup.\n"); 715 Dprintk("After Startup.\n");
716 716
717 if (send_status) 717 if (send_status)
718 printk(KERN_ERR "APIC never delivered???\n"); 718 printk(KERN_ERR "APIC never delivered???\n");
719 if (accept_status) 719 if (accept_status)
720 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); 720 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
721 721
722 return (send_status | accept_status); 722 return (send_status | accept_status);
723 } 723 }
724 724
725 struct create_idle { 725 struct create_idle {
726 struct task_struct *idle; 726 struct task_struct *idle;
727 struct completion done; 727 struct completion done;
728 int cpu; 728 int cpu;
729 }; 729 };
730 730
731 void do_fork_idle(void *_c_idle) 731 void do_fork_idle(void *_c_idle)
732 { 732 {
733 struct create_idle *c_idle = _c_idle; 733 struct create_idle *c_idle = _c_idle;
734 734
735 c_idle->idle = fork_idle(c_idle->cpu); 735 c_idle->idle = fork_idle(c_idle->cpu);
736 complete(&c_idle->done); 736 complete(&c_idle->done);
737 } 737 }
738 738
739 /* 739 /*
740 * Boot one CPU. 740 * Boot one CPU.
741 */ 741 */
742 static int __cpuinit do_boot_cpu(int cpu, int apicid) 742 static int __cpuinit do_boot_cpu(int cpu, int apicid)
743 { 743 {
744 unsigned long boot_error; 744 unsigned long boot_error;
745 int timeout; 745 int timeout;
746 unsigned long start_rip; 746 unsigned long start_rip;
747 struct create_idle c_idle = { 747 struct create_idle c_idle = {
748 .cpu = cpu, 748 .cpu = cpu,
749 .done = COMPLETION_INITIALIZER(c_idle.done), 749 .done = COMPLETION_INITIALIZER(c_idle.done),
750 }; 750 };
751 DECLARE_WORK(work, do_fork_idle, &c_idle); 751 DECLARE_WORK(work, do_fork_idle, &c_idle);
752 752
753 /* allocate memory for gdts of secondary cpus. Hotplug is considered */ 753 /* allocate memory for gdts of secondary cpus. Hotplug is considered */
754 if (!cpu_gdt_descr[cpu].address && 754 if (!cpu_gdt_descr[cpu].address &&
755 !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { 755 !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
756 printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu); 756 printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
757 return -1; 757 return -1;
758 } 758 }
759 759
760 c_idle.idle = get_idle_for_cpu(cpu); 760 c_idle.idle = get_idle_for_cpu(cpu);
761 761
762 if (c_idle.idle) { 762 if (c_idle.idle) {
763 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) 763 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
764 (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1); 764 (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1);
765 init_idle(c_idle.idle, cpu); 765 init_idle(c_idle.idle, cpu);
766 goto do_rest; 766 goto do_rest;
767 } 767 }
768 768
769 /* 769 /*
770 * During the cold boot process, the keventd thread is not spun up yet. 770 * During the cold boot process, the keventd thread is not spun up yet.
771 * When we do cpu hot-add, we create idle threads on the fly, and we should 771 * When we do cpu hot-add, we create idle threads on the fly, and we should
772 * not acquire any attributes from the calling context. Hence the clean 772 * not acquire any attributes from the calling context. Hence the clean
773 * way to create kernel_threads() is to do that from keventd(). 773 * way to create kernel_threads() is to do that from keventd().
774 * We do the current_is_keventd() due to the fact that ACPI notifier 774 * We do the current_is_keventd() due to the fact that ACPI notifier
775 * was also queuing to keventd() and when the caller is already running 775 * was also queuing to keventd() and when the caller is already running
776 * in context of keventd(), we would end up with locking up the keventd 776 * in context of keventd(), we would end up with locking up the keventd
777 * thread. 777 * thread.
778 */ 778 */
779 if (!keventd_up() || current_is_keventd()) 779 if (!keventd_up() || current_is_keventd())
780 work.func(work.data); 780 work.func(work.data);
781 else { 781 else {
782 schedule_work(&work); 782 schedule_work(&work);
783 wait_for_completion(&c_idle.done); 783 wait_for_completion(&c_idle.done);
784 } 784 }
785 785
786 if (IS_ERR(c_idle.idle)) { 786 if (IS_ERR(c_idle.idle)) {
787 printk("failed fork for CPU %d\n", cpu); 787 printk("failed fork for CPU %d\n", cpu);
788 return PTR_ERR(c_idle.idle); 788 return PTR_ERR(c_idle.idle);
789 } 789 }
790 790
791 set_idle_for_cpu(cpu, c_idle.idle); 791 set_idle_for_cpu(cpu, c_idle.idle);
792 792
793 do_rest: 793 do_rest:
794 794
795 cpu_pda[cpu].pcurrent = c_idle.idle; 795 cpu_pda(cpu)->pcurrent = c_idle.idle;
796 796
797 start_rip = setup_trampoline(); 797 start_rip = setup_trampoline();
798 798
799 init_rsp = c_idle.idle->thread.rsp; 799 init_rsp = c_idle.idle->thread.rsp;
800 per_cpu(init_tss,cpu).rsp0 = init_rsp; 800 per_cpu(init_tss,cpu).rsp0 = init_rsp;
801 initial_code = start_secondary; 801 initial_code = start_secondary;
802 clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); 802 clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK);
803 803
804 printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu, 804 printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu,
805 cpus_weight(cpu_present_map), 805 cpus_weight(cpu_present_map),
806 apicid); 806 apicid);
807 807
808 /* 808 /*
809 * This grunge runs the startup process for 809 * This grunge runs the startup process for
810 * the targeted processor. 810 * the targeted processor.
811 */ 811 */
812 812
813 atomic_set(&init_deasserted, 0); 813 atomic_set(&init_deasserted, 0);
814 814
815 Dprintk("Setting warm reset code and vector.\n"); 815 Dprintk("Setting warm reset code and vector.\n");
816 816
817 CMOS_WRITE(0xa, 0xf); 817 CMOS_WRITE(0xa, 0xf);
818 local_flush_tlb(); 818 local_flush_tlb();
819 Dprintk("1.\n"); 819 Dprintk("1.\n");
820 *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4; 820 *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
821 Dprintk("2.\n"); 821 Dprintk("2.\n");
822 *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf; 822 *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
823 Dprintk("3.\n"); 823 Dprintk("3.\n");
824 824
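/*
 * Why start_rip is split with ">> 4" and "& 0xf" just above (a sketch with a
 * made-up address; the real value comes from setup_trampoline()): the BIOS
 * warm-reset vector at 0x467 is a real-mode segment:offset pair, offset in
 * the low word at 0x467 and segment in the high word at 0x469, and real mode
 * resolves it as segment * 16 + offset. The STARTUP IPI sent earlier encodes
 * the same trampoline as its 4K page number, start_rip >> 12.
 */
#include <stdio.h>

int main(void)
{
        unsigned long start_rip = 0x6000;          /* assumed trampoline address below 1MB */

        unsigned short segment = start_rip >> 4;   /* stored at 0x469 */
        unsigned short offset  = start_rip & 0xf;  /* stored at 0x467 */

        unsigned long resolved = (unsigned long)segment * 16 + offset;

        printf("segment=%#x offset=%#x resolves to physical %#lx\n",
               (unsigned)segment, (unsigned)offset, resolved);
        return 0;
}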
825 /* 825 /*
826 * Be paranoid about clearing APIC errors. 826 * Be paranoid about clearing APIC errors.
827 */ 827 */
828 if (APIC_INTEGRATED(apic_version[apicid])) { 828 if (APIC_INTEGRATED(apic_version[apicid])) {
829 apic_read_around(APIC_SPIV); 829 apic_read_around(APIC_SPIV);
830 apic_write(APIC_ESR, 0); 830 apic_write(APIC_ESR, 0);
831 apic_read(APIC_ESR); 831 apic_read(APIC_ESR);
832 } 832 }
833 833
834 /* 834 /*
835 * Status is now clean 835 * Status is now clean
836 */ 836 */
837 boot_error = 0; 837 boot_error = 0;
838 838
839 /* 839 /*
840 * Starting actual IPI sequence... 840 * Starting actual IPI sequence...
841 */ 841 */
842 boot_error = wakeup_secondary_via_INIT(apicid, start_rip); 842 boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
843 843
844 if (!boot_error) { 844 if (!boot_error) {
845 /* 845 /*
846 * allow APs to start initializing. 846 * allow APs to start initializing.
847 */ 847 */
848 Dprintk("Before Callout %d.\n", cpu); 848 Dprintk("Before Callout %d.\n", cpu);
849 cpu_set(cpu, cpu_callout_map); 849 cpu_set(cpu, cpu_callout_map);
850 Dprintk("After Callout %d.\n", cpu); 850 Dprintk("After Callout %d.\n", cpu);
851 851
852 /* 852 /*
853 * Wait 5s total for a response 853 * Wait 5s total for a response
854 */ 854 */
855 for (timeout = 0; timeout < 50000; timeout++) { 855 for (timeout = 0; timeout < 50000; timeout++) {
856 if (cpu_isset(cpu, cpu_callin_map)) 856 if (cpu_isset(cpu, cpu_callin_map))
857 break; /* It has booted */ 857 break; /* It has booted */
858 udelay(100); 858 udelay(100);
859 } 859 }
860 860
861 if (cpu_isset(cpu, cpu_callin_map)) { 861 if (cpu_isset(cpu, cpu_callin_map)) {
862 /* number CPUs logically, starting from 1 (BSP is 0) */ 862 /* number CPUs logically, starting from 1 (BSP is 0) */
863 Dprintk("CPU has booted.\n"); 863 Dprintk("CPU has booted.\n");
864 } else { 864 } else {
865 boot_error = 1; 865 boot_error = 1;
866 if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE)) 866 if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
867 == 0xA5) 867 == 0xA5)
868 /* trampoline started but...? */ 868 /* trampoline started but...? */
869 printk("Stuck ??\n"); 869 printk("Stuck ??\n");
870 else 870 else
871 /* trampoline code not run */ 871 /* trampoline code not run */
872 printk("Not responding.\n"); 872 printk("Not responding.\n");
873 #ifdef APIC_DEBUG 873 #ifdef APIC_DEBUG
874 inquire_remote_apic(apicid); 874 inquire_remote_apic(apicid);
875 #endif 875 #endif
876 } 876 }
877 } 877 }
878 if (boot_error) { 878 if (boot_error) {
879 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ 879 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
880 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ 880 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
881 cpu_clear(cpu, cpu_present_map); 881 cpu_clear(cpu, cpu_present_map);
882 cpu_clear(cpu, cpu_possible_map); 882 cpu_clear(cpu, cpu_possible_map);
883 x86_cpu_to_apicid[cpu] = BAD_APICID; 883 x86_cpu_to_apicid[cpu] = BAD_APICID;
884 x86_cpu_to_log_apicid[cpu] = BAD_APICID; 884 x86_cpu_to_log_apicid[cpu] = BAD_APICID;
885 return -EIO; 885 return -EIO;
886 } 886 }
887 887
888 return 0; 888 return 0;
889 } 889 }
890 890
891 cycles_t cacheflush_time; 891 cycles_t cacheflush_time;
892 unsigned long cache_decay_ticks; 892 unsigned long cache_decay_ticks;
893 893
894 /* 894 /*
895 * Cleanup possible dangling ends... 895 * Cleanup possible dangling ends...
896 */ 896 */
897 static __cpuinit void smp_cleanup_boot(void) 897 static __cpuinit void smp_cleanup_boot(void)
898 { 898 {
899 /* 899 /*
900 * Paranoid: Set warm reset code and vector here back 900 * Paranoid: Set warm reset code and vector here back
901 * to default values. 901 * to default values.
902 */ 902 */
903 CMOS_WRITE(0, 0xf); 903 CMOS_WRITE(0, 0xf);
904 904
905 /* 905 /*
906 * Reset trampoline flag 906 * Reset trampoline flag
907 */ 907 */
908 *((volatile int *) phys_to_virt(0x467)) = 0; 908 *((volatile int *) phys_to_virt(0x467)) = 0;
909 } 909 }
910 910
911 /* 911 /*
912 * Fall back to non SMP mode after errors. 912 * Fall back to non SMP mode after errors.
913 * 913 *
914 * RED-PEN audit/test this more. I bet there is more state messed up here. 914 * RED-PEN audit/test this more. I bet there is more state messed up here.
915 */ 915 */
916 static __init void disable_smp(void) 916 static __init void disable_smp(void)
917 { 917 {
918 cpu_present_map = cpumask_of_cpu(0); 918 cpu_present_map = cpumask_of_cpu(0);
919 cpu_possible_map = cpumask_of_cpu(0); 919 cpu_possible_map = cpumask_of_cpu(0);
920 if (smp_found_config) 920 if (smp_found_config)
921 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); 921 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
922 else 922 else
923 phys_cpu_present_map = physid_mask_of_physid(0); 923 phys_cpu_present_map = physid_mask_of_physid(0);
924 cpu_set(0, cpu_sibling_map[0]); 924 cpu_set(0, cpu_sibling_map[0]);
925 cpu_set(0, cpu_core_map[0]); 925 cpu_set(0, cpu_core_map[0]);
926 } 926 }
927 927
928 #ifdef CONFIG_HOTPLUG_CPU 928 #ifdef CONFIG_HOTPLUG_CPU
929 929
930 int additional_cpus __initdata = -1; 930 int additional_cpus __initdata = -1;
931 931
932 /* 932 /*
933 * cpu_possible_map should be static; it cannot change as CPUs 933 * cpu_possible_map should be static; it cannot change as CPUs
934 * are onlined or offlined. The reason is that per-cpu data structures 934 * are onlined or offlined. The reason is that per-cpu data structures
935 * are allocated by some modules at init time, and they don't expect to 935 * are allocated by some modules at init time, and they don't expect to
936 * do this dynamically on cpu arrival/departure. 936 * do this dynamically on cpu arrival/departure.
937 * cpu_present_map on the other hand can change dynamically. 937 * cpu_present_map on the other hand can change dynamically.
938 * When cpu_hotplug is not compiled in, we fall back to the current 938 * When cpu_hotplug is not compiled in, we fall back to the current
939 * behaviour, which is cpu_possible == cpu_present. 939 * behaviour, which is cpu_possible == cpu_present.
940 * - Ashok Raj 940 * - Ashok Raj
941 * 941 *
942 * Three ways to find out the number of additional hotplug CPUs: 942 * Three ways to find out the number of additional hotplug CPUs:
943 * - If the BIOS specified disabled CPUs in ACPI/mptables use that. 943 * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
944 * - The user can overwrite it with additional_cpus=NUM 944 * - The user can overwrite it with additional_cpus=NUM
945 * - Otherwise don't reserve additional CPUs. 945 * - Otherwise don't reserve additional CPUs.
946 * We do this because additional CPUs waste a lot of memory. 946 * We do this because additional CPUs waste a lot of memory.
947 * -AK 947 * -AK
948 */ 948 */
949 __init void prefill_possible_map(void) 949 __init void prefill_possible_map(void)
950 { 950 {
951 int i; 951 int i;
952 int possible; 952 int possible;
953 953
954 if (additional_cpus == -1) { 954 if (additional_cpus == -1) {
955 if (disabled_cpus > 0) 955 if (disabled_cpus > 0)
956 additional_cpus = disabled_cpus; 956 additional_cpus = disabled_cpus;
957 else 957 else
958 additional_cpus = 0; 958 additional_cpus = 0;
959 } 959 }
960 possible = num_processors + additional_cpus; 960 possible = num_processors + additional_cpus;
961 if (possible > NR_CPUS) 961 if (possible > NR_CPUS)
962 possible = NR_CPUS; 962 possible = NR_CPUS;
963 963
964 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 964 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
965 possible, 965 possible,
966 max_t(int, possible - num_processors, 0)); 966 max_t(int, possible - num_processors, 0));
967 967
968 for (i = 0; i < possible; i++) 968 for (i = 0; i < possible; i++)
969 cpu_set(i, cpu_possible_map); 969 cpu_set(i, cpu_possible_map);
970 } 970 }
971 #endif 971 #endif
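/*
 * A toy walk-through of the sizing rule in prefill_possible_map() above, with
 * made-up numbers standing in for num_processors, disabled_cpus and NR_CPUS:
 * additional_cpus defaults to the BIOS-disabled count, may be overridden with
 * additional_cpus=NUM on the command line, and the total is clamped to NR_CPUS.
 */
#include <stdio.h>

#define NR_CPUS 8                      /* assumed build-time limit */

int main(void)
{
        int num_processors  = 4;       /* CPUs enumerated via ACPI/MP tables */
        int disabled_cpus   = 2;       /* listed by the BIOS but disabled    */
        int additional_cpus = -1;      /* -1 means no command line override  */
        int possible;

        if (additional_cpus == -1)
                additional_cpus = disabled_cpus > 0 ? disabled_cpus : 0;

        possible = num_processors + additional_cpus;
        if (possible > NR_CPUS)
                possible = NR_CPUS;

        printf("SMP: Allowing %d CPUs, %d hotplug CPUs\n",
               possible, possible - num_processors);
        return 0;
}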
972 972
973 /* 973 /*
974 * Various sanity checks. 974 * Various sanity checks.
975 */ 975 */
976 static int __init smp_sanity_check(unsigned max_cpus) 976 static int __init smp_sanity_check(unsigned max_cpus)
977 { 977 {
978 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 978 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
979 printk("weird, boot CPU (#%d) not listed by the BIOS.\n", 979 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
980 hard_smp_processor_id()); 980 hard_smp_processor_id());
981 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 981 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
982 } 982 }
983 983
984 /* 984 /*
985 * If we couldn't find an SMP configuration at boot time, 985 * If we couldn't find an SMP configuration at boot time,
986 * get out of here now! 986 * get out of here now!
987 */ 987 */
988 if (!smp_found_config) { 988 if (!smp_found_config) {
989 printk(KERN_NOTICE "SMP motherboard not detected.\n"); 989 printk(KERN_NOTICE "SMP motherboard not detected.\n");
990 disable_smp(); 990 disable_smp();
991 if (APIC_init_uniprocessor()) 991 if (APIC_init_uniprocessor())
992 printk(KERN_NOTICE "Local APIC not detected." 992 printk(KERN_NOTICE "Local APIC not detected."
993 " Using dummy APIC emulation.\n"); 993 " Using dummy APIC emulation.\n");
994 return -1; 994 return -1;
995 } 995 }
996 996
997 /* 997 /*
998 * Should not be necessary because the MP table should list the boot 998 * Should not be necessary because the MP table should list the boot
999 * CPU too, but we do it for the sake of robustness anyway. 999 * CPU too, but we do it for the sake of robustness anyway.
1000 */ 1000 */
1001 if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) { 1001 if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
1002 printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n", 1002 printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
1003 boot_cpu_id); 1003 boot_cpu_id);
1004 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 1004 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1005 } 1005 }
1006 1006
1007 /* 1007 /*
1008 * If we couldn't find a local APIC, then get out of here now! 1008 * If we couldn't find a local APIC, then get out of here now!
1009 */ 1009 */
1010 if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) { 1010 if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) {
1011 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 1011 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1012 boot_cpu_id); 1012 boot_cpu_id);
1013 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); 1013 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
1014 nr_ioapics = 0; 1014 nr_ioapics = 0;
1015 return -1; 1015 return -1;
1016 } 1016 }
1017 1017
1018 /* 1018 /*
1019 * If SMP should be disabled, then really disable it! 1019 * If SMP should be disabled, then really disable it!
1020 */ 1020 */
1021 if (!max_cpus) { 1021 if (!max_cpus) {
1022 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); 1022 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
1023 nr_ioapics = 0; 1023 nr_ioapics = 0;
1024 return -1; 1024 return -1;
1025 } 1025 }
1026 1026
1027 return 0; 1027 return 0;
1028 } 1028 }
1029 1029
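smp_sanity_check() above mostly verifies that the boot CPU appears in phys_cpu_present_map and that a usable local APIC exists before bringup continues. A toy version of the "force the boot CPU into the present map" step, using a plain bitmask in place of physid_mask_t (illustrative only; nothing here is the kernel API):

#include <stdio.h>

static unsigned long phys_present_map;          /* toy stand-in for phys_cpu_present_map */

static int  physid_isset_toy(int id, unsigned long map)  { return (map >> id) & 1UL; }
static void physid_set_toy(int id, unsigned long *map)   { *map |= 1UL << id; }

int main(void)
{
        int boot_apicid = 3;                     /* assumed boot CPU APIC id */

        phys_present_map = 0x3;                  /* firmware only listed APIC ids 0 and 1 */

        if (!physid_isset_toy(boot_apicid, phys_present_map)) {
                printf("weird, boot CPU (#%d) not listed by the BIOS.\n", boot_apicid);
                physid_set_toy(boot_apicid, &phys_present_map);
        }
        printf("present map now: %#lx\n", phys_present_map);
        return 0;
}
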
1030 /* 1030 /*
1031 * Prepare for SMP bootup. The MP table or ACPI has been read 1031 * Prepare for SMP bootup. The MP table or ACPI has been read
1032 * earlier. Just do some sanity checking here and enable APIC mode. 1032 * earlier. Just do some sanity checking here and enable APIC mode.
1033 */ 1033 */
1034 void __init smp_prepare_cpus(unsigned int max_cpus) 1034 void __init smp_prepare_cpus(unsigned int max_cpus)
1035 { 1035 {
1036 nmi_watchdog_default(); 1036 nmi_watchdog_default();
1037 current_cpu_data = boot_cpu_data; 1037 current_cpu_data = boot_cpu_data;
1038 current_thread_info()->cpu = 0; /* needed? */ 1038 current_thread_info()->cpu = 0; /* needed? */
1039 set_cpu_sibling_map(0); 1039 set_cpu_sibling_map(0);
1040 1040
1041 if (smp_sanity_check(max_cpus) < 0) { 1041 if (smp_sanity_check(max_cpus) < 0) {
1042 printk(KERN_INFO "SMP disabled\n"); 1042 printk(KERN_INFO "SMP disabled\n");
1043 disable_smp(); 1043 disable_smp();
1044 return; 1044 return;
1045 } 1045 }
1046 1046
1047 1047
1048 /* 1048 /*
1049 * Switch from PIC to APIC mode. 1049 * Switch from PIC to APIC mode.
1050 */ 1050 */
1051 connect_bsp_APIC(); 1051 connect_bsp_APIC();
1052 setup_local_APIC(); 1052 setup_local_APIC();
1053 1053
1054 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { 1054 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
1055 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 1055 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1056 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); 1056 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
1057 /* Or can we switch back to PIC here? */ 1057 /* Or can we switch back to PIC here? */
1058 } 1058 }
1059 1059
1060 /* 1060 /*
1061 * Now start the IO-APICs 1061 * Now start the IO-APICs
1062 */ 1062 */
1063 if (!skip_ioapic_setup && nr_ioapics) 1063 if (!skip_ioapic_setup && nr_ioapics)
1064 setup_IO_APIC(); 1064 setup_IO_APIC();
1065 else 1065 else
1066 nr_ioapics = 0; 1066 nr_ioapics = 0;
1067 1067
1068 /* 1068 /*
1069 * Set up local APIC timer on boot CPU. 1069 * Set up local APIC timer on boot CPU.
1070 */ 1070 */
1071 1071
1072 setup_boot_APIC_clock(); 1072 setup_boot_APIC_clock();
1073 } 1073 }
1074 1074
1075 /* 1075 /*
1076 * Early setup to make printk work. 1076 * Early setup to make printk work.
1077 */ 1077 */
1078 void __init smp_prepare_boot_cpu(void) 1078 void __init smp_prepare_boot_cpu(void)
1079 { 1079 {
1080 int me = smp_processor_id(); 1080 int me = smp_processor_id();
1081 cpu_set(me, cpu_online_map); 1081 cpu_set(me, cpu_online_map);
1082 cpu_set(me, cpu_callout_map); 1082 cpu_set(me, cpu_callout_map);
1083 per_cpu(cpu_state, me) = CPU_ONLINE; 1083 per_cpu(cpu_state, me) = CPU_ONLINE;
1084 } 1084 }
1085 1085
1086 /* 1086 /*
1087 * Entry point to boot a CPU. 1087 * Entry point to boot a CPU.
1088 */ 1088 */
1089 int __cpuinit __cpu_up(unsigned int cpu) 1089 int __cpuinit __cpu_up(unsigned int cpu)
1090 { 1090 {
1091 int err; 1091 int err;
1092 int apicid = cpu_present_to_apicid(cpu); 1092 int apicid = cpu_present_to_apicid(cpu);
1093 1093
1094 WARN_ON(irqs_disabled()); 1094 WARN_ON(irqs_disabled());
1095 1095
1096 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); 1096 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu);
1097 1097
1098 if (apicid == BAD_APICID || apicid == boot_cpu_id || 1098 if (apicid == BAD_APICID || apicid == boot_cpu_id ||
1099 !physid_isset(apicid, phys_cpu_present_map)) { 1099 !physid_isset(apicid, phys_cpu_present_map)) {
1100 printk("__cpu_up: bad cpu %d\n", cpu); 1100 printk("__cpu_up: bad cpu %d\n", cpu);
1101 return -EINVAL; 1101 return -EINVAL;
1102 } 1102 }
1103 1103
1104 /* 1104 /*
1105 * Already booted CPU? 1105 * Already booted CPU?
1106 */ 1106 */
1107 if (cpu_isset(cpu, cpu_callin_map)) { 1107 if (cpu_isset(cpu, cpu_callin_map)) {
1108 Dprintk("do_boot_cpu %d Already started\n", cpu); 1108 Dprintk("do_boot_cpu %d Already started\n", cpu);
1109 return -ENOSYS; 1109 return -ENOSYS;
1110 } 1110 }
1111 1111
1112 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 1112 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
1113 /* Boot it! */ 1113 /* Boot it! */
1114 err = do_boot_cpu(cpu, apicid); 1114 err = do_boot_cpu(cpu, apicid);
1115 if (err < 0) { 1115 if (err < 0) {
1116 Dprintk("do_boot_cpu failed %d\n", err); 1116 Dprintk("do_boot_cpu failed %d\n", err);
1117 return err; 1117 return err;
1118 } 1118 }
1119 1119
1120 /* Unleash the CPU! */ 1120 /* Unleash the CPU! */
1121 Dprintk("waiting for cpu %d\n", cpu); 1121 Dprintk("waiting for cpu %d\n", cpu);
1122 1122
1123 while (!cpu_isset(cpu, cpu_online_map)) 1123 while (!cpu_isset(cpu, cpu_online_map))
1124 cpu_relax(); 1124 cpu_relax();
1125 err = 0; 1125 err = 0;
1126 1126
1127 return err; 1127 return err;
1128 } 1128 }
1129 1129
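__cpu_up() above kicks the target processor via do_boot_cpu() and then simply spins until the new CPU sets its bit in cpu_online_map. A user-space analogue of that hand-off, with a pthread standing in for the secondary CPU and an atomic flag standing in for the online map (purely illustrative; build with cc -pthread):

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int cpu_online;            /* toy stand-in for the cpu_online_map bit */

static void *secondary_start(void *arg)
{
        (void)arg;
        usleep(10000);                   /* pretend to do early per-CPU setup */
        atomic_store(&cpu_online, 1);    /* the "cpu_set(cpu, cpu_online_map)" moment */
        return NULL;
}

int main(void)
{
        pthread_t tid;

        if (pthread_create(&tid, NULL, secondary_start, NULL) != 0) {
                perror("pthread_create");        /* analogous to do_boot_cpu() failing */
                return 1;
        }

        /* wait for the "CPU" to come online, yielding instead of cpu_relax() */
        while (!atomic_load(&cpu_online))
                sched_yield();

        printf("secondary is online\n");
        pthread_join(tid, NULL);
        return 0;
}
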
1130 /* 1130 /*
1131 * Finish the SMP boot. 1131 * Finish the SMP boot.
1132 */ 1132 */
1133 void __init smp_cpus_done(unsigned int max_cpus) 1133 void __init smp_cpus_done(unsigned int max_cpus)
1134 { 1134 {
1135 smp_cleanup_boot(); 1135 smp_cleanup_boot();
1136 1136
1137 #ifdef CONFIG_X86_IO_APIC 1137 #ifdef CONFIG_X86_IO_APIC
1138 setup_ioapic_dest(); 1138 setup_ioapic_dest();
1139 #endif 1139 #endif
1140 1140
1141 time_init_gtod(); 1141 time_init_gtod();
1142 1142
1143 check_nmi_watchdog(); 1143 check_nmi_watchdog();
1144 } 1144 }
1145 1145
1146 #ifdef CONFIG_HOTPLUG_CPU 1146 #ifdef CONFIG_HOTPLUG_CPU
1147 1147
1148 static void remove_siblinginfo(int cpu) 1148 static void remove_siblinginfo(int cpu)
1149 { 1149 {
1150 int sibling; 1150 int sibling;
1151 struct cpuinfo_x86 *c = cpu_data; 1151 struct cpuinfo_x86 *c = cpu_data;
1152 1152
1153 for_each_cpu_mask(sibling, cpu_core_map[cpu]) { 1153 for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
1154 cpu_clear(cpu, cpu_core_map[sibling]); 1154 cpu_clear(cpu, cpu_core_map[sibling]);
1155 /* 1155 /*
1156 * last thread sibling in this cpu core going down 1156 * last thread sibling in this cpu core going down
1157 */ 1157 */
1158 if (cpus_weight(cpu_sibling_map[cpu]) == 1) 1158 if (cpus_weight(cpu_sibling_map[cpu]) == 1)
1159 c[sibling].booted_cores--; 1159 c[sibling].booted_cores--;
1160 } 1160 }
1161 1161
1162 for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) 1162 for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
1163 cpu_clear(cpu, cpu_sibling_map[sibling]); 1163 cpu_clear(cpu, cpu_sibling_map[sibling]);
1164 cpus_clear(cpu_sibling_map[cpu]); 1164 cpus_clear(cpu_sibling_map[cpu]);
1165 cpus_clear(cpu_core_map[cpu]); 1165 cpus_clear(cpu_core_map[cpu]);
1166 phys_proc_id[cpu] = BAD_APICID; 1166 phys_proc_id[cpu] = BAD_APICID;
1167 cpu_core_id[cpu] = BAD_APICID; 1167 cpu_core_id[cpu] = BAD_APICID;
1168 cpu_clear(cpu, cpu_sibling_setup_map); 1168 cpu_clear(cpu, cpu_sibling_setup_map);
1169 } 1169 }
1170 1170
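remove_siblinginfo() above walks the departing CPU's core map, drops the CPU from every sibling's masks, and decrements booted_cores when the last thread of a core goes away. A compact sketch of the same bookkeeping on plain bitmasks; the *_toy arrays are invented for the example and are not the kernel's cpu_sibling_map/cpu_core_map:

#include <stdio.h>

#define TOY_NR_CPUS 4

static unsigned sibling_map[TOY_NR_CPUS];   /* threads sharing a core    */
static unsigned core_map[TOY_NR_CPUS];      /* CPUs sharing a package    */
static int      booted_cores[TOY_NR_CPUS];  /* cores booted, per package */

static int weight(unsigned mask)
{
        int n = 0;
        while (mask) { n += mask & 1; mask >>= 1; }
        return n;
}

static void remove_sibling_info_toy(int cpu)
{
        for (int s = 0; s < TOY_NR_CPUS; s++) {
                if (!(core_map[cpu] & (1u << s)))
                        continue;
                core_map[s] &= ~(1u << cpu);
                /* last thread sibling in this core going down */
                if (weight(sibling_map[cpu]) == 1)
                        booted_cores[s]--;
        }
        for (int s = 0; s < TOY_NR_CPUS; s++)
                if (sibling_map[cpu] & (1u << s))
                        sibling_map[s] &= ~(1u << cpu);
        sibling_map[cpu] = 0;
        core_map[cpu] = 0;
}

int main(void)
{
        /* two packages of two single-thread cores: {0,1} and {2,3} */
        for (int c = 0; c < TOY_NR_CPUS; c++) {
                sibling_map[c] = 1u << c;
                core_map[c] = (c < 2) ? 0x3u : 0xcu;
                booted_cores[c] = 2;
        }
        remove_sibling_info_toy(3);
        printf("core_map[2]=%#x booted_cores[2]=%d\n", core_map[2], booted_cores[2]);
        return 0;
}
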
1171 void remove_cpu_from_maps(void) 1171 void remove_cpu_from_maps(void)
1172 { 1172 {
1173 int cpu = smp_processor_id(); 1173 int cpu = smp_processor_id();
1174 1174
1175 cpu_clear(cpu, cpu_callout_map); 1175 cpu_clear(cpu, cpu_callout_map);
1176 cpu_clear(cpu, cpu_callin_map); 1176 cpu_clear(cpu, cpu_callin_map);
1177 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ 1177 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
1178 } 1178 }
1179 1179
1180 int __cpu_disable(void) 1180 int __cpu_disable(void)
1181 { 1181 {
1182 int cpu = smp_processor_id(); 1182 int cpu = smp_processor_id();
1183 1183
1184 /* 1184 /*
1185 * Perhaps use cpufreq to drop frequency, but that could go 1185 * Perhaps use cpufreq to drop frequency, but that could go
1186 * into generic code. 1186 * into generic code.
1187 * 1187 *
1188 * We won't take down the boot processor on i386 due to some 1188 * We won't take down the boot processor on i386 due to some
1189 * interrupts only being able to be serviced by the BSP. 1189 * interrupts only being able to be serviced by the BSP.
1190 * Especially so if we're not using an IOAPIC -zwane 1190 * Especially so if we're not using an IOAPIC -zwane
1191 */ 1191 */
1192 if (cpu == 0) 1192 if (cpu == 0)
1193 return -EBUSY; 1193 return -EBUSY;
1194 1194
1195 clear_local_APIC(); 1195 clear_local_APIC();
1196 1196
1197 /* 1197 /*
1198 * HACK: 1198 * HACK:
1199 * Allow any queued timer interrupts to get serviced 1199 * Allow any queued timer interrupts to get serviced
1200 * This is only a temporary solution until we cleanup 1200 * This is only a temporary solution until we cleanup
1201 * fixup_irqs as we do for IA64. 1201 * fixup_irqs as we do for IA64.
1202 */ 1202 */
1203 local_irq_enable(); 1203 local_irq_enable();
1204 mdelay(1); 1204 mdelay(1);
1205 1205
1206 local_irq_disable(); 1206 local_irq_disable();
1207 remove_siblinginfo(cpu); 1207 remove_siblinginfo(cpu);
1208 1208
1209 /* It's now safe to remove this processor from the online map */ 1209 /* It's now safe to remove this processor from the online map */
1210 cpu_clear(cpu, cpu_online_map); 1210 cpu_clear(cpu, cpu_online_map);
1211 remove_cpu_from_maps(); 1211 remove_cpu_from_maps();
1212 fixup_irqs(cpu_online_map); 1212 fixup_irqs(cpu_online_map);
1213 return 0; 1213 return 0;
1214 } 1214 }
1215 1215
1216 void __cpu_die(unsigned int cpu) 1216 void __cpu_die(unsigned int cpu)
1217 { 1217 {
1218 /* We don't do anything here: idle task is faking death itself. */ 1218 /* We don't do anything here: idle task is faking death itself. */
1219 unsigned int i; 1219 unsigned int i;
1220 1220
1221 for (i = 0; i < 10; i++) { 1221 for (i = 0; i < 10; i++) {
1222 /* They ack this in play_dead by setting CPU_DEAD */ 1222 /* They ack this in play_dead by setting CPU_DEAD */
1223 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1223 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1224 printk ("CPU %d is now offline\n", cpu); 1224 printk ("CPU %d is now offline\n", cpu);
1225 return; 1225 return;
1226 } 1226 }
1227 msleep(100); 1227 msleep(100);
1228 } 1228 }
1229 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1229 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1230 } 1230 }
1231 1231
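__cpu_die() above tears nothing down itself; it polls per_cpu(cpu_state, cpu) up to ten times, 100 ms apart, waiting for the dying CPU's idle task to advertise CPU_DEAD from play_dead(). The bounded-poll pattern in isolation; cpu_state_of() is a made-up placeholder, not the kernel per-CPU read:

#include <stdio.h>
#include <unistd.h>

enum { CPU_ONLINE, CPU_UP_PREPARE, CPU_DEAD };

/* made-up placeholder: in the kernel this is per_cpu(cpu_state, cpu) */
static int cpu_state_of(unsigned int cpu)
{
        static int polls;
        (void)cpu;
        return ++polls >= 3 ? CPU_DEAD : CPU_ONLINE;   /* "dies" on the third poll */
}

static void wait_for_cpu_death(unsigned int cpu)
{
        for (int i = 0; i < 10; i++) {
                if (cpu_state_of(cpu) == CPU_DEAD) {
                        printf("CPU %u is now offline\n", cpu);
                        return;
                }
                usleep(100 * 1000);             /* msleep(100) analogue */
        }
        printf("CPU %u didn't die...\n", cpu);
}

int main(void)
{
        wait_for_cpu_death(1);
        return 0;
}
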
1232 static __init int setup_additional_cpus(char *s) 1232 static __init int setup_additional_cpus(char *s)
1233 { 1233 {
1234 return get_option(&s, &additional_cpus); 1234 return get_option(&s, &additional_cpus);
1235 } 1235 }
1236 __setup("additional_cpus=", setup_additional_cpus); 1236 __setup("additional_cpus=", setup_additional_cpus);
1237 1237
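The __setup() hook above just hands whatever follows "additional_cpus=" to get_option(). Outside the kernel the same parse is little more than strtol(); a sketch, where get_option_toy() is not the kernel helper:

#include <stdio.h>
#include <stdlib.h>

/* crude stand-in for get_option(): parse one integer, report whether digits were found */
static int get_option_toy(const char *s, int *val)
{
        char *end;
        long v = strtol(s, &end, 0);

        if (end == s)
                return 0;               /* no number present */
        *val = (int)v;
        return 1;
}

int main(void)
{
        int additional_cpus = -1;

        if (get_option_toy("2", &additional_cpus))      /* as if "additional_cpus=2" */
                printf("additional_cpus = %d\n", additional_cpus);
        return 0;
}
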
1238 #else /* ... !CONFIG_HOTPLUG_CPU */ 1238 #else /* ... !CONFIG_HOTPLUG_CPU */
1239 1239
1240 int __cpu_disable(void) 1240 int __cpu_disable(void)
1241 { 1241 {
1242 return -ENOSYS; 1242 return -ENOSYS;
1243 } 1243 }
1244 1244
1245 void __cpu_die(unsigned int cpu) 1245 void __cpu_die(unsigned int cpu)
1246 { 1246 {
1247 /* We said "no" in __cpu_disable */ 1247 /* We said "no" in __cpu_disable */
1248 BUG(); 1248 BUG();
1249 } 1249 }
1250 #endif /* CONFIG_HOTPLUG_CPU */ 1250 #endif /* CONFIG_HOTPLUG_CPU */
1251 1251
arch/x86_64/kernel/traps.c
1 /* 1 /*
2 * linux/arch/x86-64/traps.c 2 * linux/arch/x86-64/traps.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs 5 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
6 * 6 *
7 * Pentium III FXSR, SSE support 7 * Pentium III FXSR, SSE support
8 * Gareth Hughes <gareth@valinux.com>, May 2000 8 * Gareth Hughes <gareth@valinux.com>, May 2000
9 * 9 *
10 * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $ 10 * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $
11 */ 11 */
12 12
13 /* 13 /*
14 * 'Traps.c' handles hardware traps and faults after we have saved some 14 * 'Traps.c' handles hardware traps and faults after we have saved some
15 * state in 'entry.S'. 15 * state in 'entry.S'.
16 */ 16 */
17 #include <linux/config.h> 17 #include <linux/config.h>
18 #include <linux/sched.h> 18 #include <linux/sched.h>
19 #include <linux/kernel.h> 19 #include <linux/kernel.h>
20 #include <linux/string.h> 20 #include <linux/string.h>
21 #include <linux/errno.h> 21 #include <linux/errno.h>
22 #include <linux/ptrace.h> 22 #include <linux/ptrace.h>
23 #include <linux/timer.h> 23 #include <linux/timer.h>
24 #include <linux/mm.h> 24 #include <linux/mm.h>
25 #include <linux/init.h> 25 #include <linux/init.h>
26 #include <linux/delay.h> 26 #include <linux/delay.h>
27 #include <linux/spinlock.h> 27 #include <linux/spinlock.h>
28 #include <linux/interrupt.h> 28 #include <linux/interrupt.h>
29 #include <linux/module.h> 29 #include <linux/module.h>
30 #include <linux/moduleparam.h> 30 #include <linux/moduleparam.h>
31 #include <linux/nmi.h> 31 #include <linux/nmi.h>
32 #include <linux/kprobes.h> 32 #include <linux/kprobes.h>
33 33
34 #include <asm/system.h> 34 #include <asm/system.h>
35 #include <asm/uaccess.h> 35 #include <asm/uaccess.h>
36 #include <asm/io.h> 36 #include <asm/io.h>
37 #include <asm/atomic.h> 37 #include <asm/atomic.h>
38 #include <asm/debugreg.h> 38 #include <asm/debugreg.h>
39 #include <asm/desc.h> 39 #include <asm/desc.h>
40 #include <asm/i387.h> 40 #include <asm/i387.h>
41 #include <asm/kdebug.h> 41 #include <asm/kdebug.h>
42 #include <asm/processor.h> 42 #include <asm/processor.h>
43 43
44 #include <asm/smp.h> 44 #include <asm/smp.h>
45 #include <asm/pgalloc.h> 45 #include <asm/pgalloc.h>
46 #include <asm/pda.h> 46 #include <asm/pda.h>
47 #include <asm/proto.h> 47 #include <asm/proto.h>
48 #include <asm/nmi.h> 48 #include <asm/nmi.h>
49 49
50 extern struct gate_struct idt_table[256]; 50 extern struct gate_struct idt_table[256];
51 51
52 asmlinkage void divide_error(void); 52 asmlinkage void divide_error(void);
53 asmlinkage void debug(void); 53 asmlinkage void debug(void);
54 asmlinkage void nmi(void); 54 asmlinkage void nmi(void);
55 asmlinkage void int3(void); 55 asmlinkage void int3(void);
56 asmlinkage void overflow(void); 56 asmlinkage void overflow(void);
57 asmlinkage void bounds(void); 57 asmlinkage void bounds(void);
58 asmlinkage void invalid_op(void); 58 asmlinkage void invalid_op(void);
59 asmlinkage void device_not_available(void); 59 asmlinkage void device_not_available(void);
60 asmlinkage void double_fault(void); 60 asmlinkage void double_fault(void);
61 asmlinkage void coprocessor_segment_overrun(void); 61 asmlinkage void coprocessor_segment_overrun(void);
62 asmlinkage void invalid_TSS(void); 62 asmlinkage void invalid_TSS(void);
63 asmlinkage void segment_not_present(void); 63 asmlinkage void segment_not_present(void);
64 asmlinkage void stack_segment(void); 64 asmlinkage void stack_segment(void);
65 asmlinkage void general_protection(void); 65 asmlinkage void general_protection(void);
66 asmlinkage void page_fault(void); 66 asmlinkage void page_fault(void);
67 asmlinkage void coprocessor_error(void); 67 asmlinkage void coprocessor_error(void);
68 asmlinkage void simd_coprocessor_error(void); 68 asmlinkage void simd_coprocessor_error(void);
69 asmlinkage void reserved(void); 69 asmlinkage void reserved(void);
70 asmlinkage void alignment_check(void); 70 asmlinkage void alignment_check(void);
71 asmlinkage void machine_check(void); 71 asmlinkage void machine_check(void);
72 asmlinkage void spurious_interrupt_bug(void); 72 asmlinkage void spurious_interrupt_bug(void);
73 asmlinkage void call_debug(void);
74 73
75 struct notifier_block *die_chain; 74 struct notifier_block *die_chain;
76 static DEFINE_SPINLOCK(die_notifier_lock); 75 static DEFINE_SPINLOCK(die_notifier_lock);
77 76
78 int register_die_notifier(struct notifier_block *nb) 77 int register_die_notifier(struct notifier_block *nb)
79 { 78 {
80 int err = 0; 79 int err = 0;
81 unsigned long flags; 80 unsigned long flags;
82 spin_lock_irqsave(&die_notifier_lock, flags); 81 spin_lock_irqsave(&die_notifier_lock, flags);
83 err = notifier_chain_register(&die_chain, nb); 82 err = notifier_chain_register(&die_chain, nb);
84 spin_unlock_irqrestore(&die_notifier_lock, flags); 83 spin_unlock_irqrestore(&die_notifier_lock, flags);
85 return err; 84 return err;
86 } 85 }
87 86
88 static inline void conditional_sti(struct pt_regs *regs) 87 static inline void conditional_sti(struct pt_regs *regs)
89 { 88 {
90 if (regs->eflags & X86_EFLAGS_IF) 89 if (regs->eflags & X86_EFLAGS_IF)
91 local_irq_enable(); 90 local_irq_enable();
92 } 91 }
93 92
94 static int kstack_depth_to_print = 10; 93 static int kstack_depth_to_print = 10;
95 94
96 #ifdef CONFIG_KALLSYMS 95 #ifdef CONFIG_KALLSYMS
97 #include <linux/kallsyms.h> 96 #include <linux/kallsyms.h>
98 int printk_address(unsigned long address) 97 int printk_address(unsigned long address)
99 { 98 {
100 unsigned long offset = 0, symsize; 99 unsigned long offset = 0, symsize;
101 const char *symname; 100 const char *symname;
102 char *modname; 101 char *modname;
103 char *delim = ":"; 102 char *delim = ":";
104 char namebuf[128]; 103 char namebuf[128];
105 104
106 symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); 105 symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf);
107 if (!symname) 106 if (!symname)
108 return printk("[<%016lx>]", address); 107 return printk("[<%016lx>]", address);
109 if (!modname) 108 if (!modname)
110 modname = delim = ""; 109 modname = delim = "";
111 return printk("<%016lx>{%s%s%s%s%+ld}", 110 return printk("<%016lx>{%s%s%s%s%+ld}",
112 address,delim,modname,delim,symname,offset); 111 address,delim,modname,delim,symname,offset);
113 } 112 }
114 #else 113 #else
115 int printk_address(unsigned long address) 114 int printk_address(unsigned long address)
116 { 115 {
117 return printk("[<%016lx>]", address); 116 return printk("[<%016lx>]", address);
118 } 117 }
119 #endif 118 #endif
120 119
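printk_address() above has two flavours: with CONFIG_KALLSYMS it prints "<addr>{module:symbol+offset}", otherwise just the bare "[<addr>]". A user-space sketch of the formatted branch with a hard-coded lookup result, since kallsyms itself is not available here (the symbol name, offset and address are made up):

#include <stdio.h>

struct toy_sym {
        const char *modname;    /* NULL for built-in symbols */
        const char *name;       /* NULL if the lookup failed */
        unsigned long offset;
};

static int print_address_toy(unsigned long address, const struct toy_sym *sym)
{
        const char *delim = ":";
        const char *modname = sym->modname;

        if (!sym->name)
                return printf("[<%016lx>]", address);
        if (!modname)
                modname = delim = "";
        return printf("<%016lx>{%s%s%s%s%+ld}",
                      address, delim, modname, delim, sym->name, (long)sym->offset);
}

int main(void)
{
        struct toy_sym sym = { NULL, "do_debug", 0x42 };

        print_address_toy(0xffffffff8010f2a0UL, &sym);
        printf("\n");
        return 0;
}
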
121 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 120 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
122 unsigned *usedp, const char **idp) 121 unsigned *usedp, const char **idp)
123 { 122 {
124 static char ids[][8] = { 123 static char ids[][8] = {
125 [DEBUG_STACK - 1] = "#DB", 124 [DEBUG_STACK - 1] = "#DB",
126 [NMI_STACK - 1] = "NMI", 125 [NMI_STACK - 1] = "NMI",
127 [DOUBLEFAULT_STACK - 1] = "#DF", 126 [DOUBLEFAULT_STACK - 1] = "#DF",
128 [STACKFAULT_STACK - 1] = "#SS", 127 [STACKFAULT_STACK - 1] = "#SS",
129 [MCE_STACK - 1] = "#MC", 128 [MCE_STACK - 1] = "#MC",
130 #if DEBUG_STKSZ > EXCEPTION_STKSZ 129 #if DEBUG_STKSZ > EXCEPTION_STKSZ
131 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 130 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
132 #endif 131 #endif
133 }; 132 };
134 unsigned k; 133 unsigned k;
135 134
136 for (k = 0; k < N_EXCEPTION_STACKS; k++) { 135 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
137 unsigned long end; 136 unsigned long end;
138 137
139 switch (k + 1) { 138 switch (k + 1) {
140 #if DEBUG_STKSZ > EXCEPTION_STKSZ 139 #if DEBUG_STKSZ > EXCEPTION_STKSZ
141 case DEBUG_STACK: 140 case DEBUG_STACK:
142 end = cpu_pda[cpu].debugstack + DEBUG_STKSZ; 141 end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
143 break; 142 break;
144 #endif 143 #endif
145 default: 144 default:
146 end = per_cpu(init_tss, cpu).ist[k]; 145 end = per_cpu(init_tss, cpu).ist[k];
147 break; 146 break;
148 } 147 }
149 if (stack >= end) 148 if (stack >= end)
150 continue; 149 continue;
151 if (stack >= end - EXCEPTION_STKSZ) { 150 if (stack >= end - EXCEPTION_STKSZ) {
152 if (*usedp & (1U << k)) 151 if (*usedp & (1U << k))
153 break; 152 break;
154 *usedp |= 1U << k; 153 *usedp |= 1U << k;
155 *idp = ids[k]; 154 *idp = ids[k];
156 return (unsigned long *)end; 155 return (unsigned long *)end;
157 } 156 }
158 #if DEBUG_STKSZ > EXCEPTION_STKSZ 157 #if DEBUG_STKSZ > EXCEPTION_STKSZ
159 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { 158 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
160 unsigned j = N_EXCEPTION_STACKS - 1; 159 unsigned j = N_EXCEPTION_STACKS - 1;
161 160
162 do { 161 do {
163 ++j; 162 ++j;
164 end -= EXCEPTION_STKSZ; 163 end -= EXCEPTION_STKSZ;
165 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); 164 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
166 } while (stack < end - EXCEPTION_STKSZ); 165 } while (stack < end - EXCEPTION_STKSZ);
167 if (*usedp & (1U << j)) 166 if (*usedp & (1U << j))
168 break; 167 break;
169 *usedp |= 1U << j; 168 *usedp |= 1U << j;
170 *idp = ids[j]; 169 *idp = ids[j];
171 return (unsigned long *)end; 170 return (unsigned long *)end;
172 } 171 }
173 #endif 172 #endif
174 } 173 }
175 return NULL; 174 return NULL;
176 } 175 }
177 176
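in_exception_stack() above answers one question for the unwinder: does this stack pointer lie inside one of the per-CPU IST stacks, and if so, where does that stack end and what is it called? A simplified sketch over a small table of end/size/name entries, with the "used" bitmap preventing the walker from bouncing between the same stacks forever (the table values are made up):

#include <stdio.h>

struct toy_estack {
        unsigned long end;      /* one past the top of the stack */
        unsigned long size;
        const char *id;
};

/* made-up layout: three 4 KiB exception stacks */
static const struct toy_estack stacks[] = {
        { 0x11000, 0x1000, "#DB" },
        { 0x22000, 0x1000, "NMI" },
        { 0x33000, 0x1000, "#DF" },
};

static unsigned long find_exception_stack(unsigned long sp, unsigned *usedp,
                                          const char **idp)
{
        for (unsigned k = 0; k < sizeof(stacks) / sizeof(stacks[0]); k++) {
                unsigned long end = stacks[k].end;

                if (sp >= end || sp < end - stacks[k].size)
                        continue;
                if (*usedp & (1U << k))         /* already walked this stack */
                        break;
                *usedp |= 1U << k;
                *idp = stacks[k].id;
                return end;
        }
        return 0;
}

int main(void)
{
        unsigned used = 0;
        const char *id = NULL;
        unsigned long end = find_exception_stack(0x21f80, &used, &id);

        if (end)
                printf("on %s stack, ends at %#lx\n", id, end);
        return 0;
}
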
178 /* 177 /*
179 * x86-64 can have up to three kernel stacks: 178 * x86-64 can have up to three kernel stacks:
180 * process stack 179 * process stack
181 * interrupt stack 180 * interrupt stack
182 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 181 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
183 */ 182 */
184 183
185 void show_trace(unsigned long *stack) 184 void show_trace(unsigned long *stack)
186 { 185 {
187 unsigned long addr; 186 unsigned long addr;
188 const unsigned cpu = safe_smp_processor_id(); 187 const unsigned cpu = safe_smp_processor_id();
189 unsigned long *irqstack_end = (unsigned long *)cpu_pda[cpu].irqstackptr; 188 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
190 int i; 189 int i;
191 unsigned used = 0; 190 unsigned used = 0;
192 191
193 printk("\nCall Trace:"); 192 printk("\nCall Trace:");
194 193
195 #define HANDLE_STACK(cond) \ 194 #define HANDLE_STACK(cond) \
196 do while (cond) { \ 195 do while (cond) { \
197 addr = *stack++; \ 196 addr = *stack++; \
198 if (kernel_text_address(addr)) { \ 197 if (kernel_text_address(addr)) { \
199 /* \ 198 /* \
200 * If the address is either in the text segment of the \ 199 * If the address is either in the text segment of the \
201 * kernel, or in the region which contains vmalloc'ed \ 200 * kernel, or in the region which contains vmalloc'ed \
202 * memory, it *may* be the address of a calling \ 201 * memory, it *may* be the address of a calling \
203 * routine; if so, print it so that someone tracing \ 202 * routine; if so, print it so that someone tracing \
204 * down the cause of the crash will be able to figure \ 203 * down the cause of the crash will be able to figure \
205 * out the call path that was taken. \ 204 * out the call path that was taken. \
206 */ \ 205 */ \
207 i += printk_address(addr); \ 206 i += printk_address(addr); \
208 if (i > 50) { \ 207 if (i > 50) { \
209 printk("\n "); \ 208 printk("\n "); \
210 i = 0; \ 209 i = 0; \
211 } \ 210 } \
212 else \ 211 else \
213 i += printk(" "); \ 212 i += printk(" "); \
214 } \ 213 } \
215 } while (0) 214 } while (0)
216 215
217 for(i = 0; ; ) { 216 for(i = 0; ; ) {
218 const char *id; 217 const char *id;
219 unsigned long *estack_end; 218 unsigned long *estack_end;
220 estack_end = in_exception_stack(cpu, (unsigned long)stack, 219 estack_end = in_exception_stack(cpu, (unsigned long)stack,
221 &used, &id); 220 &used, &id);
222 221
223 if (estack_end) { 222 if (estack_end) {
224 i += printk(" <%s> ", id); 223 i += printk(" <%s> ", id);
225 HANDLE_STACK (stack < estack_end); 224 HANDLE_STACK (stack < estack_end);
226 i += printk(" <EOE> "); 225 i += printk(" <EOE> ");
227 stack = (unsigned long *) estack_end[-2]; 226 stack = (unsigned long *) estack_end[-2];
228 continue; 227 continue;
229 } 228 }
230 if (irqstack_end) { 229 if (irqstack_end) {
231 unsigned long *irqstack; 230 unsigned long *irqstack;
232 irqstack = irqstack_end - 231 irqstack = irqstack_end -
233 (IRQSTACKSIZE - 64) / sizeof(*irqstack); 232 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
234 233
235 if (stack >= irqstack && stack < irqstack_end) { 234 if (stack >= irqstack && stack < irqstack_end) {
236 i += printk(" <IRQ> "); 235 i += printk(" <IRQ> ");
237 HANDLE_STACK (stack < irqstack_end); 236 HANDLE_STACK (stack < irqstack_end);
238 stack = (unsigned long *) (irqstack_end[-1]); 237 stack = (unsigned long *) (irqstack_end[-1]);
239 irqstack_end = NULL; 238 irqstack_end = NULL;
240 i += printk(" <EOI> "); 239 i += printk(" <EOI> ");
241 continue; 240 continue;
242 } 241 }
243 } 242 }
244 break; 243 break;
245 } 244 }
246 245
247 HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); 246 HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
248 #undef HANDLE_STACK 247 #undef HANDLE_STACK
249 printk("\n"); 248 printk("\n");
250 } 249 }
251 250
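show_trace() above, and show_stack()/show_registers() below, are where this file stops indexing a cpu_pda[] array and goes through the cpu_pda(cpu)-> accessor instead. A toy model of why that indirection pays off: code written against the accessor keeps working when the backing storage moves from a flat array to per-entry pointers. Everything named *_toy here is invented for the sketch and is not the kernel definition:

#include <stdio.h>

struct toy_pda {
        unsigned long irqstackptr;
        int cpunumber;
};

#define TOY_NR_CPUS 2

/* today: a flat array ... */
static struct toy_pda pda_array[TOY_NR_CPUS];
/* ... tomorrow: per-CPU pointers that could live in node-local memory */
static struct toy_pda *pda_ptrs[TOY_NR_CPUS];

/* callers only ever use the accessor, so either backing store works */
#define cpu_pda_toy(cpu) (pda_ptrs[cpu] ? pda_ptrs[cpu] : &pda_array[cpu])

int main(void)
{
        static struct toy_pda relocated = { .irqstackptr = 0x2000, .cpunumber = 1 };

        pda_array[0].irqstackptr = 0x1000;
        pda_ptrs[1] = &relocated;        /* "moved" pda for CPU 1 */

        for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++)
                printf("cpu %d irqstackptr %#lx\n",
                       cpu, cpu_pda_toy(cpu)->irqstackptr);
        return 0;
}
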
252 void show_stack(struct task_struct *tsk, unsigned long * rsp) 251 void show_stack(struct task_struct *tsk, unsigned long * rsp)
253 { 252 {
254 unsigned long *stack; 253 unsigned long *stack;
255 int i; 254 int i;
256 const int cpu = safe_smp_processor_id(); 255 const int cpu = safe_smp_processor_id();
257 unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr); 256 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
258 unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE); 257 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
259 258
260 // debugging aid: "show_stack(NULL, NULL);" prints the 259 // debugging aid: "show_stack(NULL, NULL);" prints the
261 // back trace for this cpu. 260 // back trace for this cpu.
262 261
263 if (rsp == NULL) { 262 if (rsp == NULL) {
264 if (tsk) 263 if (tsk)
265 rsp = (unsigned long *)tsk->thread.rsp; 264 rsp = (unsigned long *)tsk->thread.rsp;
266 else 265 else
267 rsp = (unsigned long *)&rsp; 266 rsp = (unsigned long *)&rsp;
268 } 267 }
269 268
270 stack = rsp; 269 stack = rsp;
271 for(i=0; i < kstack_depth_to_print; i++) { 270 for(i=0; i < kstack_depth_to_print; i++) {
272 if (stack >= irqstack && stack <= irqstack_end) { 271 if (stack >= irqstack && stack <= irqstack_end) {
273 if (stack == irqstack_end) { 272 if (stack == irqstack_end) {
274 stack = (unsigned long *) (irqstack_end[-1]); 273 stack = (unsigned long *) (irqstack_end[-1]);
275 printk(" <EOI> "); 274 printk(" <EOI> ");
276 } 275 }
277 } else { 276 } else {
278 if (((long) stack & (THREAD_SIZE-1)) == 0) 277 if (((long) stack & (THREAD_SIZE-1)) == 0)
279 break; 278 break;
280 } 279 }
281 if (i && ((i % 4) == 0)) 280 if (i && ((i % 4) == 0))
282 printk("\n "); 281 printk("\n ");
283 printk("%016lx ", *stack++); 282 printk("%016lx ", *stack++);
284 touch_nmi_watchdog(); 283 touch_nmi_watchdog();
285 } 284 }
286 show_trace((unsigned long *)rsp); 285 show_trace((unsigned long *)rsp);
287 } 286 }
288 287
289 /* 288 /*
290 * The architecture-independent dump_stack generator 289 * The architecture-independent dump_stack generator
291 */ 290 */
292 void dump_stack(void) 291 void dump_stack(void)
293 { 292 {
294 unsigned long dummy; 293 unsigned long dummy;
295 show_trace(&dummy); 294 show_trace(&dummy);
296 } 295 }
297 296
298 EXPORT_SYMBOL(dump_stack); 297 EXPORT_SYMBOL(dump_stack);
299 298
300 void show_registers(struct pt_regs *regs) 299 void show_registers(struct pt_regs *regs)
301 { 300 {
302 int i; 301 int i;
303 int in_kernel = !user_mode(regs); 302 int in_kernel = !user_mode(regs);
304 unsigned long rsp; 303 unsigned long rsp;
305 const int cpu = safe_smp_processor_id(); 304 const int cpu = safe_smp_processor_id();
306 struct task_struct *cur = cpu_pda[cpu].pcurrent; 305 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
307 306
308 rsp = regs->rsp; 307 rsp = regs->rsp;
309 308
310 printk("CPU %d ", cpu); 309 printk("CPU %d ", cpu);
311 __show_regs(regs); 310 __show_regs(regs);
312 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 311 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
313 cur->comm, cur->pid, cur->thread_info, cur); 312 cur->comm, cur->pid, cur->thread_info, cur);
314 313
315 /* 314 /*
316 * When in-kernel, we also print out the stack and code at the 315 * When in-kernel, we also print out the stack and code at the
317 * time of the fault.. 316 * time of the fault..
318 */ 317 */
319 if (in_kernel) { 318 if (in_kernel) {
320 319
321 printk("Stack: "); 320 printk("Stack: ");
322 show_stack(NULL, (unsigned long*)rsp); 321 show_stack(NULL, (unsigned long*)rsp);
323 322
324 printk("\nCode: "); 323 printk("\nCode: ");
325 if(regs->rip < PAGE_OFFSET) 324 if(regs->rip < PAGE_OFFSET)
326 goto bad; 325 goto bad;
327 326
328 for(i=0;i<20;i++) 327 for(i=0;i<20;i++)
329 { 328 {
330 unsigned char c; 329 unsigned char c;
331 if(__get_user(c, &((unsigned char*)regs->rip)[i])) { 330 if(__get_user(c, &((unsigned char*)regs->rip)[i])) {
332 bad: 331 bad:
333 printk(" Bad RIP value."); 332 printk(" Bad RIP value.");
334 break; 333 break;
335 } 334 }
336 printk("%02x ", c); 335 printk("%02x ", c);
337 } 336 }
338 } 337 }
339 printk("\n"); 338 printk("\n");
340 } 339 }
341 340
342 void handle_BUG(struct pt_regs *regs) 341 void handle_BUG(struct pt_regs *regs)
343 { 342 {
344 struct bug_frame f; 343 struct bug_frame f;
345 char tmp; 344 char tmp;
346 345
347 if (user_mode(regs)) 346 if (user_mode(regs))
348 return; 347 return;
349 if (__copy_from_user(&f, (struct bug_frame *) regs->rip, 348 if (__copy_from_user(&f, (struct bug_frame *) regs->rip,
350 sizeof(struct bug_frame))) 349 sizeof(struct bug_frame)))
351 return; 350 return;
352 if (f.filename >= 0 || 351 if (f.filename >= 0 ||
353 f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) 352 f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
354 return; 353 return;
355 if (__get_user(tmp, (char *)(long)f.filename)) 354 if (__get_user(tmp, (char *)(long)f.filename))
356 f.filename = (int)(long)"unmapped filename"; 355 f.filename = (int)(long)"unmapped filename";
357 printk("----------- [cut here ] --------- [please bite here ] ---------\n"); 356 printk("----------- [cut here ] --------- [please bite here ] ---------\n");
358 printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", (char *)(long)f.filename, f.line); 357 printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", (char *)(long)f.filename, f.line);
359 } 358 }
360 359
361 #ifdef CONFIG_BUG 360 #ifdef CONFIG_BUG
362 void out_of_line_bug(void) 361 void out_of_line_bug(void)
363 { 362 {
364 BUG(); 363 BUG();
365 } 364 }
366 #endif 365 #endif
367 366
368 static DEFINE_SPINLOCK(die_lock); 367 static DEFINE_SPINLOCK(die_lock);
369 static int die_owner = -1; 368 static int die_owner = -1;
370 369
371 unsigned long oops_begin(void) 370 unsigned long oops_begin(void)
372 { 371 {
373 int cpu = safe_smp_processor_id(); 372 int cpu = safe_smp_processor_id();
374 unsigned long flags; 373 unsigned long flags;
375 374
376 /* racy, but better than risking deadlock. */ 375 /* racy, but better than risking deadlock. */
377 local_irq_save(flags); 376 local_irq_save(flags);
378 if (!spin_trylock(&die_lock)) { 377 if (!spin_trylock(&die_lock)) {
379 if (cpu == die_owner) 378 if (cpu == die_owner)
380 /* nested oops. should stop eventually */; 379 /* nested oops. should stop eventually */;
381 else 380 else
382 spin_lock(&die_lock); 381 spin_lock(&die_lock);
383 } 382 }
384 die_owner = cpu; 383 die_owner = cpu;
385 console_verbose(); 384 console_verbose();
386 bust_spinlocks(1); 385 bust_spinlocks(1);
387 return flags; 386 return flags;
388 } 387 }
389 388
390 void oops_end(unsigned long flags) 389 void oops_end(unsigned long flags)
391 { 390 {
392 die_owner = -1; 391 die_owner = -1;
393 bust_spinlocks(0); 392 bust_spinlocks(0);
394 spin_unlock_irqrestore(&die_lock, flags); 393 spin_unlock_irqrestore(&die_lock, flags);
395 if (panic_on_oops) 394 if (panic_on_oops)
396 panic("Oops"); 395 panic("Oops");
397 } 396 }
398 397
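oops_begin()/oops_end() above serialize oops output with a spinlock but deliberately tolerate a nested oops on the same CPU: if the trylock fails and this CPU already owns the lock, the code barrels on rather than deadlocking. The same shape in user space, with a pthread mutex and a thread id as the "owner"; this is illustrative only (the kernel version also disables interrupts and bumps console verbosity). Build with cc -pthread:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t die_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t die_owner;
static int die_owner_valid;

static void oops_begin_toy(void)
{
        if (pthread_mutex_trylock(&die_lock) != 0) {
                if (die_owner_valid && pthread_equal(die_owner, pthread_self())) {
                        /* nested oops on the same "CPU": keep going, don't deadlock */
                } else {
                        pthread_mutex_lock(&die_lock);
                }
        }
        die_owner = pthread_self();
        die_owner_valid = 1;
}

static void oops_end_toy(void)
{
        die_owner_valid = 0;
        pthread_mutex_unlock(&die_lock);
}

int main(void)
{
        oops_begin_toy();
        printf("first oops: lock taken\n");
        oops_begin_toy();               /* nested: trylock fails, but we own the lock */
        printf("nested oops: carried on without deadlocking\n");
        oops_end_toy();
        return 0;
}
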
399 void __die(const char * str, struct pt_regs * regs, long err) 398 void __die(const char * str, struct pt_regs * regs, long err)
400 { 399 {
401 static int die_counter; 400 static int die_counter;
402 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); 401 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
403 #ifdef CONFIG_PREEMPT 402 #ifdef CONFIG_PREEMPT
404 printk("PREEMPT "); 403 printk("PREEMPT ");
405 #endif 404 #endif
406 #ifdef CONFIG_SMP 405 #ifdef CONFIG_SMP
407 printk("SMP "); 406 printk("SMP ");
408 #endif 407 #endif
409 #ifdef CONFIG_DEBUG_PAGEALLOC 408 #ifdef CONFIG_DEBUG_PAGEALLOC
410 printk("DEBUG_PAGEALLOC"); 409 printk("DEBUG_PAGEALLOC");
411 #endif 410 #endif
412 printk("\n"); 411 printk("\n");
413 notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); 412 notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
414 show_registers(regs); 413 show_registers(regs);
415 /* Executive summary in case the oops scrolled away */ 414 /* Executive summary in case the oops scrolled away */
416 printk(KERN_ALERT "RIP "); 415 printk(KERN_ALERT "RIP ");
417 printk_address(regs->rip); 416 printk_address(regs->rip);
418 printk(" RSP <%016lx>\n", regs->rsp); 417 printk(" RSP <%016lx>\n", regs->rsp);
419 } 418 }
420 419
421 void die(const char * str, struct pt_regs * regs, long err) 420 void die(const char * str, struct pt_regs * regs, long err)
422 { 421 {
423 unsigned long flags = oops_begin(); 422 unsigned long flags = oops_begin();
424 423
425 handle_BUG(regs); 424 handle_BUG(regs);
426 __die(str, regs, err); 425 __die(str, regs, err);
427 oops_end(flags); 426 oops_end(flags);
428 do_exit(SIGSEGV); 427 do_exit(SIGSEGV);
429 } 428 }
430 429
431 void die_nmi(char *str, struct pt_regs *regs) 430 void die_nmi(char *str, struct pt_regs *regs)
432 { 431 {
433 unsigned long flags = oops_begin(); 432 unsigned long flags = oops_begin();
434 433
435 /* 434 /*
436 * We are in trouble anyway, let's at least try 435 * We are in trouble anyway, let's at least try
437 * to get a message out. 436 * to get a message out.
438 */ 437 */
439 printk(str, safe_smp_processor_id()); 438 printk(str, safe_smp_processor_id());
440 show_registers(regs); 439 show_registers(regs);
441 if (panic_on_timeout || panic_on_oops) 440 if (panic_on_timeout || panic_on_oops)
442 panic("nmi watchdog"); 441 panic("nmi watchdog");
443 printk("console shuts up ...\n"); 442 printk("console shuts up ...\n");
444 oops_end(flags); 443 oops_end(flags);
445 do_exit(SIGSEGV); 444 do_exit(SIGSEGV);
446 } 445 }
447 446
448 static void __kprobes do_trap(int trapnr, int signr, char *str, 447 static void __kprobes do_trap(int trapnr, int signr, char *str,
449 struct pt_regs * regs, long error_code, 448 struct pt_regs * regs, long error_code,
450 siginfo_t *info) 449 siginfo_t *info)
451 { 450 {
452 struct task_struct *tsk = current; 451 struct task_struct *tsk = current;
453 452
454 conditional_sti(regs); 453 conditional_sti(regs);
455 454
456 tsk->thread.error_code = error_code; 455 tsk->thread.error_code = error_code;
457 tsk->thread.trap_no = trapnr; 456 tsk->thread.trap_no = trapnr;
458 457
459 if (user_mode(regs)) { 458 if (user_mode(regs)) {
460 if (exception_trace && unhandled_signal(tsk, signr)) 459 if (exception_trace && unhandled_signal(tsk, signr))
461 printk(KERN_INFO 460 printk(KERN_INFO
462 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", 461 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
463 tsk->comm, tsk->pid, str, 462 tsk->comm, tsk->pid, str,
464 regs->rip,regs->rsp,error_code); 463 regs->rip,regs->rsp,error_code);
465 464
466 if (info) 465 if (info)
467 force_sig_info(signr, info, tsk); 466 force_sig_info(signr, info, tsk);
468 else 467 else
469 force_sig(signr, tsk); 468 force_sig(signr, tsk);
470 return; 469 return;
471 } 470 }
472 471
473 472
474 /* kernel trap */ 473 /* kernel trap */
475 { 474 {
476 const struct exception_table_entry *fixup; 475 const struct exception_table_entry *fixup;
477 fixup = search_exception_tables(regs->rip); 476 fixup = search_exception_tables(regs->rip);
478 if (fixup) { 477 if (fixup) {
479 regs->rip = fixup->fixup; 478 regs->rip = fixup->fixup;
480 } else 479 } else
481 die(str, regs, error_code); 480 die(str, regs, error_code);
482 return; 481 return;
483 } 482 }
484 } 483 }
485 484
486 #define DO_ERROR(trapnr, signr, str, name) \ 485 #define DO_ERROR(trapnr, signr, str, name) \
487 asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 486 asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
488 { \ 487 { \
489 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 488 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
490 == NOTIFY_STOP) \ 489 == NOTIFY_STOP) \
491 return; \ 490 return; \
492 do_trap(trapnr, signr, str, regs, error_code, NULL); \ 491 do_trap(trapnr, signr, str, regs, error_code, NULL); \
493 } 492 }
494 493
495 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 494 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
496 asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 495 asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
497 { \ 496 { \
498 siginfo_t info; \ 497 siginfo_t info; \
499 info.si_signo = signr; \ 498 info.si_signo = signr; \
500 info.si_errno = 0; \ 499 info.si_errno = 0; \
501 info.si_code = sicode; \ 500 info.si_code = sicode; \
502 info.si_addr = (void __user *)siaddr; \ 501 info.si_addr = (void __user *)siaddr; \
503 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 502 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
504 == NOTIFY_STOP) \ 503 == NOTIFY_STOP) \
505 return; \ 504 return; \
506 do_trap(trapnr, signr, str, regs, error_code, &info); \ 505 do_trap(trapnr, signr, str, regs, error_code, &info); \
507 } 506 }
508 507
509 DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) 508 DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
510 DO_ERROR( 4, SIGSEGV, "overflow", overflow) 509 DO_ERROR( 4, SIGSEGV, "overflow", overflow)
511 DO_ERROR( 5, SIGSEGV, "bounds", bounds) 510 DO_ERROR( 5, SIGSEGV, "bounds", bounds)
512 DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->rip) 511 DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->rip)
513 DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) 512 DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
514 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 513 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
515 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 514 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
516 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 515 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
517 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) 516 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
518 DO_ERROR(18, SIGSEGV, "reserved", reserved) 517 DO_ERROR(18, SIGSEGV, "reserved", reserved)
519 DO_ERROR(12, SIGBUS, "stack segment", stack_segment) 518 DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
520 519
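For reference, one entry from the list above, DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS), expands via the macro defined a few lines earlier to roughly the following (this is just the preprocessor result, not a standalone example):

asmlinkage void do_invalid_TSS(struct pt_regs *regs, long error_code)
{
        if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code, 10, SIGSEGV)
                                                        == NOTIFY_STOP)
                return;
        do_trap(10, SIGSEGV, "invalid TSS", regs, error_code, NULL);
}
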
521 asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) 520 asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
522 { 521 {
523 static const char str[] = "double fault"; 522 static const char str[] = "double fault";
524 struct task_struct *tsk = current; 523 struct task_struct *tsk = current;
525 524
526 /* Return not checked because double check cannot be ignored */ 525 /* Return not checked because double check cannot be ignored */
527 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); 526 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
528 527
529 tsk->thread.error_code = error_code; 528 tsk->thread.error_code = error_code;
530 tsk->thread.trap_no = 8; 529 tsk->thread.trap_no = 8;
531 530
532 /* This is always a kernel trap and never fixable (and thus must 531 /* This is always a kernel trap and never fixable (and thus must
533 never return). */ 532 never return). */
534 for (;;) 533 for (;;)
535 die(str, regs, error_code); 534 die(str, regs, error_code);
536 } 535 }
537 536
538 asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, 537 asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
539 long error_code) 538 long error_code)
540 { 539 {
541 struct task_struct *tsk = current; 540 struct task_struct *tsk = current;
542 541
543 conditional_sti(regs); 542 conditional_sti(regs);
544 543
545 tsk->thread.error_code = error_code; 544 tsk->thread.error_code = error_code;
546 tsk->thread.trap_no = 13; 545 tsk->thread.trap_no = 13;
547 546
548 if (user_mode(regs)) { 547 if (user_mode(regs)) {
549 if (exception_trace && unhandled_signal(tsk, SIGSEGV)) 548 if (exception_trace && unhandled_signal(tsk, SIGSEGV))
550 printk(KERN_INFO 549 printk(KERN_INFO
551 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", 550 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
552 tsk->comm, tsk->pid, 551 tsk->comm, tsk->pid,
553 regs->rip,regs->rsp,error_code); 552 regs->rip,regs->rsp,error_code);
554 553
555 force_sig(SIGSEGV, tsk); 554 force_sig(SIGSEGV, tsk);
556 return; 555 return;
557 } 556 }
558 557
559 /* kernel gp */ 558 /* kernel gp */
560 { 559 {
561 const struct exception_table_entry *fixup; 560 const struct exception_table_entry *fixup;
562 fixup = search_exception_tables(regs->rip); 561 fixup = search_exception_tables(regs->rip);
563 if (fixup) { 562 if (fixup) {
564 regs->rip = fixup->fixup; 563 regs->rip = fixup->fixup;
565 return; 564 return;
566 } 565 }
567 if (notify_die(DIE_GPF, "general protection fault", regs, 566 if (notify_die(DIE_GPF, "general protection fault", regs,
568 error_code, 13, SIGSEGV) == NOTIFY_STOP) 567 error_code, 13, SIGSEGV) == NOTIFY_STOP)
569 return; 568 return;
570 die("general protection fault", regs, error_code); 569 die("general protection fault", regs, error_code);
571 } 570 }
572 } 571 }
573 572
574 static void mem_parity_error(unsigned char reason, struct pt_regs * regs) 573 static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
575 { 574 {
576 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); 575 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
577 printk("You probably have a hardware problem with your RAM chips\n"); 576 printk("You probably have a hardware problem with your RAM chips\n");
578 577
579 /* Clear and disable the memory parity error line. */ 578 /* Clear and disable the memory parity error line. */
580 reason = (reason & 0xf) | 4; 579 reason = (reason & 0xf) | 4;
581 outb(reason, 0x61); 580 outb(reason, 0x61);
582 } 581 }
583 582
584 static void io_check_error(unsigned char reason, struct pt_regs * regs) 583 static void io_check_error(unsigned char reason, struct pt_regs * regs)
585 { 584 {
586 printk("NMI: IOCK error (debug interrupt?)\n"); 585 printk("NMI: IOCK error (debug interrupt?)\n");
587 show_registers(regs); 586 show_registers(regs);
588 587
589 /* Re-enable the IOCK line, wait for a few seconds */ 588 /* Re-enable the IOCK line, wait for a few seconds */
590 reason = (reason & 0xf) | 8; 589 reason = (reason & 0xf) | 8;
591 outb(reason, 0x61); 590 outb(reason, 0x61);
592 mdelay(2000); 591 mdelay(2000);
593 reason &= ~8; 592 reason &= ~8;
594 outb(reason, 0x61); 593 outb(reason, 0x61);
595 } 594 }
596 595
597 static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) 596 static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
598 { printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); 597 { printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
599 printk("Dazed and confused, but trying to continue\n"); 598 printk("Dazed and confused, but trying to continue\n");
600 printk("Do you have a strange power saving mode enabled?\n"); 599 printk("Do you have a strange power saving mode enabled?\n");
601 } 600 }
602 601
603 /* Runs on IST stack. This code must keep interrupts off all the time. 602 /* Runs on IST stack. This code must keep interrupts off all the time.
604 Nested NMIs are prevented by the CPU. */ 603 Nested NMIs are prevented by the CPU. */
605 asmlinkage void default_do_nmi(struct pt_regs *regs) 604 asmlinkage void default_do_nmi(struct pt_regs *regs)
606 { 605 {
607 unsigned char reason = 0; 606 unsigned char reason = 0;
608 int cpu; 607 int cpu;
609 608
610 cpu = smp_processor_id(); 609 cpu = smp_processor_id();
611 610
612 /* Only the BSP gets external NMIs from the system. */ 611 /* Only the BSP gets external NMIs from the system. */
613 if (!cpu) 612 if (!cpu)
614 reason = get_nmi_reason(); 613 reason = get_nmi_reason();
615 614
616 if (!(reason & 0xc0)) { 615 if (!(reason & 0xc0)) {
617 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) 616 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
618 == NOTIFY_STOP) 617 == NOTIFY_STOP)
619 return; 618 return;
620 #ifdef CONFIG_X86_LOCAL_APIC 619 #ifdef CONFIG_X86_LOCAL_APIC
621 /* 620 /*
622 * Ok, so this is none of the documented NMI sources, 621 * Ok, so this is none of the documented NMI sources,
623 * so it must be the NMI watchdog. 622 * so it must be the NMI watchdog.
624 */ 623 */
625 if (nmi_watchdog > 0) { 624 if (nmi_watchdog > 0) {
626 nmi_watchdog_tick(regs,reason); 625 nmi_watchdog_tick(regs,reason);
627 return; 626 return;
628 } 627 }
629 #endif 628 #endif
630 unknown_nmi_error(reason, regs); 629 unknown_nmi_error(reason, regs);
631 return; 630 return;
632 } 631 }
633 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 632 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
634 return; 633 return;
635 634
636 /* AK: following checks seem to be broken on modern chipsets. FIXME */ 635 /* AK: following checks seem to be broken on modern chipsets. FIXME */
637 636
638 if (reason & 0x80) 637 if (reason & 0x80)
639 mem_parity_error(reason, regs); 638 mem_parity_error(reason, regs);
640 if (reason & 0x40) 639 if (reason & 0x40)
641 io_check_error(reason, regs); 640 io_check_error(reason, regs);
642 } 641 }
643 642
644 /* runs on IST stack. */ 643 /* runs on IST stack. */
645 asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) 644 asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
646 { 645 {
647 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { 646 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
648 return; 647 return;
649 } 648 }
650 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); 649 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
651 return; 650 return;
652 } 651 }
653 652
654 /* Help handler running on IST stack to switch back to user stack 653 /* Help handler running on IST stack to switch back to user stack
655 for scheduling or signal handling. The actual stack switch is done in 654 for scheduling or signal handling. The actual stack switch is done in
656 entry.S */ 655 entry.S */
657 asmlinkage struct pt_regs *sync_regs(struct pt_regs *eregs) 656 asmlinkage struct pt_regs *sync_regs(struct pt_regs *eregs)
658 { 657 {
659 struct pt_regs *regs = eregs; 658 struct pt_regs *regs = eregs;
660 /* Did already sync */ 659 /* Did already sync */
661 if (eregs == (struct pt_regs *)eregs->rsp) 660 if (eregs == (struct pt_regs *)eregs->rsp)
662 ; 661 ;
663 /* Exception from user space */ 662 /* Exception from user space */
664 else if (user_mode(eregs)) 663 else if (user_mode(eregs))
665 regs = ((struct pt_regs *)current->thread.rsp0) - 1; 664 regs = ((struct pt_regs *)current->thread.rsp0) - 1;
666 /* Exception from kernel and interrupts are enabled. Move to 665 /* Exception from kernel and interrupts are enabled. Move to
667 kernel process stack. */ 666 kernel process stack. */
668 else if (eregs->eflags & X86_EFLAGS_IF) 667 else if (eregs->eflags & X86_EFLAGS_IF)
669 regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); 668 regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
670 if (eregs != regs) 669 if (eregs != regs)
671 *regs = *eregs; 670 *regs = *eregs;
672 return regs; 671 return regs;
673 } 672 }
674 673
675 /* runs on IST stack. */ 674 /* runs on IST stack. */
676 asmlinkage void __kprobes do_debug(struct pt_regs * regs, 675 asmlinkage void __kprobes do_debug(struct pt_regs * regs,
677 unsigned long error_code) 676 unsigned long error_code)
678 { 677 {
679 unsigned long condition; 678 unsigned long condition;
680 struct task_struct *tsk = current; 679 struct task_struct *tsk = current;
681 siginfo_t info; 680 siginfo_t info;
682 681
683 get_debugreg(condition, 6); 682 get_debugreg(condition, 6);
684 683
685 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 684 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
686 SIGTRAP) == NOTIFY_STOP) 685 SIGTRAP) == NOTIFY_STOP)
687 return; 686 return;
688 687
689 conditional_sti(regs); 688 conditional_sti(regs);
690 689
691 /* Mask out spurious debug traps due to lazy DR7 setting */ 690 /* Mask out spurious debug traps due to lazy DR7 setting */
692 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 691 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
693 if (!tsk->thread.debugreg7) { 692 if (!tsk->thread.debugreg7) {
694 goto clear_dr7; 693 goto clear_dr7;
695 } 694 }
696 } 695 }
697 696
698 tsk->thread.debugreg6 = condition; 697 tsk->thread.debugreg6 = condition;
699 698
700 /* Mask out spurious TF errors due to lazy TF clearing */ 699 /* Mask out spurious TF errors due to lazy TF clearing */
701 if (condition & DR_STEP) { 700 if (condition & DR_STEP) {
702 /* 701 /*
703 * The TF error should be masked out only if the current 702 * The TF error should be masked out only if the current
704 * process is not traced and if the TRAP flag has been set 703 * process is not traced and if the TRAP flag has been set
705 * previously by a tracing process (condition detected by 704 * previously by a tracing process (condition detected by
706 * the PT_DTRACE flag); remember that the i386 TRAP flag 705 * the PT_DTRACE flag); remember that the i386 TRAP flag
707 * can be modified by the process itself in user mode, 706 * can be modified by the process itself in user mode,
708 * allowing programs to debug themselves without the ptrace() 707 * allowing programs to debug themselves without the ptrace()
709 * interface. 708 * interface.
710 */ 709 */
711 if (!user_mode(regs)) 710 if (!user_mode(regs))
712 goto clear_TF_reenable; 711 goto clear_TF_reenable;
713 /* 712 /*
714 * Was the TF flag set by a debugger? If so, clear it now, 713 * Was the TF flag set by a debugger? If so, clear it now,
715 * so that register information is correct. 714 * so that register information is correct.
716 */ 715 */
717 if (tsk->ptrace & PT_DTRACE) { 716 if (tsk->ptrace & PT_DTRACE) {
718 regs->eflags &= ~TF_MASK; 717 regs->eflags &= ~TF_MASK;
719 tsk->ptrace &= ~PT_DTRACE; 718 tsk->ptrace &= ~PT_DTRACE;
720 } 719 }
721 } 720 }
722 721
723 /* Ok, finally something we can handle */ 722 /* Ok, finally something we can handle */
724 tsk->thread.trap_no = 1; 723 tsk->thread.trap_no = 1;
725 tsk->thread.error_code = error_code; 724 tsk->thread.error_code = error_code;
726 info.si_signo = SIGTRAP; 725 info.si_signo = SIGTRAP;
727 info.si_errno = 0; 726 info.si_errno = 0;
728 info.si_code = TRAP_BRKPT; 727 info.si_code = TRAP_BRKPT;
729 info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; 728 info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
730 force_sig_info(SIGTRAP, &info, tsk); 729 force_sig_info(SIGTRAP, &info, tsk);
731 730
732 clear_dr7: 731 clear_dr7:
733 set_debugreg(0UL, 7); 732 set_debugreg(0UL, 7);
734 return; 733 return;
735 734
736 clear_TF_reenable: 735 clear_TF_reenable:
737 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 736 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
738 regs->eflags &= ~TF_MASK; 737 regs->eflags &= ~TF_MASK;
739 } 738 }
740 739
741 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) 740 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
742 { 741 {
743 const struct exception_table_entry *fixup; 742 const struct exception_table_entry *fixup;
744 fixup = search_exception_tables(regs->rip); 743 fixup = search_exception_tables(regs->rip);
745 if (fixup) { 744 if (fixup) {
746 regs->rip = fixup->fixup; 745 regs->rip = fixup->fixup;
747 return 1; 746 return 1;
748 } 747 }
749 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); 748 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
750 /* Illegal floating point operation in the kernel */ 749 /* Illegal floating point operation in the kernel */
751 current->thread.trap_no = trapnr; 750 current->thread.trap_no = trapnr;
752 die(str, regs, 0); 751 die(str, regs, 0);
753 return 0; 752 return 0;
754 } 753 }
755 754
756 /* 755 /*
757 * Note that we play around with the 'TS' bit in an attempt to get 756 * Note that we play around with the 'TS' bit in an attempt to get
758 * the correct behaviour even in the presence of the asynchronous 757 * the correct behaviour even in the presence of the asynchronous
759 * IRQ13 behaviour 758 * IRQ13 behaviour
760 */ 759 */
761 asmlinkage void do_coprocessor_error(struct pt_regs *regs) 760 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
762 { 761 {
763 void __user *rip = (void __user *)(regs->rip); 762 void __user *rip = (void __user *)(regs->rip);
764 struct task_struct * task; 763 struct task_struct * task;
765 siginfo_t info; 764 siginfo_t info;
766 unsigned short cwd, swd; 765 unsigned short cwd, swd;
767 766
768 conditional_sti(regs); 767 conditional_sti(regs);
769 if (!user_mode(regs) && 768 if (!user_mode(regs) &&
770 kernel_math_error(regs, "kernel x87 math error", 16)) 769 kernel_math_error(regs, "kernel x87 math error", 16))
771 return; 770 return;
772 771
773 /* 772 /*
774 * Save the info for the exception handler and clear the error. 773 * Save the info for the exception handler and clear the error.
775 */ 774 */
776 task = current; 775 task = current;
777 save_init_fpu(task); 776 save_init_fpu(task);
778 task->thread.trap_no = 16; 777 task->thread.trap_no = 16;
779 task->thread.error_code = 0; 778 task->thread.error_code = 0;
780 info.si_signo = SIGFPE; 779 info.si_signo = SIGFPE;
781 info.si_errno = 0; 780 info.si_errno = 0;
782 info.si_code = __SI_FAULT; 781 info.si_code = __SI_FAULT;
783 info.si_addr = rip; 782 info.si_addr = rip;
784 /* 783 /*
785 * (~cwd & swd) will mask out exceptions that are not set to unmasked 784 * (~cwd & swd) will mask out exceptions that are not set to unmasked
786 * status. 0x3f is the exception bits in these regs, 0x200 is the 785 * status. 0x3f is the exception bits in these regs, 0x200 is the
787 * C1 reg you need in case of a stack fault, 0x040 is the stack 786 * C1 reg you need in case of a stack fault, 0x040 is the stack
788 * fault bit. We should only be taking one exception at a time, 787 * fault bit. We should only be taking one exception at a time,
789 * so if this combination doesn't produce any single exception, 788 * so if this combination doesn't produce any single exception,
790 * then we have a bad program that isn't synchronizing its FPU usage 789 * then we have a bad program that isn't synchronizing its FPU usage
791 * and it will suffer the consequences since we won't be able to 790 * and it will suffer the consequences since we won't be able to
792 * fully reproduce the context of the exception 791 * fully reproduce the context of the exception
793 */ 792 */
794 cwd = get_fpu_cwd(task); 793 cwd = get_fpu_cwd(task);
795 swd = get_fpu_swd(task); 794 swd = get_fpu_swd(task);
796 switch (swd & ~cwd & 0x3f) { 795 switch (swd & ~cwd & 0x3f) {
797 case 0x000: 796 case 0x000:
798 default: 797 default:
799 break; 798 break;
800 case 0x001: /* Invalid Op */ 799 case 0x001: /* Invalid Op */
801 /* 800 /*
802 * swd & 0x240 == 0x040: Stack Underflow 801 * swd & 0x240 == 0x040: Stack Underflow
803 * swd & 0x240 == 0x240: Stack Overflow 802 * swd & 0x240 == 0x240: Stack Overflow
804 * User must clear the SF bit (0x40) if set 803 * User must clear the SF bit (0x40) if set
805 */ 804 */
806 info.si_code = FPE_FLTINV; 805 info.si_code = FPE_FLTINV;
807 break; 806 break;
808 case 0x002: /* Denormalize */ 807 case 0x002: /* Denormalize */
809 case 0x010: /* Underflow */ 808 case 0x010: /* Underflow */
810 info.si_code = FPE_FLTUND; 809 info.si_code = FPE_FLTUND;
811 break; 810 break;
812 case 0x004: /* Zero Divide */ 811 case 0x004: /* Zero Divide */
813 info.si_code = FPE_FLTDIV; 812 info.si_code = FPE_FLTDIV;
814 break; 813 break;
815 case 0x008: /* Overflow */ 814 case 0x008: /* Overflow */
816 info.si_code = FPE_FLTOVF; 815 info.si_code = FPE_FLTOVF;
817 break; 816 break;
818 case 0x020: /* Precision */ 817 case 0x020: /* Precision */
819 info.si_code = FPE_FLTRES; 818 info.si_code = FPE_FLTRES;
820 break; 819 break;
821 } 820 }
822 force_sig_info(SIGFPE, &info, task); 821 force_sig_info(SIGFPE, &info, task);
823 } 822 }
824 823
825 asmlinkage void bad_intr(void) 824 asmlinkage void bad_intr(void)
826 { 825 {
827 printk("bad interrupt"); 826 printk("bad interrupt");
828 } 827 }
829 828
830 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) 829 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
831 { 830 {
832 void __user *rip = (void __user *)(regs->rip); 831 void __user *rip = (void __user *)(regs->rip);
833 struct task_struct * task; 832 struct task_struct * task;
834 siginfo_t info; 833 siginfo_t info;
835 unsigned short mxcsr; 834 unsigned short mxcsr;
836 835
837 conditional_sti(regs); 836 conditional_sti(regs);
838 if (!user_mode(regs) && 837 if (!user_mode(regs) &&
839 kernel_math_error(regs, "kernel simd math error", 19)) 838 kernel_math_error(regs, "kernel simd math error", 19))
840 return; 839 return;
841 840
842 /* 841 /*
843 * Save the info for the exception handler and clear the error. 842 * Save the info for the exception handler and clear the error.
844 */ 843 */
845 task = current; 844 task = current;
846 save_init_fpu(task); 845 save_init_fpu(task);
847 task->thread.trap_no = 19; 846 task->thread.trap_no = 19;
848 task->thread.error_code = 0; 847 task->thread.error_code = 0;
849 info.si_signo = SIGFPE; 848 info.si_signo = SIGFPE;
850 info.si_errno = 0; 849 info.si_errno = 0;
851 info.si_code = __SI_FAULT; 850 info.si_code = __SI_FAULT;
852 info.si_addr = rip; 851 info.si_addr = rip;
853 /* 852 /*
854 * The SIMD FPU exceptions are handled a little differently, as there 853 * The SIMD FPU exceptions are handled a little differently, as there
855 * is only a single status/control register. Thus, to determine which 854 * is only a single status/control register. Thus, to determine which
856 * unmasked exception was caught we must mask the exception mask bits 855 * unmasked exception was caught we must mask the exception mask bits
857 * at 0x1f80, and then use these to mask the exception bits at 0x3f. 856 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
858 */ 857 */
859 mxcsr = get_fpu_mxcsr(task); 858 mxcsr = get_fpu_mxcsr(task);
860 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { 859 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
861 case 0x000: 860 case 0x000:
862 default: 861 default:
863 break; 862 break;
864 case 0x001: /* Invalid Op */ 863 case 0x001: /* Invalid Op */
865 info.si_code = FPE_FLTINV; 864 info.si_code = FPE_FLTINV;
866 break; 865 break;
867 case 0x002: /* Denormalize */ 866 case 0x002: /* Denormalize */
868 case 0x010: /* Underflow */ 867 case 0x010: /* Underflow */
869 info.si_code = FPE_FLTUND; 868 info.si_code = FPE_FLTUND;
870 break; 869 break;
871 case 0x004: /* Zero Divide */ 870 case 0x004: /* Zero Divide */
872 info.si_code = FPE_FLTDIV; 871 info.si_code = FPE_FLTDIV;
873 break; 872 break;
874 case 0x008: /* Overflow */ 873 case 0x008: /* Overflow */
875 info.si_code = FPE_FLTOVF; 874 info.si_code = FPE_FLTOVF;
876 break; 875 break;
877 case 0x020: /* Precision */ 876 case 0x020: /* Precision */
878 info.si_code = FPE_FLTRES; 877 info.si_code = FPE_FLTRES;
879 break; 878 break;
880 } 879 }
881 force_sig_info(SIGFPE, &info, task); 880 force_sig_info(SIGFPE, &info, task);
882 } 881 }
883 882
884 asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) 883 asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
885 { 884 {
886 } 885 }
887 886
888 asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) 887 asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
889 { 888 {
890 } 889 }
891 890
892 asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) 891 asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
893 { 892 {
894 } 893 }
895 894
896 /* 895 /*
897 * 'math_state_restore()' saves the current math information in the 896 * 'math_state_restore()' saves the current math information in the
898 * old math state array, and gets the new ones from the current task 897 * old math state array, and gets the new ones from the current task
899 * 898 *
900 * Careful.. There are problems with IBM-designed IRQ13 behaviour. 899 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
901 * Don't touch unless you *really* know how it works. 900 * Don't touch unless you *really* know how it works.
902 */ 901 */
903 asmlinkage void math_state_restore(void) 902 asmlinkage void math_state_restore(void)
904 { 903 {
905 struct task_struct *me = current; 904 struct task_struct *me = current;
906 clts(); /* Allow maths ops (or we recurse) */ 905 clts(); /* Allow maths ops (or we recurse) */
907 906
908 if (!used_math()) 907 if (!used_math())
909 init_fpu(me); 908 init_fpu(me);
910 restore_fpu_checking(&me->thread.i387.fxsave); 909 restore_fpu_checking(&me->thread.i387.fxsave);
911 me->thread_info->status |= TS_USEDFPU; 910 me->thread_info->status |= TS_USEDFPU;
912 } 911 }
913 912
914 void do_call_debug(struct pt_regs *regs) 913 void do_call_debug(struct pt_regs *regs)
915 { 914 {
916 notify_die(DIE_CALL, "debug call", regs, 0, 255, SIGINT); 915 notify_die(DIE_CALL, "debug call", regs, 0, 255, SIGINT);
917 } 916 }
918 917
919 void __init trap_init(void) 918 void __init trap_init(void)
920 { 919 {
921 set_intr_gate(0,&divide_error); 920 set_intr_gate(0,&divide_error);
922 set_intr_gate_ist(1,&debug,DEBUG_STACK); 921 set_intr_gate_ist(1,&debug,DEBUG_STACK);
923 set_intr_gate_ist(2,&nmi,NMI_STACK); 922 set_intr_gate_ist(2,&nmi,NMI_STACK);
924 set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */ 923 set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */
925 set_system_gate(4,&overflow); /* int4 can be called from all */ 924 set_system_gate(4,&overflow); /* int4 can be called from all */
926 set_intr_gate(5,&bounds); 925 set_intr_gate(5,&bounds);
927 set_intr_gate(6,&invalid_op); 926 set_intr_gate(6,&invalid_op);
928 set_intr_gate(7,&device_not_available); 927 set_intr_gate(7,&device_not_available);
929 set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); 928 set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
930 set_intr_gate(9,&coprocessor_segment_overrun); 929 set_intr_gate(9,&coprocessor_segment_overrun);
931 set_intr_gate(10,&invalid_TSS); 930 set_intr_gate(10,&invalid_TSS);
932 set_intr_gate(11,&segment_not_present); 931 set_intr_gate(11,&segment_not_present);
933 set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); 932 set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK);
934 set_intr_gate(13,&general_protection); 933 set_intr_gate(13,&general_protection);
935 set_intr_gate(14,&page_fault); 934 set_intr_gate(14,&page_fault);
936 set_intr_gate(15,&spurious_interrupt_bug); 935 set_intr_gate(15,&spurious_interrupt_bug);
937 set_intr_gate(16,&coprocessor_error); 936 set_intr_gate(16,&coprocessor_error);
938 set_intr_gate(17,&alignment_check); 937 set_intr_gate(17,&alignment_check);
939 #ifdef CONFIG_X86_MCE 938 #ifdef CONFIG_X86_MCE
940 set_intr_gate_ist(18,&machine_check, MCE_STACK); 939 set_intr_gate_ist(18,&machine_check, MCE_STACK);
941 #endif 940 #endif
942 set_intr_gate(19,&simd_coprocessor_error); 941 set_intr_gate(19,&simd_coprocessor_error);
943 942
944 #ifdef CONFIG_IA32_EMULATION 943 #ifdef CONFIG_IA32_EMULATION
945 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 944 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
946 #endif 945 #endif
947 946
948 set_intr_gate(KDB_VECTOR, call_debug); 947 set_intr_gate(KDB_VECTOR, call_debug);
949 948
950 /* 949 /*
951 * Should be a barrier for any external CPU state. 950 * Should be a barrier for any external CPU state.
952 */ 951 */
953 cpu_init(); 952 cpu_init();
954 } 953 }
955 954
956 955
957 /* Actual parsing is done early in setup.c. */ 956 /* Actual parsing is done early in setup.c. */
958 static int __init oops_dummy(char *s) 957 static int __init oops_dummy(char *s)
959 { 958 {
960 panic_on_oops = 1; 959 panic_on_oops = 1;
961 return -1; 960 return -1;
962 } 961 }
963 __setup("oops=", oops_dummy); 962 __setup("oops=", oops_dummy);
964 963
965 static int __init kstack_setup(char *s) 964 static int __init kstack_setup(char *s)
966 { 965 {
967 kstack_depth_to_print = simple_strtoul(s,NULL,0); 966 kstack_depth_to_print = simple_strtoul(s,NULL,0);
968 return 0; 967 return 0;
969 } 968 }
970 __setup("kstack=", kstack_setup); 969 __setup("kstack=", kstack_setup);
971 970
972 971
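The two FPU handlers above share one decoding idea: mask the status word with the inverse of the control/mask bits, keep the low six exception bits, and map whichever single bit survives to an si_code. Below is a minimal stand-alone sketch of that x87 decode; fpe_name() and the sample cwd/swd values are invented for illustration and are not part of the kernel code.

#include <stdio.h>

/* Mirror of the switch in do_coprocessor_error():
 * unmasked-and-pending exception bits = swd & ~cwd & 0x3f. */
static const char *fpe_name(unsigned short cwd, unsigned short swd)
{
	switch (swd & ~cwd & 0x3f) {
	case 0x001: return "FPE_FLTINV";	/* Invalid Op */
	case 0x002:				/* Denormalize */
	case 0x010: return "FPE_FLTUND";	/* Underflow */
	case 0x004: return "FPE_FLTDIV";	/* Zero Divide */
	case 0x008: return "FPE_FLTOVF";	/* Overflow */
	case 0x020: return "FPE_FLTRES";	/* Precision */
	default:    return "FPE (unspecified)";
	}
}

int main(void)
{
	/* made-up words: divide-by-zero unmasked in cwd, ZE pending in swd */
	printf("%s\n", fpe_name(0x037b, 0x0004));	/* prints FPE_FLTDIV */
	return 0;
}

The SIMD handler does the same thing after first shifting the MXCSR mask bits (0x1f80) down by 7 so they line up with the exception bits at 0x3f.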
arch/x86_64/kernel/x8664_ksyms.c
1 #include <linux/config.h> 1 #include <linux/config.h>
2 #include <linux/module.h> 2 #include <linux/module.h>
3 #include <linux/smp.h> 3 #include <linux/smp.h>
4 #include <linux/user.h> 4 #include <linux/user.h>
5 #include <linux/sched.h> 5 #include <linux/sched.h>
6 #include <linux/in6.h> 6 #include <linux/in6.h>
7 #include <linux/interrupt.h> 7 #include <linux/interrupt.h>
8 #include <linux/smp_lock.h> 8 #include <linux/smp_lock.h>
9 #include <linux/pm.h> 9 #include <linux/pm.h>
10 #include <linux/pci.h> 10 #include <linux/pci.h>
11 #include <linux/apm_bios.h> 11 #include <linux/apm_bios.h>
12 #include <linux/kernel.h> 12 #include <linux/kernel.h>
13 #include <linux/string.h> 13 #include <linux/string.h>
14 #include <linux/syscalls.h> 14 #include <linux/syscalls.h>
15 #include <linux/tty.h> 15 #include <linux/tty.h>
16 16
17 #include <asm/semaphore.h> 17 #include <asm/semaphore.h>
18 #include <asm/processor.h> 18 #include <asm/processor.h>
19 #include <asm/i387.h> 19 #include <asm/i387.h>
20 #include <asm/uaccess.h> 20 #include <asm/uaccess.h>
21 #include <asm/checksum.h> 21 #include <asm/checksum.h>
22 #include <asm/io.h> 22 #include <asm/io.h>
23 #include <asm/delay.h> 23 #include <asm/delay.h>
24 #include <asm/irq.h> 24 #include <asm/irq.h>
25 #include <asm/mmx.h> 25 #include <asm/mmx.h>
26 #include <asm/desc.h> 26 #include <asm/desc.h>
27 #include <asm/pgtable.h> 27 #include <asm/pgtable.h>
28 #include <asm/pgalloc.h> 28 #include <asm/pgalloc.h>
29 #include <asm/nmi.h> 29 #include <asm/nmi.h>
30 #include <asm/kdebug.h> 30 #include <asm/kdebug.h>
31 #include <asm/unistd.h> 31 #include <asm/unistd.h>
32 #include <asm/tlbflush.h> 32 #include <asm/tlbflush.h>
33 #include <asm/kdebug.h> 33 #include <asm/kdebug.h>
34 34
35 extern spinlock_t rtc_lock; 35 extern spinlock_t rtc_lock;
36 36
37 #ifdef CONFIG_SMP 37 #ifdef CONFIG_SMP
38 extern void __write_lock_failed(rwlock_t *rw); 38 extern void __write_lock_failed(rwlock_t *rw);
39 extern void __read_lock_failed(rwlock_t *rw); 39 extern void __read_lock_failed(rwlock_t *rw);
40 #endif 40 #endif
41 41
42 #if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) 42 #if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
43 extern struct drive_info_struct drive_info; 43 extern struct drive_info_struct drive_info;
44 EXPORT_SYMBOL(drive_info); 44 EXPORT_SYMBOL(drive_info);
45 #endif 45 #endif
46 46
47 extern unsigned long get_cmos_time(void); 47 extern unsigned long get_cmos_time(void);
48 48
49 /* platform dependent support */ 49 /* platform dependent support */
50 EXPORT_SYMBOL(boot_cpu_data); 50 EXPORT_SYMBOL(boot_cpu_data);
51 //EXPORT_SYMBOL(dump_fpu); 51 //EXPORT_SYMBOL(dump_fpu);
52 EXPORT_SYMBOL(__ioremap); 52 EXPORT_SYMBOL(__ioremap);
53 EXPORT_SYMBOL(ioremap_nocache); 53 EXPORT_SYMBOL(ioremap_nocache);
54 EXPORT_SYMBOL(iounmap); 54 EXPORT_SYMBOL(iounmap);
55 EXPORT_SYMBOL(kernel_thread); 55 EXPORT_SYMBOL(kernel_thread);
56 EXPORT_SYMBOL(pm_idle); 56 EXPORT_SYMBOL(pm_idle);
57 EXPORT_SYMBOL(pm_power_off); 57 EXPORT_SYMBOL(pm_power_off);
58 EXPORT_SYMBOL(get_cmos_time); 58 EXPORT_SYMBOL(get_cmos_time);
59 59
60 EXPORT_SYMBOL(__down_failed); 60 EXPORT_SYMBOL(__down_failed);
61 EXPORT_SYMBOL(__down_failed_interruptible); 61 EXPORT_SYMBOL(__down_failed_interruptible);
62 EXPORT_SYMBOL(__down_failed_trylock); 62 EXPORT_SYMBOL(__down_failed_trylock);
63 EXPORT_SYMBOL(__up_wakeup); 63 EXPORT_SYMBOL(__up_wakeup);
64 /* Networking helper routines. */ 64 /* Networking helper routines. */
65 EXPORT_SYMBOL(csum_partial_copy_nocheck); 65 EXPORT_SYMBOL(csum_partial_copy_nocheck);
66 EXPORT_SYMBOL(ip_compute_csum); 66 EXPORT_SYMBOL(ip_compute_csum);
67 /* Delay loops */ 67 /* Delay loops */
68 EXPORT_SYMBOL(__udelay); 68 EXPORT_SYMBOL(__udelay);
69 EXPORT_SYMBOL(__ndelay); 69 EXPORT_SYMBOL(__ndelay);
70 EXPORT_SYMBOL(__delay); 70 EXPORT_SYMBOL(__delay);
71 EXPORT_SYMBOL(__const_udelay); 71 EXPORT_SYMBOL(__const_udelay);
72 72
73 EXPORT_SYMBOL(__get_user_1); 73 EXPORT_SYMBOL(__get_user_1);
74 EXPORT_SYMBOL(__get_user_2); 74 EXPORT_SYMBOL(__get_user_2);
75 EXPORT_SYMBOL(__get_user_4); 75 EXPORT_SYMBOL(__get_user_4);
76 EXPORT_SYMBOL(__get_user_8); 76 EXPORT_SYMBOL(__get_user_8);
77 EXPORT_SYMBOL(__put_user_1); 77 EXPORT_SYMBOL(__put_user_1);
78 EXPORT_SYMBOL(__put_user_2); 78 EXPORT_SYMBOL(__put_user_2);
79 EXPORT_SYMBOL(__put_user_4); 79 EXPORT_SYMBOL(__put_user_4);
80 EXPORT_SYMBOL(__put_user_8); 80 EXPORT_SYMBOL(__put_user_8);
81 81
82 EXPORT_SYMBOL(strncpy_from_user); 82 EXPORT_SYMBOL(strncpy_from_user);
83 EXPORT_SYMBOL(__strncpy_from_user); 83 EXPORT_SYMBOL(__strncpy_from_user);
84 EXPORT_SYMBOL(clear_user); 84 EXPORT_SYMBOL(clear_user);
85 EXPORT_SYMBOL(__clear_user); 85 EXPORT_SYMBOL(__clear_user);
86 EXPORT_SYMBOL(copy_user_generic); 86 EXPORT_SYMBOL(copy_user_generic);
87 EXPORT_SYMBOL(copy_from_user); 87 EXPORT_SYMBOL(copy_from_user);
88 EXPORT_SYMBOL(copy_to_user); 88 EXPORT_SYMBOL(copy_to_user);
89 EXPORT_SYMBOL(copy_in_user); 89 EXPORT_SYMBOL(copy_in_user);
90 EXPORT_SYMBOL(strnlen_user); 90 EXPORT_SYMBOL(strnlen_user);
91 91
92 #ifdef CONFIG_PCI 92 #ifdef CONFIG_PCI
93 EXPORT_SYMBOL(pci_mem_start); 93 EXPORT_SYMBOL(pci_mem_start);
94 #endif 94 #endif
95 95
96 EXPORT_SYMBOL(copy_page); 96 EXPORT_SYMBOL(copy_page);
97 EXPORT_SYMBOL(clear_page); 97 EXPORT_SYMBOL(clear_page);
98 98
99 EXPORT_SYMBOL(cpu_pda); 99 EXPORT_SYMBOL(_cpu_pda);
100 #ifdef CONFIG_SMP 100 #ifdef CONFIG_SMP
101 EXPORT_SYMBOL(cpu_data); 101 EXPORT_SYMBOL(cpu_data);
102 EXPORT_SYMBOL(__write_lock_failed); 102 EXPORT_SYMBOL(__write_lock_failed);
103 EXPORT_SYMBOL(__read_lock_failed); 103 EXPORT_SYMBOL(__read_lock_failed);
104 104
105 EXPORT_SYMBOL(smp_call_function); 105 EXPORT_SYMBOL(smp_call_function);
106 EXPORT_SYMBOL(cpu_callout_map); 106 EXPORT_SYMBOL(cpu_callout_map);
107 #endif 107 #endif
108 108
109 #ifdef CONFIG_VT 109 #ifdef CONFIG_VT
110 EXPORT_SYMBOL(screen_info); 110 EXPORT_SYMBOL(screen_info);
111 #endif 111 #endif
112 112
113 EXPORT_SYMBOL(get_wchan); 113 EXPORT_SYMBOL(get_wchan);
114 114
115 EXPORT_SYMBOL(rtc_lock); 115 EXPORT_SYMBOL(rtc_lock);
116 116
117 EXPORT_SYMBOL_GPL(set_nmi_callback); 117 EXPORT_SYMBOL_GPL(set_nmi_callback);
118 EXPORT_SYMBOL_GPL(unset_nmi_callback); 118 EXPORT_SYMBOL_GPL(unset_nmi_callback);
119 119
120 /* Export string functions. We normally rely on gcc builtin for most of these, 120 /* Export string functions. We normally rely on gcc builtin for most of these,
121 but gcc sometimes decides not to inline them. */ 121 but gcc sometimes decides not to inline them. */
122 #undef memcpy 122 #undef memcpy
123 #undef memset 123 #undef memset
124 #undef memmove 124 #undef memmove
125 #undef strlen 125 #undef strlen
126 126
127 extern void * memset(void *,int,__kernel_size_t); 127 extern void * memset(void *,int,__kernel_size_t);
128 extern size_t strlen(const char *); 128 extern size_t strlen(const char *);
129 extern void * memmove(void * dest,const void *src,size_t count); 129 extern void * memmove(void * dest,const void *src,size_t count);
130 extern void * memcpy(void *,const void *,__kernel_size_t); 130 extern void * memcpy(void *,const void *,__kernel_size_t);
131 extern void * __memcpy(void *,const void *,__kernel_size_t); 131 extern void * __memcpy(void *,const void *,__kernel_size_t);
132 132
133 EXPORT_SYMBOL(memset); 133 EXPORT_SYMBOL(memset);
134 EXPORT_SYMBOL(strlen); 134 EXPORT_SYMBOL(strlen);
135 EXPORT_SYMBOL(memmove); 135 EXPORT_SYMBOL(memmove);
136 EXPORT_SYMBOL(memcpy); 136 EXPORT_SYMBOL(memcpy);
137 EXPORT_SYMBOL(__memcpy); 137 EXPORT_SYMBOL(__memcpy);
138 138
139 #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM 139 #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
140 /* prototypes are wrong, these are assembly with custom calling functions */ 140 /* prototypes are wrong, these are assembly with custom calling functions */
141 extern void rwsem_down_read_failed_thunk(void); 141 extern void rwsem_down_read_failed_thunk(void);
142 extern void rwsem_wake_thunk(void); 142 extern void rwsem_wake_thunk(void);
143 extern void rwsem_downgrade_thunk(void); 143 extern void rwsem_downgrade_thunk(void);
144 extern void rwsem_down_write_failed_thunk(void); 144 extern void rwsem_down_write_failed_thunk(void);
145 EXPORT_SYMBOL(rwsem_down_read_failed_thunk); 145 EXPORT_SYMBOL(rwsem_down_read_failed_thunk);
146 EXPORT_SYMBOL(rwsem_wake_thunk); 146 EXPORT_SYMBOL(rwsem_wake_thunk);
147 EXPORT_SYMBOL(rwsem_downgrade_thunk); 147 EXPORT_SYMBOL(rwsem_downgrade_thunk);
148 EXPORT_SYMBOL(rwsem_down_write_failed_thunk); 148 EXPORT_SYMBOL(rwsem_down_write_failed_thunk);
149 #endif 149 #endif
150 150
151 EXPORT_SYMBOL(empty_zero_page); 151 EXPORT_SYMBOL(empty_zero_page);
152 152
153 EXPORT_SYMBOL(die_chain); 153 EXPORT_SYMBOL(die_chain);
154 EXPORT_SYMBOL(register_die_notifier); 154 EXPORT_SYMBOL(register_die_notifier);
155 155
156 #ifdef CONFIG_SMP 156 #ifdef CONFIG_SMP
157 EXPORT_SYMBOL(cpu_sibling_map); 157 EXPORT_SYMBOL(cpu_sibling_map);
158 EXPORT_SYMBOL(smp_num_siblings); 158 EXPORT_SYMBOL(smp_num_siblings);
159 #endif 159 #endif
160 160
161 extern void do_softirq_thunk(void); 161 extern void do_softirq_thunk(void);
162 EXPORT_SYMBOL(do_softirq_thunk); 162 EXPORT_SYMBOL(do_softirq_thunk);
163 163
164 #ifdef CONFIG_BUG 164 #ifdef CONFIG_BUG
165 EXPORT_SYMBOL(out_of_line_bug); 165 EXPORT_SYMBOL(out_of_line_bug);
166 #endif 166 #endif
167 167
168 EXPORT_SYMBOL(init_level4_pgt); 168 EXPORT_SYMBOL(init_level4_pgt);
169 169
170 extern unsigned long __supported_pte_mask; 170 extern unsigned long __supported_pte_mask;
171 EXPORT_SYMBOL(__supported_pte_mask); 171 EXPORT_SYMBOL(__supported_pte_mask);
172 172
173 #ifdef CONFIG_SMP 173 #ifdef CONFIG_SMP
174 EXPORT_SYMBOL(flush_tlb_page); 174 EXPORT_SYMBOL(flush_tlb_page);
175 #endif 175 #endif
176 176
177 EXPORT_SYMBOL(cpu_khz); 177 EXPORT_SYMBOL(cpu_khz);
178 178
179 EXPORT_SYMBOL(load_gs_index); 179 EXPORT_SYMBOL(load_gs_index);
180 180
181 181
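The export list above now publishes _cpu_pda rather than the old array name, so out-of-tree code that already goes through the accessor macro keeps linking unchanged. A hypothetical module body showing that (the module name, function names, and printk text are illustrative only, not anything in this patch):

#include <linux/module.h>
#include <linux/kernel.h>
#include <asm/pda.h>		/* cpu_pda() accessor over the exported _cpu_pda[] */

static int __init pda_peek_init(void)
{
	/* field access goes through the macro, never the raw array name */
	printk(KERN_INFO "cpu0: node %d, irqcount %d\n",
	       cpu_pda(0)->nodenumber, cpu_pda(0)->irqcount);
	return 0;
}

static void __exit pda_peek_exit(void)
{
}

module_init(pda_peek_init);
module_exit(pda_peek_exit);
MODULE_LICENSE("GPL");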
arch/x86_64/mm/numa.c
1 /* 1 /*
2 * Generic VM initialization for x86-64 NUMA setups. 2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */ 4 */
5 #include <linux/kernel.h> 5 #include <linux/kernel.h>
6 #include <linux/mm.h> 6 #include <linux/mm.h>
7 #include <linux/string.h> 7 #include <linux/string.h>
8 #include <linux/init.h> 8 #include <linux/init.h>
9 #include <linux/bootmem.h> 9 #include <linux/bootmem.h>
10 #include <linux/mmzone.h> 10 #include <linux/mmzone.h>
11 #include <linux/ctype.h> 11 #include <linux/ctype.h>
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/nodemask.h> 13 #include <linux/nodemask.h>
14 14
15 #include <asm/e820.h> 15 #include <asm/e820.h>
16 #include <asm/proto.h> 16 #include <asm/proto.h>
17 #include <asm/dma.h> 17 #include <asm/dma.h>
18 #include <asm/numa.h> 18 #include <asm/numa.h>
19 #include <asm/acpi.h> 19 #include <asm/acpi.h>
20 20
21 #ifndef Dprintk 21 #ifndef Dprintk
22 #define Dprintk(x...) 22 #define Dprintk(x...)
23 #endif 23 #endif
24 24
25 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 25 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
26 bootmem_data_t plat_node_bdata[MAX_NUMNODES]; 26 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27 27
28 int memnode_shift; 28 int memnode_shift;
29 u8 memnodemap[NODEMAPSIZE]; 29 u8 memnodemap[NODEMAPSIZE];
30 30
31 unsigned char cpu_to_node[NR_CPUS] __read_mostly = { 31 unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
32 [0 ... NR_CPUS-1] = NUMA_NO_NODE 32 [0 ... NR_CPUS-1] = NUMA_NO_NODE
33 }; 33 };
34 unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 34 unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
36 }; 36 };
37 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; 37 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
38 38
39 int numa_off __initdata; 39 int numa_off __initdata;
40 40
41 41
42 /* 42 /*
43 * Given a shift value, try to populate memnodemap[] 43 * Given a shift value, try to populate memnodemap[]
44 * Returns : 44 * Returns :
45 * 1 if OK 45 * 1 if OK
46 * 0 if memnodemap[] too small (or shift too small) 46 * 0 if memnodemap[] too small (or shift too small)
47 * -1 if node overlap or lost ram (shift too big) 47 * -1 if node overlap or lost ram (shift too big)
48 */ 48 */
49 static int __init 49 static int __init
50 populate_memnodemap(const struct node *nodes, int numnodes, int shift) 50 populate_memnodemap(const struct node *nodes, int numnodes, int shift)
51 { 51 {
52 int i; 52 int i;
53 int res = -1; 53 int res = -1;
54 unsigned long addr, end; 54 unsigned long addr, end;
55 55
56 if (shift >= 64) 56 if (shift >= 64)
57 return -1; 57 return -1;
58 memset(memnodemap, 0xff, sizeof(memnodemap)); 58 memset(memnodemap, 0xff, sizeof(memnodemap));
59 for (i = 0; i < numnodes; i++) { 59 for (i = 0; i < numnodes; i++) {
60 addr = nodes[i].start; 60 addr = nodes[i].start;
61 end = nodes[i].end; 61 end = nodes[i].end;
62 if (addr >= end) 62 if (addr >= end)
63 continue; 63 continue;
64 if ((end >> shift) >= NODEMAPSIZE) 64 if ((end >> shift) >= NODEMAPSIZE)
65 return 0; 65 return 0;
66 do { 66 do {
67 if (memnodemap[addr >> shift] != 0xff) 67 if (memnodemap[addr >> shift] != 0xff)
68 return -1; 68 return -1;
69 memnodemap[addr >> shift] = i; 69 memnodemap[addr >> shift] = i;
70 addr += (1UL << shift); 70 addr += (1UL << shift);
71 } while (addr < end); 71 } while (addr < end);
72 res = 1; 72 res = 1;
73 } 73 }
74 return res; 74 return res;
75 } 75 }
76 76
77 int __init compute_hash_shift(struct node *nodes, int numnodes) 77 int __init compute_hash_shift(struct node *nodes, int numnodes)
78 { 78 {
79 int shift = 20; 79 int shift = 20;
80 80
81 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) 81 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
82 shift++; 82 shift++;
83 83
84 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", 84 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
85 shift); 85 shift);
86 86
87 if (populate_memnodemap(nodes, numnodes, shift) != 1) { 87 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
88 printk(KERN_INFO 88 printk(KERN_INFO
89 "Your memory is not aligned you need to rebuild your kernel " 89 "Your memory is not aligned you need to rebuild your kernel "
90 "with a bigger NODEMAPSIZE shift=%d\n", 90 "with a bigger NODEMAPSIZE shift=%d\n",
91 shift); 91 shift);
92 return -1; 92 return -1;
93 } 93 }
94 return shift; 94 return shift;
95 } 95 }
96 96
97 #ifdef CONFIG_SPARSEMEM 97 #ifdef CONFIG_SPARSEMEM
98 int early_pfn_to_nid(unsigned long pfn) 98 int early_pfn_to_nid(unsigned long pfn)
99 { 99 {
100 return phys_to_nid(pfn << PAGE_SHIFT); 100 return phys_to_nid(pfn << PAGE_SHIFT);
101 } 101 }
102 #endif 102 #endif
103 103
104 /* Initialize bootmem allocator for a node */ 104 /* Initialize bootmem allocator for a node */
105 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 105 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
106 { 106 {
107 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 107 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
108 unsigned long nodedata_phys; 108 unsigned long nodedata_phys;
109 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 109 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
110 110
111 start = round_up(start, ZONE_ALIGN); 111 start = round_up(start, ZONE_ALIGN);
112 112
113 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); 113 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
114 114
115 start_pfn = start >> PAGE_SHIFT; 115 start_pfn = start >> PAGE_SHIFT;
116 end_pfn = end >> PAGE_SHIFT; 116 end_pfn = end >> PAGE_SHIFT;
117 117
118 nodedata_phys = find_e820_area(start, end, pgdat_size); 118 nodedata_phys = find_e820_area(start, end, pgdat_size);
119 if (nodedata_phys == -1L) 119 if (nodedata_phys == -1L)
120 panic("Cannot find memory pgdat in node %d\n", nodeid); 120 panic("Cannot find memory pgdat in node %d\n", nodeid);
121 121
122 Dprintk("nodedata_phys %lx\n", nodedata_phys); 122 Dprintk("nodedata_phys %lx\n", nodedata_phys);
123 123
124 node_data[nodeid] = phys_to_virt(nodedata_phys); 124 node_data[nodeid] = phys_to_virt(nodedata_phys);
125 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 125 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
126 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 126 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
127 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 127 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
128 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 128 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
129 129
130 /* Find a place for the bootmem map */ 130 /* Find a place for the bootmem map */
131 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 131 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
132 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 132 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
133 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT); 133 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
134 if (bootmap_start == -1L) 134 if (bootmap_start == -1L)
135 panic("Not enough continuous space for bootmap on node %d", nodeid); 135 panic("Not enough continuous space for bootmap on node %d", nodeid);
136 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 136 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
137 137
138 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 138 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
139 bootmap_start >> PAGE_SHIFT, 139 bootmap_start >> PAGE_SHIFT,
140 start_pfn, end_pfn); 140 start_pfn, end_pfn);
141 141
142 e820_bootmem_free(NODE_DATA(nodeid), start, end); 142 e820_bootmem_free(NODE_DATA(nodeid), start, end);
143 143
144 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 144 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
145 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); 145 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
146 node_set_online(nodeid); 146 node_set_online(nodeid);
147 } 147 }
148 148
149 /* Initialize final allocator for a zone */ 149 /* Initialize final allocator for a zone */
150 void __init setup_node_zones(int nodeid) 150 void __init setup_node_zones(int nodeid)
151 { 151 {
152 unsigned long start_pfn, end_pfn; 152 unsigned long start_pfn, end_pfn;
153 unsigned long zones[MAX_NR_ZONES]; 153 unsigned long zones[MAX_NR_ZONES];
154 unsigned long holes[MAX_NR_ZONES]; 154 unsigned long holes[MAX_NR_ZONES];
155 155
156 start_pfn = node_start_pfn(nodeid); 156 start_pfn = node_start_pfn(nodeid);
157 end_pfn = node_end_pfn(nodeid); 157 end_pfn = node_end_pfn(nodeid);
158 158
159 Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n", 159 Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
160 nodeid, start_pfn, end_pfn); 160 nodeid, start_pfn, end_pfn);
161 161
162 size_zones(zones, holes, start_pfn, end_pfn); 162 size_zones(zones, holes, start_pfn, end_pfn);
163 free_area_init_node(nodeid, NODE_DATA(nodeid), zones, 163 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
164 start_pfn, holes); 164 start_pfn, holes);
165 } 165 }
166 166
167 void __init numa_init_array(void) 167 void __init numa_init_array(void)
168 { 168 {
169 int rr, i; 169 int rr, i;
170 /* There are unfortunately some poorly designed mainboards around 170 /* There are unfortunately some poorly designed mainboards around
171 that only connect memory to a single CPU. This breaks the 1:1 cpu->node 171 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
172 mapping. To avoid this fill in the mapping for all possible 172 mapping. To avoid this fill in the mapping for all possible
173 CPUs, as the number of CPUs is not known yet. 173 CPUs, as the number of CPUs is not known yet.
174 We round robin the existing nodes. */ 174 We round robin the existing nodes. */
175 rr = first_node(node_online_map); 175 rr = first_node(node_online_map);
176 for (i = 0; i < NR_CPUS; i++) { 176 for (i = 0; i < NR_CPUS; i++) {
177 if (cpu_to_node[i] != NUMA_NO_NODE) 177 if (cpu_to_node[i] != NUMA_NO_NODE)
178 continue; 178 continue;
179 numa_set_node(i, rr); 179 numa_set_node(i, rr);
180 rr = next_node(rr, node_online_map); 180 rr = next_node(rr, node_online_map);
181 if (rr == MAX_NUMNODES) 181 if (rr == MAX_NUMNODES)
182 rr = first_node(node_online_map); 182 rr = first_node(node_online_map);
183 } 183 }
184 184
185 } 185 }
186 186
187 #ifdef CONFIG_NUMA_EMU 187 #ifdef CONFIG_NUMA_EMU
188 int numa_fake __initdata = 0; 188 int numa_fake __initdata = 0;
189 189
190 /* Numa emulation */ 190 /* Numa emulation */
191 static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 191 static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
192 { 192 {
193 int i; 193 int i;
194 struct node nodes[MAX_NUMNODES]; 194 struct node nodes[MAX_NUMNODES];
195 unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; 195 unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
196 196
197 /* Kludge needed for the hash function */ 197 /* Kludge needed for the hash function */
198 if (hweight64(sz) > 1) { 198 if (hweight64(sz) > 1) {
199 unsigned long x = 1; 199 unsigned long x = 1;
200 while ((x << 1) < sz) 200 while ((x << 1) < sz)
201 x <<= 1; 201 x <<= 1;
202 if (x < sz/2) 202 if (x < sz/2)
203 printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); 203 printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
204 sz = x; 204 sz = x;
205 } 205 }
206 206
207 memset(&nodes,0,sizeof(nodes)); 207 memset(&nodes,0,sizeof(nodes));
208 for (i = 0; i < numa_fake; i++) { 208 for (i = 0; i < numa_fake; i++) {
209 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; 209 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
210 if (i == numa_fake-1) 210 if (i == numa_fake-1)
211 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; 211 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
212 nodes[i].end = nodes[i].start + sz; 212 nodes[i].end = nodes[i].start + sz;
213 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", 213 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
214 i, 214 i,
215 nodes[i].start, nodes[i].end, 215 nodes[i].start, nodes[i].end,
216 (nodes[i].end - nodes[i].start) >> 20); 216 (nodes[i].end - nodes[i].start) >> 20);
217 node_set_online(i); 217 node_set_online(i);
218 } 218 }
219 memnode_shift = compute_hash_shift(nodes, numa_fake); 219 memnode_shift = compute_hash_shift(nodes, numa_fake);
220 if (memnode_shift < 0) { 220 if (memnode_shift < 0) {
221 memnode_shift = 0; 221 memnode_shift = 0;
222 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); 222 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
223 return -1; 223 return -1;
224 } 224 }
225 for_each_online_node(i) 225 for_each_online_node(i)
226 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 226 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
227 numa_init_array(); 227 numa_init_array();
228 return 0; 228 return 0;
229 } 229 }
230 #endif 230 #endif
231 231
232 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 232 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
233 { 233 {
234 int i; 234 int i;
235 235
236 #ifdef CONFIG_NUMA_EMU 236 #ifdef CONFIG_NUMA_EMU
237 if (numa_fake && !numa_emulation(start_pfn, end_pfn)) 237 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
238 return; 238 return;
239 #endif 239 #endif
240 240
241 #ifdef CONFIG_ACPI_NUMA 241 #ifdef CONFIG_ACPI_NUMA
242 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 242 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
243 end_pfn << PAGE_SHIFT)) 243 end_pfn << PAGE_SHIFT))
244 return; 244 return;
245 #endif 245 #endif
246 246
247 #ifdef CONFIG_K8_NUMA 247 #ifdef CONFIG_K8_NUMA
248 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) 248 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
249 return; 249 return;
250 #endif 250 #endif
251 printk(KERN_INFO "%s\n", 251 printk(KERN_INFO "%s\n",
252 numa_off ? "NUMA turned off" : "No NUMA configuration found"); 252 numa_off ? "NUMA turned off" : "No NUMA configuration found");
253 253
254 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 254 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
255 start_pfn << PAGE_SHIFT, 255 start_pfn << PAGE_SHIFT,
256 end_pfn << PAGE_SHIFT); 256 end_pfn << PAGE_SHIFT);
257 /* setup dummy node covering all memory */ 257 /* setup dummy node covering all memory */
258 memnode_shift = 63; 258 memnode_shift = 63;
259 memnodemap[0] = 0; 259 memnodemap[0] = 0;
260 nodes_clear(node_online_map); 260 nodes_clear(node_online_map);
261 node_set_online(0); 261 node_set_online(0);
262 for (i = 0; i < NR_CPUS; i++) 262 for (i = 0; i < NR_CPUS; i++)
263 numa_set_node(i, 0); 263 numa_set_node(i, 0);
264 node_to_cpumask[0] = cpumask_of_cpu(0); 264 node_to_cpumask[0] = cpumask_of_cpu(0);
265 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 265 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
266 } 266 }
267 267
268 __cpuinit void numa_add_cpu(int cpu) 268 __cpuinit void numa_add_cpu(int cpu)
269 { 269 {
270 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); 270 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
271 } 271 }
272 272
273 void __cpuinit numa_set_node(int cpu, int node) 273 void __cpuinit numa_set_node(int cpu, int node)
274 { 274 {
275 cpu_pda[cpu].nodenumber = node; 275 cpu_pda(cpu)->nodenumber = node;
276 cpu_to_node[cpu] = node; 276 cpu_to_node[cpu] = node;
277 } 277 }
278 278
279 unsigned long __init numa_free_all_bootmem(void) 279 unsigned long __init numa_free_all_bootmem(void)
280 { 280 {
281 int i; 281 int i;
282 unsigned long pages = 0; 282 unsigned long pages = 0;
283 for_each_online_node(i) { 283 for_each_online_node(i) {
284 pages += free_all_bootmem_node(NODE_DATA(i)); 284 pages += free_all_bootmem_node(NODE_DATA(i));
285 } 285 }
286 return pages; 286 return pages;
287 } 287 }
288 288
289 #ifdef CONFIG_SPARSEMEM 289 #ifdef CONFIG_SPARSEMEM
290 static void __init arch_sparse_init(void) 290 static void __init arch_sparse_init(void)
291 { 291 {
292 int i; 292 int i;
293 293
294 for_each_online_node(i) 294 for_each_online_node(i)
295 memory_present(i, node_start_pfn(i), node_end_pfn(i)); 295 memory_present(i, node_start_pfn(i), node_end_pfn(i));
296 296
297 sparse_init(); 297 sparse_init();
298 } 298 }
299 #else 299 #else
300 #define arch_sparse_init() do {} while (0) 300 #define arch_sparse_init() do {} while (0)
301 #endif 301 #endif
302 302
303 void __init paging_init(void) 303 void __init paging_init(void)
304 { 304 {
305 int i; 305 int i;
306 306
307 arch_sparse_init(); 307 arch_sparse_init();
308 308
309 for_each_online_node(i) { 309 for_each_online_node(i) {
310 setup_node_zones(i); 310 setup_node_zones(i);
311 } 311 }
312 } 312 }
313 313
314 /* [numa=off] */ 314 /* [numa=off] */
315 __init int numa_setup(char *opt) 315 __init int numa_setup(char *opt)
316 { 316 {
317 if (!strncmp(opt,"off",3)) 317 if (!strncmp(opt,"off",3))
318 numa_off = 1; 318 numa_off = 1;
319 #ifdef CONFIG_NUMA_EMU 319 #ifdef CONFIG_NUMA_EMU
320 if(!strncmp(opt, "fake=", 5)) { 320 if(!strncmp(opt, "fake=", 5)) {
321 numa_fake = simple_strtoul(opt+5,NULL,0); 321 numa_fake = simple_strtoul(opt+5,NULL,0);
322 if (numa_fake >= MAX_NUMNODES) 322 if (numa_fake >= MAX_NUMNODES)
323 numa_fake = MAX_NUMNODES; 323 numa_fake = MAX_NUMNODES;
324 } 324 }
325 #endif 325 #endif
326 #ifdef CONFIG_ACPI_NUMA 326 #ifdef CONFIG_ACPI_NUMA
327 if (!strncmp(opt,"noacpi",6)) 327 if (!strncmp(opt,"noacpi",6))
328 acpi_numa = -1; 328 acpi_numa = -1;
329 #endif 329 #endif
330 return 1; 330 return 1;
331 } 331 }
332 332
333 /* 333 /*
334 * Setup early cpu_to_node. 334 * Setup early cpu_to_node.
335 * 335 *
336 * Populate cpu_to_node[] only if x86_cpu_to_apicid[], 336 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
337 * and apicid_to_node[] tables have valid entries for a CPU. 337 * and apicid_to_node[] tables have valid entries for a CPU.
338 * This means we skip cpu_to_node[] initialisation for NUMA 338 * This means we skip cpu_to_node[] initialisation for NUMA
339 * emulation and faking node case (when running a kernel compiled 339 * emulation and faking node case (when running a kernel compiled
340 * for NUMA on a non NUMA box), which is OK as cpu_to_node[] 340 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
341 * is already initialized in a round robin manner at numa_init_array, 341 * is already initialized in a round robin manner at numa_init_array,
342 * prior to this call, and this initialization is good enough 342 * prior to this call, and this initialization is good enough
343 * for the fake NUMA cases. 343 * for the fake NUMA cases.
344 */ 344 */
345 void __init init_cpu_to_node(void) 345 void __init init_cpu_to_node(void)
346 { 346 {
347 int i; 347 int i;
348 for (i = 0; i < NR_CPUS; i++) { 348 for (i = 0; i < NR_CPUS; i++) {
349 u8 apicid = x86_cpu_to_apicid[i]; 349 u8 apicid = x86_cpu_to_apicid[i];
350 if (apicid == BAD_APICID) 350 if (apicid == BAD_APICID)
351 continue; 351 continue;
352 if (apicid_to_node[apicid] == NUMA_NO_NODE) 352 if (apicid_to_node[apicid] == NUMA_NO_NODE)
353 continue; 353 continue;
354 cpu_to_node[i] = apicid_to_node[apicid]; 354 cpu_to_node[i] = apicid_to_node[apicid];
355 } 355 }
356 } 356 }
357 357
358 EXPORT_SYMBOL(cpu_to_node); 358 EXPORT_SYMBOL(cpu_to_node);
359 EXPORT_SYMBOL(node_to_cpumask); 359 EXPORT_SYMBOL(node_to_cpumask);
360 EXPORT_SYMBOL(memnode_shift); 360 EXPORT_SYMBOL(memnode_shift);
361 EXPORT_SYMBOL(memnodemap); 361 EXPORT_SYMBOL(memnodemap);
362 EXPORT_SYMBOL(node_data); 362 EXPORT_SYMBOL(node_data);
363 363
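compute_hash_shift() above picks the granularity of a flat physical-address-to-node table: each memnodemap[] entry covers a 1<<shift byte chunk, and the shift is grown until a larger one would overlap nodes or lose RAM. The resulting lookup is a single shift plus array index. A stand-alone sketch of that lookup, with an invented two-node layout and an invented chunk size; phys_to_nid() here is a local helper standing in for the kernel's:

#include <stdio.h>

#define NODEMAPSIZE	0xff
static unsigned char memnodemap[NODEMAPSIZE];
static int memnode_shift = 24;			/* 16MB chunks, made up */

static int phys_to_nid(unsigned long addr)
{
	return memnodemap[addr >> memnode_shift];
}

int main(void)
{
	unsigned long i;

	/* fake layout: node 0 owns 0-1GB, node 1 owns everything above */
	for (i = 0; i < NODEMAPSIZE; i++)
		memnodemap[i] = (i << memnode_shift) >= (1UL << 30);

	printf("0x20000000 -> node %d\n", phys_to_nid(0x20000000UL));	/* 0 */
	printf("0x60000000 -> node %d\n", phys_to_nid(0x60000000UL));	/* 1 */
	return 0;
}

numa_set_node() in the hunk above is then the per-CPU side of the same bookkeeping, and it is one of the callers converted from cpu_pda[cpu].nodenumber to cpu_pda(cpu)->nodenumber.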
include/asm-x86_64/pda.h
1 #ifndef X86_64_PDA_H 1 #ifndef X86_64_PDA_H
2 #define X86_64_PDA_H 2 #define X86_64_PDA_H
3 3
4 #ifndef __ASSEMBLY__ 4 #ifndef __ASSEMBLY__
5 #include <linux/stddef.h> 5 #include <linux/stddef.h>
6 #include <linux/types.h> 6 #include <linux/types.h>
7 #include <linux/cache.h> 7 #include <linux/cache.h>
8 #include <asm/page.h> 8 #include <asm/page.h>
9 9
10 /* Per processor datastructure. %gs points to it while the kernel runs */ 10 /* Per processor datastructure. %gs points to it while the kernel runs */
11 struct x8664_pda { 11 struct x8664_pda {
12 struct task_struct *pcurrent; /* Current process */ 12 struct task_struct *pcurrent; /* Current process */
13 unsigned long data_offset; /* Per cpu data offset from linker address */ 13 unsigned long data_offset; /* Per cpu data offset from linker address */
14 unsigned long kernelstack; /* top of kernel stack for current */ 14 unsigned long kernelstack; /* top of kernel stack for current */
15 unsigned long oldrsp; /* user rsp for system call */ 15 unsigned long oldrsp; /* user rsp for system call */
16 #if DEBUG_STKSZ > EXCEPTION_STKSZ 16 #if DEBUG_STKSZ > EXCEPTION_STKSZ
17 unsigned long debugstack; /* #DB/#BP stack. */ 17 unsigned long debugstack; /* #DB/#BP stack. */
18 #endif 18 #endif
19 int irqcount; /* Irq nesting counter. Starts with -1 */ 19 int irqcount; /* Irq nesting counter. Starts with -1 */
20 int cpunumber; /* Logical CPU number */ 20 int cpunumber; /* Logical CPU number */
21 char *irqstackptr; /* top of irqstack */ 21 char *irqstackptr; /* top of irqstack */
22 int nodenumber; /* number of current node */ 22 int nodenumber; /* number of current node */
23 unsigned int __softirq_pending; 23 unsigned int __softirq_pending;
24 unsigned int __nmi_count; /* number of NMI on this CPUs */ 24 unsigned int __nmi_count; /* number of NMI on this CPUs */
25 struct mm_struct *active_mm; 25 struct mm_struct *active_mm;
26 int mmu_state; 26 int mmu_state;
27 unsigned apic_timer_irqs; 27 unsigned apic_timer_irqs;
28 } ____cacheline_aligned_in_smp; 28 } ____cacheline_aligned_in_smp;
29 29
30 extern struct x8664_pda cpu_pda[]; 30 extern struct x8664_pda _cpu_pda[];
31
32 #define cpu_pda(i) (&_cpu_pda[i])
31 33
32 /* 34 /*
33 * There is no fast way to get the base address of the PDA, all the accesses 35 * There is no fast way to get the base address of the PDA, all the accesses
34 * have to mention %fs/%gs. So it needs to be done this Torvaldian way. 36 * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
35 */ 37 */
36 #define sizeof_field(type,field) (sizeof(((type *)0)->field)) 38 #define sizeof_field(type,field) (sizeof(((type *)0)->field))
37 #define typeof_field(type,field) typeof(((type *)0)->field) 39 #define typeof_field(type,field) typeof(((type *)0)->field)
38 40
39 extern void __bad_pda_field(void); 41 extern void __bad_pda_field(void);
40 42
41 #define pda_offset(field) offsetof(struct x8664_pda, field) 43 #define pda_offset(field) offsetof(struct x8664_pda, field)
42 44
43 #define pda_to_op(op,field,val) do { \ 45 #define pda_to_op(op,field,val) do { \
44 typedef typeof_field(struct x8664_pda, field) T__; \ 46 typedef typeof_field(struct x8664_pda, field) T__; \
45 switch (sizeof_field(struct x8664_pda, field)) { \ 47 switch (sizeof_field(struct x8664_pda, field)) { \
46 case 2: \ 48 case 2: \
47 asm volatile(op "w %0,%%gs:%P1"::"ri" ((T__)val),"i"(pda_offset(field)):"memory"); break; \ 49 asm volatile(op "w %0,%%gs:%P1"::"ri" ((T__)val),"i"(pda_offset(field)):"memory"); break; \
48 case 4: \ 50 case 4: \
49 asm volatile(op "l %0,%%gs:%P1"::"ri" ((T__)val),"i"(pda_offset(field)):"memory"); break; \ 51 asm volatile(op "l %0,%%gs:%P1"::"ri" ((T__)val),"i"(pda_offset(field)):"memory"); break; \
50 case 8: \ 52 case 8: \
51 asm volatile(op "q %0,%%gs:%P1"::"ri" ((T__)val),"i"(pda_offset(field)):"memory"); break; \ 53 asm volatile(op "q %0,%%gs:%P1"::"ri" ((T__)val),"i"(pda_offset(field)):"memory"); break; \
52 default: __bad_pda_field(); \ 54 default: __bad_pda_field(); \
53 } \ 55 } \
54 } while (0) 56 } while (0)
55 57
56 /* 58 /*
57 * AK: PDA read accesses should be neither volatile nor have a memory clobber. 59 * AK: PDA read accesses should be neither volatile nor have a memory clobber.
58 * Unfortunately removing them causes all hell to break loose currently. 60 * Unfortunately removing them causes all hell to break loose currently.
59 */ 61 */
60 #define pda_from_op(op,field) ({ \ 62 #define pda_from_op(op,field) ({ \
61 typeof_field(struct x8664_pda, field) ret__; \ 63 typeof_field(struct x8664_pda, field) ret__; \
62 switch (sizeof_field(struct x8664_pda, field)) { \ 64 switch (sizeof_field(struct x8664_pda, field)) { \
63 case 2: \ 65 case 2: \
64 asm volatile(op "w %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\ 66 asm volatile(op "w %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\
65 case 4: \ 67 case 4: \
66 asm volatile(op "l %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\ 68 asm volatile(op "l %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\
67 case 8: \ 69 case 8: \
68 asm volatile(op "q %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\ 70 asm volatile(op "q %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\
69 default: __bad_pda_field(); \ 71 default: __bad_pda_field(); \
70 } \ 72 } \
71 ret__; }) 73 ret__; })
72 74
73 75
74 #define read_pda(field) pda_from_op("mov",field) 76 #define read_pda(field) pda_from_op("mov",field)
75 #define write_pda(field,val) pda_to_op("mov",field,val) 77 #define write_pda(field,val) pda_to_op("mov",field,val)
76 #define add_pda(field,val) pda_to_op("add",field,val) 78 #define add_pda(field,val) pda_to_op("add",field,val)
77 #define sub_pda(field,val) pda_to_op("sub",field,val) 79 #define sub_pda(field,val) pda_to_op("sub",field,val)
78 #define or_pda(field,val) pda_to_op("or",field,val) 80 #define or_pda(field,val) pda_to_op("or",field,val)
79 81
80 #endif 82 #endif
81 83
82 #define PDA_STACKOFFSET (5*8) 84 #define PDA_STACKOFFSET (5*8)
83 85
84 #endif 86 #endif
85 87
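The pda.h hunk above is the heart of the change: the array gains a leading underscore and every indexed access goes through the new cpu_pda(i) accessor, while the %gs-relative read_pda()/write_pda() paths for the current CPU are untouched. A minimal sketch contrasting the two access styles; set_node_for_cpu() is an invented caller name, not a function in this patch:

#include <asm/pda.h>

static void set_node_for_cpu(int cpu, int node)
{
	/* any CPU: indexed access, now spelled via the accessor macro */
	cpu_pda(cpu)->nodenumber = node;	/* was: cpu_pda[cpu].nodenumber */

	/* current CPU: %gs-relative access, unchanged by this patch */
	if (cpu == read_pda(cpunumber))
		write_pda(nodenumber, node);
}

Because callers only ever see cpu_pda(i), the definition behind _cpu_pda can later change shape without touching them again, which is what makes this a preparation step.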
include/asm-x86_64/percpu.h
1 #ifndef _ASM_X8664_PERCPU_H_ 1 #ifndef _ASM_X8664_PERCPU_H_
2 #define _ASM_X8664_PERCPU_H_ 2 #define _ASM_X8664_PERCPU_H_
3 #include <linux/compiler.h> 3 #include <linux/compiler.h>
4 4
5 /* Same as asm-generic/percpu.h, except that we store the per cpu offset 5 /* Same as asm-generic/percpu.h, except that we store the per cpu offset
6 in the PDA. Longer term the PDA and every per cpu variable 6 in the PDA. Longer term the PDA and every per cpu variable
7 should be just put into a single section and referenced directly 7 should be just put into a single section and referenced directly
8 from %gs */ 8 from %gs */
9 9
10 #ifdef CONFIG_SMP 10 #ifdef CONFIG_SMP
11 11
12 #include <asm/pda.h> 12 #include <asm/pda.h>
13 13
14 #define __per_cpu_offset(cpu) (cpu_pda[cpu].data_offset) 14 #define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
15 #define __my_cpu_offset() read_pda(data_offset) 15 #define __my_cpu_offset() read_pda(data_offset)
16 16
17 /* Separate out the type, so (int[3], foo) works. */ 17 /* Separate out the type, so (int[3], foo) works. */
18 #define DEFINE_PER_CPU(type, name) \ 18 #define DEFINE_PER_CPU(type, name) \
19 __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name 19 __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
20 20
21 /* var is in discarded region: offset to particular copy we want */ 21 /* var is in discarded region: offset to particular copy we want */
22 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) 22 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
23 #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) 23 #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
24 24
25 /* A macro to avoid #include hell... */ 25 /* A macro to avoid #include hell... */
26 #define percpu_modcopy(pcpudst, src, size) \ 26 #define percpu_modcopy(pcpudst, src, size) \
27 do { \ 27 do { \
28 unsigned int __i; \ 28 unsigned int __i; \
29 for (__i = 0; __i < NR_CPUS; __i++) \ 29 for (__i = 0; __i < NR_CPUS; __i++) \
30 if (cpu_possible(__i)) \ 30 if (cpu_possible(__i)) \
31 memcpy((pcpudst)+__per_cpu_offset(__i), \ 31 memcpy((pcpudst)+__per_cpu_offset(__i), \
32 (src), (size)); \ 32 (src), (size)); \
33 } while (0) 33 } while (0)
34 34
35 extern void setup_per_cpu_areas(void); 35 extern void setup_per_cpu_areas(void);
36 36
37 #else /* ! SMP */ 37 #else /* ! SMP */
38 38
39 #define DEFINE_PER_CPU(type, name) \ 39 #define DEFINE_PER_CPU(type, name) \
40 __typeof__(type) per_cpu__##name 40 __typeof__(type) per_cpu__##name
41 41
42 #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) 42 #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var))
43 #define __get_cpu_var(var) per_cpu__##var 43 #define __get_cpu_var(var) per_cpu__##var
44 44
45 #endif /* SMP */ 45 #endif /* SMP */
46 46
47 #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name 47 #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
48 48
49 #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) 49 #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
50 #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) 50 #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
51 51
52 #endif /* _ASM_X8664_PERCPU_H_ */ 52 #endif /* _ASM_X8664_PERCPU_H_ */
53 53
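percpu.h above keeps routing every per-CPU lookup through the PDA's data_offset field, so the accessor change propagates straight into per_cpu(). A rough sketch of the expansion chain for a hypothetical counter; the variable name hits and the helper touch() are invented for illustration:

#include <linux/percpu.h>
#include <asm/pda.h>

DEFINE_PER_CPU(int, hits);		/* hypothetical per-CPU counter */

static void touch(int cpu)
{
	/* remote CPU: per_cpu(hits, cpu)
	 *   -> *RELOC_HIDE(&per_cpu__hits, __per_cpu_offset(cpu))
	 *   -> offset fetched as cpu_pda(cpu)->data_offset   (this patch)
	 */
	per_cpu(hits, cpu)++;

	/* current CPU: __get_cpu_var(hits) uses read_pda(data_offset),
	 * i.e. a %gs-relative load with no array indexing at all.
	 */
	__get_cpu_var(hits)++;
}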