Commit 5033cba087f6ac773002123aafbea1aad4267682
Committed by
Linus Torvalds
1 parent
dd2a13054f
Exists in
master
and in
7 other branches
[PATCH] kexec: x86 kexec core
This is the i386 implementation of kexec. Signed-off-by: Eric Biederman <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Showing 7 changed files with 429 additions and 1 deletions Side-by-side Diff
arch/i386/Kconfig
... | ... | @@ -953,6 +953,23 @@ |
953 | 953 | |
954 | 954 | Don't change this unless you know what you are doing. |
955 | 955 | |
956 | +config KEXEC | |
957 | + bool "kexec system call (EXPERIMENTAL)" | |
958 | + depends on EXPERIMENTAL | |
959 | + help | |
960 | + kexec is a system call that implements the ability to shut down your | |
961 | + current kernel, and to start another kernel. It is like a reboot | |
962 | + but it is independent of the system firmware. And like a reboot | |
963 | + you can start any kernel with it, not just Linux. | |
964 | + | |
965 | + The name comes from the similarity to the exec system call. | |
966 | + | |
967 | + It is an ongoing process to be certain the hardware in a machine | |
968 | + is properly shut down, so do not be surprised if this code does not | |
969 | + initially work for you. It may help to enable device hotplugging | |
970 | + support. As of this writing the exact hardware interface is | |
971 | + strongly in flux, so no good recommendation can be made. | |
972 | + | |
956 | 973 | endmenu |
957 | 974 | |
958 | 975 |
arch/i386/kernel/Makefile
... | ... | @@ -24,6 +24,7 @@ |
24 | 24 | obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o |
25 | 25 | obj-$(CONFIG_X86_IO_APIC) += io_apic.o |
26 | 26 | obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o |
27 | +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o | |
27 | 28 | obj-$(CONFIG_X86_NUMAQ) += numaq.o |
28 | 29 | obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o |
29 | 30 | obj-$(CONFIG_KPROBES) += kprobes.o |
arch/i386/kernel/crash.c
1 | +/* | |
2 | + * Architecture specific (i386) functions for kexec based crash dumps. | |
3 | + * | |
4 | + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) | |
5 | + * | |
6 | + * Copyright (C) IBM Corporation, 2004. All rights reserved. | |
7 | + * | |
8 | + */ | |
9 | + | |
10 | +#include <linux/init.h> | |
11 | +#include <linux/types.h> | |
12 | +#include <linux/kernel.h> | |
13 | +#include <linux/smp.h> | |
14 | +#include <linux/irq.h> | |
15 | +#include <linux/reboot.h> | |
16 | +#include <linux/kexec.h> | |
17 | +#include <linux/irq.h> | |
18 | +#include <linux/delay.h> | |
19 | +#include <linux/elf.h> | |
20 | +#include <linux/elfcore.h> | |
21 | + | |
22 | +#include <asm/processor.h> | |
23 | +#include <asm/hardirq.h> | |
24 | +#include <asm/nmi.h> | |
25 | +#include <asm/hw_irq.h> | |
26 | + | |
27 | +#define MAX_NOTE_BYTES 1024 | |
28 | +typedef u32 note_buf_t[MAX_NOTE_BYTES/4]; | |
29 | + | |
30 | +note_buf_t crash_notes[NR_CPUS]; | |
31 | + | |
32 | +void machine_crash_shutdown(void) | |
33 | +{ | |
34 | + /* This function is only called after the system | |
35 | + * has panicked or is otherwise in a critical state. | |
36 | + * The minimum amount of code to allow a kexec'd kernel | |
37 | + * to run successfully needs to happen here. | |
38 | + * | |
39 | + * In practice this means shooting down the other cpus in | |
40 | + * an SMP system; the body is still empty, so nothing is done yet. | |
41 | + */ | |
42 | +} |
arch/i386/kernel/machine_kexec.c
1 | +/* | |
2 | + * machine_kexec.c - handle transition of Linux booting another kernel | |
3 | + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> | |
4 | + * | |
5 | + * This source code is licensed under the GNU General Public License, | |
6 | + * Version 2. See the file COPYING for more details. | |
7 | + */ | |
8 | + | |
9 | +#include <linux/mm.h> | |
10 | +#include <linux/kexec.h> | |
11 | +#include <linux/delay.h> | |
12 | +#include <asm/pgtable.h> | |
13 | +#include <asm/pgalloc.h> | |
14 | +#include <asm/tlbflush.h> | |
15 | +#include <asm/mmu_context.h> | |
16 | +#include <asm/io.h> | |
17 | +#include <asm/apic.h> | |
18 | +#include <asm/cpufeature.h> | |
19 | +/* Return the current value of the %cr3 control register. */ | |
20 | +static inline unsigned long read_cr3(void) | |
21 | +{ | |
22 | + unsigned long cr3; | |
23 | + asm volatile("movl %%cr3,%0": "=r"(cr3)); | |
24 | + return cr3; | |
25 | +} | |
26 | + | |
27 | +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) | |
28 | + | |
29 | +#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | |
30 | +#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | |
31 | +#define L2_ATTR (_PAGE_PRESENT) | |
32 | + | |
33 | +#define LEVEL0_SIZE (1UL << 12UL) | |
34 | + | |
35 | +#ifndef CONFIG_X86_PAE | |
36 | +#define LEVEL1_SIZE (1UL << 22UL) | |
37 | +static u32 pgtable_level1[1024] PAGE_ALIGNED; | |
38 | +/* Non-PAE: map virtual `address` onto physical `address` in the table cr3 points at. */ | |
39 | +static void identity_map_page(unsigned long address) | |
40 | +{ | |
41 | + unsigned long level1_index, level2_index; | |
42 | + u32 *pgtable_level2; | |
43 | + | |
44 | + /* Find the current top-level table: the cr3 value converted with __va() */ | |
45 | + pgtable_level2 = __va(read_cr3()); | |
46 | + | |
47 | + /* Find the indexes of the physical address to identity map */ | |
48 | + level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; | |
49 | + level2_index = address / LEVEL1_SIZE; | |
50 | + | |
51 | + /* Identity map the page; the level2 slot is overwritten to point at the static pgtable_level1 */ | |
52 | + pgtable_level1[level1_index] = address | L0_ATTR; | |
53 | + pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; | |
54 | + | |
55 | + /* Flush the tlb so the new mapping takes effect. | |
56 | + * Global tlb entries are not flushed but that is not an issue. | |
57 | + */ | |
58 | + load_cr3(pgtable_level2); | |
59 | +} | |
60 | + | |
61 | +#else | |
62 | +#define LEVEL1_SIZE (1UL << 21UL) | |
63 | +#define LEVEL2_SIZE (1UL << 30UL) | |
64 | +static u64 pgtable_level1[512] PAGE_ALIGNED; | |
65 | +static u64 pgtable_level2[512] PAGE_ALIGNED; | |
66 | +/* PAE: map virtual `address` onto physical `address` via the static level1/level2 tables. */ | |
67 | +static void identity_map_page(unsigned long address) | |
68 | +{ | |
69 | + unsigned long level1_index, level2_index, level3_index; | |
70 | + u64 *pgtable_level3; | |
71 | + | |
72 | + /* Find the current top-level table: the cr3 value converted with __va() */ | |
73 | + pgtable_level3 = __va(read_cr3()); | |
74 | + | |
75 | + /* Find the indexes of the physical address to identity map */ | |
76 | + level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; | |
77 | + level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; | |
78 | + level3_index = address / LEVEL2_SIZE; | |
79 | + | |
80 | + /* Identity map the page; the level3 slot is overwritten to point at the static pgtable_level2 */ | |
81 | + pgtable_level1[level1_index] = address | L0_ATTR; | |
82 | + pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; | |
83 | + set_64bit(&pgtable_level3[level3_index], __pa(pgtable_level2) | L2_ATTR); | |
84 | + | |
85 | + /* Flush the tlb so the new mapping takes effect. | |
86 | + * Global tlb entries are not flushed but that is not an issue. | |
87 | + */ | |
88 | + load_cr3(pgtable_level3); | |
89 | +} | |
90 | +#endif | |
91 | + | |
92 | +/* Load the IDT register from a 6-byte descriptor: 16-bit limit, then 32-bit base newidt. */ | |
93 | +static void set_idt(void *newidt, __u16 limit) | |
94 | +{ | |
95 | + unsigned char curidt[6]; | |
96 | + | |
97 | + /* ia32 supports unaligned loads & stores */ | |
98 | + (*(__u16 *)(curidt)) = limit; | |
99 | + (*(__u32 *)(curidt +2)) = (unsigned long)(newidt); | |
100 | + | |
101 | + __asm__ __volatile__ ( | |
102 | + "lidt %0\n" | |
103 | + : /* no outputs: lidt only reads the descriptor */ : "m" (curidt) | |
104 | + ); | |
105 | +} | |
106 | + | |
107 | +/* Load the GDT register from a 6-byte descriptor: 16-bit limit, then 32-bit base newgdt. */ | |
108 | +static void set_gdt(void *newgdt, __u16 limit) | |
109 | +{ | |
110 | + unsigned char curgdt[6]; | |
111 | + | |
112 | + /* ia32 supports unaligned loads & stores */ | |
113 | + (*(__u16 *)(curgdt)) = limit; | |
114 | + (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt); | |
115 | + | |
116 | + __asm__ __volatile__ ( | |
117 | + "lgdt %0\n" | |
118 | + : /* no outputs: lgdt only reads the descriptor */ : "m" (curgdt) | |
119 | + ); | |
120 | +} | |
121 | + | |
122 | +static void load_segments(void) | |
123 | +{ | |
124 | +#define __STR(X) #X | |
125 | +#define STR(X) __STR(X) | |
126 | + /* Reload %cs with a far jump, then %ds/%es/%fs/%gs/%ss; %eax is declared clobbered. */ | |
127 | + __asm__ __volatile__ ( | |
128 | + "\tljmp $"STR(__KERNEL_CS)",$1f\n" | |
129 | + "\t1:\n" | |
130 | + "\tmovl $"STR(__KERNEL_DS)",%%eax\n" | |
131 | + "\tmovl %%eax,%%ds\n" | |
132 | + "\tmovl %%eax,%%es\n" | |
133 | + "\tmovl %%eax,%%fs\n" | |
134 | + "\tmovl %%eax,%%gs\n" | |
135 | + "\tmovl %%eax,%%ss\n" | |
136 | + ::: "eax", "memory"); | |
137 | +#undef STR | |
138 | +#undef __STR | |
139 | +} | |
140 | + | |
141 | +typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( | |
142 | + unsigned long indirection_page, unsigned long reboot_code_buffer, | |
143 | + unsigned long start_address, unsigned int has_pae) ATTRIB_NORET; | |
144 | +/* Symbols defined in relocate_kernel.S: the relocation routine, its end label, its byte size. */ | |
145 | +extern const unsigned char relocate_new_kernel[]; | |
146 | +extern void relocate_new_kernel_end(void); | |
147 | +extern const unsigned int relocate_new_kernel_size; | |
148 | + | |
149 | +/* | |
150 | + * An architecture hook called to validate the | |
151 | + * proposed image and prepare the control pages | |
152 | + * as needed. The pages for KEXEC_CONTROL_CODE_SIZE | |
153 | + * have been allocated, but the segments have not yet | |
154 | + * been copied into the kernel. | |
155 | + * | |
156 | + * Do whatever setup is needed on image and the | |
157 | + * reboot code buffer to allow us to avoid allocations | |
158 | + * later. | |
159 | + * | |
160 | + * Currently nothing: always returns 0. | |
161 | + */ | |
162 | +int machine_kexec_prepare(struct kimage *image) | |
163 | +{ | |
164 | + return 0; | |
165 | +} | |
166 | + | |
167 | +/* | |
168 | + * Undo anything left over by machine_kexec_prepare | |
169 | + * when an image is freed. Currently nothing. | |
170 | + */ | |
171 | +void machine_kexec_cleanup(struct kimage *image) | |
172 | +{ | |
173 | +} | |
174 | + | |
175 | +/* | |
176 | + * Do not allocate memory (or fail in any way) in machine_kexec(). | |
177 | + * We are past the point of no return, committed to rebooting now. | |
178 | + */ | |
179 | +NORET_TYPE void machine_kexec(struct kimage *image) | |
180 | +{ | |
181 | + unsigned long page_list; | |
182 | + unsigned long reboot_code_buffer; | |
183 | + relocate_new_kernel_t rnk; | |
184 | + | |
185 | + /* Interrupts aren't acceptable while we reboot */ | |
186 | + local_irq_disable(); | |
187 | + | |
188 | + /* Physical address of the control code page, and the head of the image's page list */ | |
189 | + reboot_code_buffer = page_to_pfn(image->control_code_page) << PAGE_SHIFT; | |
190 | + page_list = image->head; | |
191 | + | |
192 | + /* Set up an identity mapping for the reboot_code_buffer */ | |
193 | + identity_map_page(reboot_code_buffer); | |
194 | + | |
195 | + /* Copy relocate_new_kernel into the buffer through the identity mapping */ | |
196 | + memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size); | |
197 | + | |
198 | + /* The segment registers are funny things, they are | |
199 | + * automatically loaded from a table in memory whenever you | |
200 | + * set them to a specific selector, but this table is never | |
201 | + * accessed again until you set the segment to a different selector. | |
202 | + * | |
203 | + * The more common model is a cache, where the behind | |
204 | + * the scenes work is done, but is also dropped at arbitrary | |
205 | + * times. | |
206 | + * | |
207 | + * I take advantage of this here by force loading the | |
208 | + * segments, before I zap the gdt with an invalid value. | |
209 | + */ | |
210 | + load_segments(); | |
211 | + /* Invalidate the gdt & idt (base phys_to_virt(0), limit 0). | |
212 | + * If you want to load them you must set up your own idt & gdt. | |
213 | + */ | |
214 | + set_gdt(phys_to_virt(0),0); | |
215 | + set_idt(phys_to_virt(0),0); | |
216 | + | |
217 | + /* Call the copied code at its identity-mapped address */ | |
218 | + rnk = (relocate_new_kernel_t) reboot_code_buffer; | |
219 | + (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae); | |
220 | +} |
arch/i386/kernel/relocate_kernel.S
1 | +/* | |
2 | + * relocate_kernel.S - put the kernel image in place to boot | |
3 | + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> | |
4 | + * | |
5 | + * This source code is licensed under the GNU General Public License, | |
6 | + * Version 2. See the file COPYING for more details. | |
7 | + */ | |
8 | + | |
9 | +#include <linux/linkage.h> | |
10 | + | |
11 | + /* | |
12 | + * Must be relocatable PIC code callable as a C function, that once | |
13 | + * it starts can not use the previous process's stack. | |
14 | + */ | |
15 | + .globl relocate_new_kernel | |
16 | +relocate_new_kernel: | |
17 | + /* read the arguments and say goodbye to the stack */ | |
18 | + movl 4(%esp), %ebx /* page_list */ | |
19 | + movl 8(%esp), %ebp /* reboot_code_buffer */ | |
20 | + movl 12(%esp), %edx /* start address */ | |
21 | + movl 16(%esp), %ecx /* cpu_has_pae */ | |
22 | + | |
23 | + /* zero out flags, and disable interrupts */ | |
24 | + pushl $0 | |
25 | + popfl | |
26 | + | |
27 | + /* set a new stack at the end of our page (reboot_code_buffer + 4096) */ | |
28 | + lea 4096(%ebp), %esp | |
29 | + | |
30 | + /* store the parameters back on the stack */ | |
31 | + pushl %edx /* store the start address */ | |
32 | + | |
33 | + /* Set cr0 to a known state: | |
34 | + * 31 0 == Paging disabled | |
35 | + * 18 0 == Alignment check disabled | |
36 | + * 16 0 == Write protect disabled | |
37 | + * 3 0 == No task switch | |
38 | + * 2 0 == Don't do FP software emulation. | |
39 | + * 0 1 == Protected mode enabled | |
40 | + */ | |
41 | + movl %cr0, %eax | |
42 | + andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax | |
43 | + orl $(1<<0), %eax | |
44 | + movl %eax, %cr0 | |
45 | + | |
46 | + /* clear cr4 if applicable (skipped when the cpu_has_pae argument is zero) */ | |
47 | + testl %ecx, %ecx | |
48 | + jz 1f | |
49 | + /* Set cr4 to a known state: | |
50 | + * Setting everything to zero seems safe. | |
51 | + */ | |
52 | + movl %cr4, %eax | |
53 | + andl $0, %eax | |
54 | + movl %eax, %cr4 | |
55 | + | |
56 | + jmp 1f | |
57 | +1: | |
58 | + | |
59 | + /* Flush the TLB (needed?) */ | |
60 | + xorl %eax, %eax | |
61 | + movl %eax, %cr3 | |
62 | + | |
63 | + /* Do the copies, starting by decoding the page_list value itself */ | |
64 | + movl %ebx, %ecx | |
65 | + jmp 1f | |
66 | + | |
67 | +0: /* top, read another word from the indirection page */ | |
68 | + movl (%ebx), %ecx | |
69 | + addl $4, %ebx | |
70 | +1: | |
71 | + testl $0x1, %ecx /* is it a destination page */ | |
72 | + jz 2f | |
73 | + movl %ecx, %edi | |
74 | + andl $0xfffff000, %edi | |
75 | + jmp 0b | |
76 | +2: | |
77 | + testl $0x2, %ecx /* is it an indirection page */ | |
78 | + jz 2f | |
79 | + movl %ecx, %ebx | |
80 | + andl $0xfffff000, %ebx | |
81 | + jmp 0b | |
82 | +2: | |
83 | + testl $0x4, %ecx /* is it the done indicator */ | |
84 | + jz 2f | |
85 | + jmp 3f | |
86 | +2: | |
87 | + testl $0x8, %ecx /* is it the source indicator */ | |
88 | + jz 0b /* Ignore it otherwise */ | |
89 | + movl %ecx, %esi /* For every source page do a copy */ | |
90 | + andl $0xfffff000, %esi | |
91 | + | |
92 | + movl $1024, %ecx | |
93 | + rep ; movsl | |
94 | + jmp 0b | |
95 | + | |
96 | +3: | |
97 | + | |
98 | + /* To be certain of avoiding problems with self-modifying code | |
99 | + * I need to execute a serializing instruction here. | |
100 | + * So I flush the TLB, it's handy, and not processor dependent. | |
101 | + */ | |
102 | + xorl %eax, %eax | |
103 | + movl %eax, %cr3 | |
104 | + | |
105 | + /* set all of the registers to known values */ | |
106 | + /* leave %esp alone */ | |
107 | + | |
108 | + xorl %eax, %eax | |
109 | + xorl %ebx, %ebx | |
110 | + xorl %ecx, %ecx | |
111 | + xorl %edx, %edx | |
112 | + xorl %esi, %esi | |
113 | + xorl %edi, %edi | |
114 | + xorl %ebp, %ebp | |
115 | + ret | |
116 | +relocate_new_kernel_end: | |
117 | + | |
118 | + .globl relocate_new_kernel_size | |
119 | +relocate_new_kernel_size: | |
120 | + .long relocate_new_kernel_end - relocate_new_kernel |
arch/i386/kernel/syscall_table.S
... | ... | @@ -283,7 +283,7 @@ |
283 | 283 | .long sys_mq_timedreceive /* 280 */ |
284 | 284 | .long sys_mq_notify |
285 | 285 | .long sys_mq_getsetattr |
286 | - .long sys_ni_syscall /* reserved for kexec */ | |
286 | + .long sys_kexec_load | |
287 | 287 | .long sys_waitid |
288 | 288 | .long sys_ni_syscall /* 285 */ /* available */ |
289 | 289 | .long sys_add_key |
include/asm-i386/kexec.h
1 | +#ifndef _I386_KEXEC_H | |
2 | +#define _I386_KEXEC_H | |
3 | + | |
4 | +#include <asm/fixmap.h> | |
5 | + | |
6 | +/* | |
7 | + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. | |
8 | + * I.e. Maximum page that is mapped directly into kernel memory, | |
9 | + * and kmap is not required. | |
10 | + * | |
11 | + * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct | |
12 | + * calculation for the amount of memory directly mappable into the | |
13 | + * kernel memory space. | |
14 | + */ | |
15 | + | |
16 | +/* Maximum physical address we can use pages from */ | |
17 | +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) | |
18 | +/* Maximum address we can reach in physical address mode */ | |
19 | +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) | |
20 | +/* Maximum address we can use for the control code buffer */ | |
21 | +#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE | |
22 | + | |
23 | +#define KEXEC_CONTROL_CODE_SIZE 4096 | |
24 | + | |
25 | +/* The native architecture */ | |
26 | +#define KEXEC_ARCH KEXEC_ARCH_386 | |
27 | + | |
28 | +#endif /* _I386_KEXEC_H */ |