Commit f8f0fdcd40449d318f8dc30c1b361b0b7f54134a

Authored by Rusty Russell
Committed by Linus Torvalds
1 parent bff672e630

lguest: documentation VI: Switcher

Documentation: The Switcher

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 276 additions and 46 deletions Side-by-side Diff

drivers/lguest/core.c
... ... @@ -393,46 +393,89 @@
393 393 write_cr0(cr0|8);
394 394 }
395 395  
  396 +/*S:010
  397 + * We are getting close to the Switcher.
  398 + *
  399 + * Remember that each CPU has two pages which are visible to the Guest when it
  400 + * runs on that CPU. These have to contain the state for that Guest: we copy the
  401 + * state in just before we run the Guest.
  402 + *
  403 + * Each Guest has "changed" flags which indicate what has changed in the Guest
  404 + * since it last ran. We saw this set in interrupts_and_traps.c and
  405 + * segments.c.
  406 + */
396 407 static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
397 408 {
  409 + /* Copying all this data can be quite expensive. We usually run the
  410 + * same Guest we ran last time (and that Guest hasn't run anywhere else
  411 + * meanwhile). If that's not the case, we pretend everything in the
  412 + * Guest has changed. */
398 413 if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
399 414 __get_cpu_var(last_guest) = lg;
400 415 lg->last_pages = pages;
401 416 lg->changed = CHANGED_ALL;
402 417 }
403 418  
404   - /* These are pretty cheap, so we do them unconditionally. */
  419 + /* These copies are pretty cheap, so we do them unconditionally: */
  420 + /* Save the current Host top-level page directory. */
405 421 pages->state.host_cr3 = __pa(current->mm->pgd);
  422 + /* Set up the Guest's page tables to see this CPU's pages (and no
  423 + * other CPU's pages). */
406 424 map_switcher_in_guest(lg, pages);
  425 + /* Set up the two "TSS" members which tell the CPU what stack to use
  426 + * for traps which go directly into the Guest (ie. traps at privilege
  427 + * level 1). */
407 428 pages->state.guest_tss.esp1 = lg->esp1;
408 429 pages->state.guest_tss.ss1 = lg->ss1;
409 430  
410   - /* Copy direct trap entries. */
  431 + /* Copy direct-to-Guest trap entries. */
411 432 if (lg->changed & CHANGED_IDT)
412 433 copy_traps(lg, pages->state.guest_idt, default_idt_entries);
413 434  
414   - /* Copy all GDT entries but the TSS. */
  435 + /* Copy all GDT entries which the Guest can change. */
415 436 if (lg->changed & CHANGED_GDT)
416 437 copy_gdt(lg, pages->state.guest_gdt);
417 438 /* If only the TLS entries have changed, copy them. */
418 439 else if (lg->changed & CHANGED_GDT_TLS)
419 440 copy_gdt_tls(lg, pages->state.guest_gdt);
420 441  
  442 + /* Mark the Guest as unchanged for next time. */
421 443 lg->changed = 0;
422 444 }
423 445  
  446 +/* Finally: the code to actually call into the Switcher to run the Guest. */
424 447 static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
425 448 {
  449 + /* This is a dummy value we need for GCC's sake. */
426 450 unsigned int clobber;
427 451  
  452 + /* Copy the guest-specific information into this CPU's "struct
  453 + * lguest_pages". */
428 454 copy_in_guest_info(lg, pages);
429 455  
430   - /* Put eflags on stack, lcall does rest: suitable for iret return. */
  456 + /* Now: we push the "eflags" register on the stack, then do an "lcall".
  457 + * This is how we change from using the kernel code segment to using
  458 + * the dedicated lguest code segment, as well as jumping into the
  459 + * Switcher.
  460 + *
  461 + * The lcall also pushes the old code segment (KERNEL_CS) onto the
  462 + * stack, then the address of this call. This stack layout happens to
  463 + * exactly match the stack of an interrupt... */
431 464 asm volatile("pushf; lcall *lguest_entry"
  465 + /* This is how we tell GCC that %eax ("a") and %ebx ("b")
  466 + * are changed by this routine. The "=" means output. */
432 467 : "=a"(clobber), "=b"(clobber)
  468 + /* %eax contains the pages pointer. ("0" refers to the
  469 + * 0-th argument above, ie "a"). %ebx contains the
  470 + * physical address of the Guest's top-level page
  471 + * directory. */
433 472 : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
  473 + /* We tell gcc that all these registers could change,
  474 + * which means we don't have to save and restore them in
  475 + * the Switcher. */
434 476 : "memory", "%edx", "%ecx", "%edi", "%esi");
435 477 }
  478 +/*:*/
436 479  
437 480 /*H:030 Let's jump straight to the main loop which runs the Guest.
438 481 * Remember, this is called by the Launcher reading /dev/lguest, and we keep
drivers/lguest/switcher.S
... ... @@ -6,41 +6,131 @@
6 6 * are feeling invigorated and refreshed then the next, more challenging stage
7 7 * can be found in "make Guest". :*/
8 8  
  9 +/*S:100
  10 + * Welcome to the Switcher itself!
  11 + *
  12 + * This file contains the low-level code which changes the CPU to run the Guest
  13 + * code, and returns to the Host when something happens. Understand this, and
  14 + * you understand the heart of our journey.
  15 + *
  16 + * Because this is in assembler rather than C, our tale switches from prose to
  17 + * verse. First I tried limericks:
  18 + *
  19 + * There once was an eax reg,
  20 + * To which our pointer was fed,
  21 + * It needed an add,
  22 + * Which asm-offsets.h had
  23 + * But this limerick is hurting my head.
  24 + *
  25 + * Next I tried haikus, but fitting the required reference to the seasons in
  26 + * every stanza was quickly becoming tiresome:
  27 + *
  28 + * The %eax reg
  29 + * Holds "struct lguest_pages" now:
  30 + * Cherry blossoms fall.
  31 + *
  32 + * Then I started with Heroic Verse, but the rhyming requirement leeched away
  33 + * the content density and led to some uniquely awful oblique rhymes:
  34 + *
  35 + * These constants are coming from struct offsets
  36 + * For use within the asm switcher text.
  37 + *
  38 + * Finally, I settled for something between heroic hexameter, and normal prose
  39 + * with inappropriate linebreaks. Anyway, it aint no Shakespeare.
  40 + */
  41 +
  42 +// Not all kernel headers work from assembler
  43 +// But these ones are needed: the ENTRY() define
  44 +// And constants extracted from struct offsets
  45 +// To avoid magic numbers and breakage:
  46 +// Should they change the compiler can't save us
  47 +// Down here in the depths of assembler code.
9 48 #include <linux/linkage.h>
10 49 #include <asm/asm-offsets.h>
11 50 #include "lg.h"
12 51  
  52 +// We mark the start of the code to copy
  53 +// It's placed in .text tho it's never run here
  54 +// You'll see the trick macro at the end
  55 +// Which interleaves data and text to effect.
13 56 .text
14 57 ENTRY(start_switcher_text)
15 58  
16   -/* %eax points to lguest pages for this CPU. %ebx contains cr3 value.
17   - All normal registers can be clobbered! */
  59 +// When we reach switch_to_guest we have just left
  60 +// The safe and comforting shores of C code
  61 +// %eax has the "struct lguest_pages" to use
  62 +// Where we save state and still see it from the Guest
  63 +// And %ebx holds the Guest shadow pagetable:
  64 +// Once set we have truly left Host behind.
18 65 ENTRY(switch_to_guest)
19   - /* Save host segments on host stack. */
  66 + // We told gcc all its regs could fade,
  67 + // Clobbered by our journey into the Guest
  68 + // We could have saved them, if we tried
  69 + // But time is our master and cycles count.
  70 +
  71 + // Segment registers must be saved for the Host
  72 + // We push them on the Host stack for later
20 73 pushl %es
21 74 pushl %ds
22 75 pushl %gs
23 76 pushl %fs
24   - /* With CONFIG_FRAME_POINTER, gcc doesn't let us clobber this! */
  77 + // But the compiler is fickle, and heeds
  78 + // No warning of %ebp clobbers
  79 + // When frame pointers are used. That register
  80 + // Must be saved and restored or chaos strikes.
25 81 pushl %ebp
26   - /* Save host stack. */
  82 + // The Host's stack is done, now save it away
  83 + // In our "struct lguest_pages" at offset
  84 + // Distilled into asm-offsets.h
27 85 movl %esp, LGUEST_PAGES_host_sp(%eax)
28   - /* Switch to guest stack: if we get NMI we expect to be there. */
  86 +
  87 + // All saved and there's now five steps before us:
  88 + // Stack, GDT, IDT, TSS
  89 + // And last of all the page tables are flipped.
  90 +
  91 + // Yet beware that our stack pointer must be
  92 + // Always valid lest an NMI hits
  93 + // %edx does the duty here as we juggle
  94 + // %eax is lguest_pages: our stack lies within.
29 95 movl %eax, %edx
30 96 addl $LGUEST_PAGES_regs, %edx
31 97 movl %edx, %esp
32   - /* Switch to guest's GDT, IDT. */
  98 +
  99 + // The Guest's GDT we so carefully
  100 + // Placed in the "struct lguest_pages" before
33 101 lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
  102 +
  103 + // The Guest's IDT we did partially
  104 + // Move to the "struct lguest_pages" as well.
34 105 lidt LGUEST_PAGES_guest_idt_desc(%eax)
35   - /* Switch to guest's TSS while GDT still writable. */
  106 +
  107 + // The TSS entry which controls traps
  108 + // Must be loaded up with "ltr" now:
  109 + // For after we switch over our page tables
  110 + // It (as the rest) will be writable no more.
  111 + // (The GDT entry TSS needs
  112 + // Changes type when we load it: damn Intel!)
36 113 movl $(GDT_ENTRY_TSS*8), %edx
37 114 ltr %dx
38   - /* Set host's TSS GDT entry to available (clear byte 5 bit 2). */
  115 +
  116 + // Look back now, before we take this last step!
  117 + // The Host's TSS entry was also marked used;
  118 + // Let's clear it again, ere we return.
  119 + // The GDT descriptor of the Host
  120 + // Points to the table after two "size" bytes
39 121 movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
  122 + // Clear the type field of "used" (byte 5, bit 1: mask 0x02)
40 123 andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
41   - /* Switch to guest page tables: lguest_pages->state now read-only. */
  124 +
  125 + // Once our page table's switched, the Guest is live!
  126 + // The Host fades as we run this final step.
  127 + // Our "struct lguest_pages" is now read-only.
42 128 movl %ebx, %cr3
43   - /* Restore guest regs */
  129 +
  130 + // The page table change did one tricky thing:
  131 + // The Guest's register page has been mapped
  132 + // Writable onto our %esp (stack) --
  133 + // We can simply pop off all Guest regs.
44 134 popl %ebx
45 135 popl %ecx
46 136 popl %edx
47 137  
48 138  
49 139  
... ... @@ -52,12 +142,27 @@
52 142 popl %fs
53 143 popl %ds
54 144 popl %es
55   - /* Skip error code and trap number */
  145 +
  146 + // Near the base of the stack lurk two strange fields
  147 + // Which we fill as we exit the Guest
  148 + // These are the trap number and its error
  149 + // We can simply step past them on our way.
56 150 addl $8, %esp
  151 +
  152 + // The last five stack slots hold return address
  153 + // And everything needed to change privilege
  154 + // Into the Guest privilege level of 1,
  155 + // And the stack where the Guest had last left it.
  156 + // Interrupts are turned back on: we are Guest.
57 157 iret
58 158  
  159 +// There are two paths where we switch to the Host
  160 +// So we put the routine in a macro.
  161 +// We are on our way home, back to the Host
  162 +// Interrupted out of the Guest, we come here.
59 163 #define SWITCH_TO_HOST \
60   - /* Save guest state */ \
  164 + /* We save the Guest state: all registers first \
  165 + * Laid out just as "struct lguest_regs" defines */ \
61 166 pushl %es; \
62 167 pushl %ds; \
63 168 pushl %fs; \
64 169  
65 170  
66 171  
67 172  
68 173  
69 174  
70 175  
71 176  
72 177  
73 178  
74 179  
75 180  
76 181  
77 182  
78 183  
... ... @@ -69,58 +174,119 @@
69 174 pushl %edx; \
70 175 pushl %ecx; \
71 176 pushl %ebx; \
72   - /* Load lguest ds segment for convenience. */ \
  177 + /* Our stack and our code are using segments \
  178 + * Set in the TSS and IDT \
  179 + * Yet if we were to touch data we'd use \
  180 + * Whatever data segment the Guest had. \
  181 + * Load the lguest ds segment for now. */ \
73 182 movl $(LGUEST_DS), %eax; \
74 183 movl %eax, %ds; \
75   - /* Figure out where we are, based on stack (at top of regs). */ \
  184 + /* So where are we? Which CPU, which struct? \
  185 + * The stack is our clue: our TSS sets \
  186 + * It at the end of "struct lguest_pages" \
  187 + * And we then pushed and pushed and pushed Guest regs: \
  188 + * Now stack points atop the "struct lguest_regs". \
  189 + * Subtract that offset, and we find our struct. */ \
76 190 movl %esp, %eax; \
77 191 subl $LGUEST_PAGES_regs, %eax; \
78   - /* Put trap number in %ebx before we switch cr3 and lose it. */ \
  192 + /* Save our trap number: the switch will obscure it \
  193 + * (The Guest regs are not mapped here in the Host) \
  194 + * %ebx holds it safe for deliver_to_host */ \
79 195 movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
80   - /* Switch to host page tables (host GDT, IDT and stack are in host \
81   - mem, so need this first) */ \
  196 + /* The Host GDT, IDT and stack! \
  197 + * All these lie safely hidden from the Guest: \
  198 + * We must return to the Host page tables \
  199 + * (Hence that was saved in struct lguest_pages) */ \
82 200 movl LGUEST_PAGES_host_cr3(%eax), %edx; \
83 201 movl %edx, %cr3; \
84   - /* Set guest's TSS to available (clear byte 5 bit 2). */ \
  202 + /* As before, when we looked back at the Host \
  203 + * As we left and marked TSS unused \
  204 + * So must we now for the Guest left behind. */ \
85 205 andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
86   - /* Switch to host's GDT & IDT. */ \
  206 + /* Switch to Host's GDT, IDT. */ \
87 207 lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
88 208 lidt LGUEST_PAGES_host_idt_desc(%eax); \
89   - /* Switch to host's stack. */ \
  209 + /* Restore the Host's stack where its saved regs lie */ \
90 210 movl LGUEST_PAGES_host_sp(%eax), %esp; \
91   - /* Switch to host's TSS */ \
  211 + /* Last the TSS: our Host is complete */ \
92 212 movl $(GDT_ENTRY_TSS*8), %edx; \
93 213 ltr %dx; \
  214 + /* Restore now the regs saved right at the first. */ \
94 215 popl %ebp; \
95 216 popl %fs; \
96 217 popl %gs; \
97 218 popl %ds; \
98 219 popl %es
99 220  
100   -/* Return to run_guest_once. */
  221 +// Here's where we come when the Guest has just trapped:
  222 +// (Which trap we'll see has been pushed on the stack).
  223 +// We need only switch back, and the Host will decode
  224 +// Why we came home, and what needs to be done.
101 225 return_to_host:
102 226 SWITCH_TO_HOST
103 227 iret
104 228  
  229 +// An interrupt, with some cause external
  230 +// Has ajerked us rudely from the Guest's code
  231 +// Again we must return home to the Host
105 232 deliver_to_host:
106 233 SWITCH_TO_HOST
107   - /* Decode IDT and jump to hosts' irq handler. When that does iret, it
108   - * will return to run_guest_once. This is a feature. */
  234 + // But now we must go home via that place
  235 + // Where that interrupt was supposed to go
  236 + // Had we not been ensconced, running the Guest.
  237 + // Here we see the cleverness of our stack:
  238 + // The Host stack is formed like an interrupt
  239 + // With EIP, CS and EFLAGS layered.
  240 + // Interrupt handlers end with "iret"
  241 + // And that will take us home at long long last.
  242 +
  243 + // But first we must find the handler to call!
  244 + // The IDT descriptor for the Host
  245 + // Has two bytes for size, and four for address:
  246 + // %edx will hold it for us for now.
109 247 movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
  248 + // We now know the table address we need,
  249 + // And saved the trap's number inside %ebx.
  250 + // Yet the pointer to the handler is smeared
  251 + // Across the bits of the table entry.
  252 + // What oracle can tell us how to extract
  253 + // From such a convoluted encoding?
  254 + // I consulted gcc, and it gave
  255 + // These instructions, which I gladly credit:
110 256 leal (%edx,%ebx,8), %eax
111 257 movzwl (%eax),%edx
112 258 movl 4(%eax), %eax
113 259 xorw %ax, %ax
114 260 orl %eax, %edx
  261 + // Now the address of the handler's in %edx
  262 + // We call it now: its "iret" takes us home.
115 263 jmp *%edx
116 264  
117   -/* Real hardware interrupts are delivered straight to the host. Others
118   - cause us to return to run_guest_once so it can decide what to do. Note
119   - that some of these are overridden by the guest to deliver directly, and
120   - never enter here (see load_guest_idt_entry). */
  265 +// Every interrupt can come to us here
  266 +// But we must truly tell each apart.
  267 +// They number two hundred and fifty six
  268 +// And each must land in a different spot,
  269 +// Push its number on stack, and join the stream.
  270 +
  271 +// And worse, a mere seven of the traps stand apart
  272 +// And push on their stack an addition:
  273 +// An error number, thirty two bits long
  274 +// So we punish the other two fifty
  275 +// And make them push a zero so they match.
  276 +
  277 +// Yet two fifty six entries is long
  278 +// And all will look most the same as the last
  279 +// So we create a macro which can make
  280 +// As many entries as we need to fill.
  281 +
  282 +// Note the change to .data then .text:
  283 +// We plant the address of each entry
  284 +// Into a (data) table for the Host
  285 +// To know where each Guest interrupt should go.
121 286 .macro IRQ_STUB N TARGET
122 287 .data; .long 1f; .text; 1:
123   - /* Make an error number for most traps, which don't have one. */
  288 + // Trap eight, ten through fourteen and seventeen
  289 + // Supply an error number. Else zero.
124 290 .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
125 291 pushl $0
126 292 .endif
... ... @@ -129,6 +295,8 @@
129 295 ALIGN
130 296 .endm
131 297  
  298 +// This macro creates numerous entries
  299 +// Using GAS macros which out-power C's.
132 300 .macro IRQ_STUBS FIRST LAST TARGET
133 301 irq=\FIRST
134 302 .rept \LAST-\FIRST+1
135 303  
136 304  
137 305  
... ... @@ -137,25 +305,44 @@
137 305 .endr
138 306 .endm
139 307  
140   -/* We intercept every interrupt, because we may need to switch back to
141   - * host. Unfortunately we can't tell them apart except by entry
142   - * point, so we need 256 entry points.
143   - */
  308 +// Here's the marker for our pointer table
  309 +// Laid in the data section just before
  310 +// Each macro places the address of code
  311 +// Forming an array: each one points to text
  312 +// Which handles interrupt in its turn.
144 313 .data
145 314 .global default_idt_entries
146 315 default_idt_entries:
147 316 .text
148   - IRQ_STUBS 0 1 return_to_host /* First two traps */
149   - IRQ_STUB 2 handle_nmi /* NMI */
150   - IRQ_STUBS 3 31 return_to_host /* Rest of traps */
151   - IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */
152   - IRQ_STUB 128 return_to_host /* System call (overridden) */
153   - IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */
  317 + // The first two traps go straight back to the Host
  318 + IRQ_STUBS 0 1 return_to_host
  319 + // We'll say nothing, yet, about NMI
  320 + IRQ_STUB 2 handle_nmi
  321 + // Other traps also return to the Host
  322 + IRQ_STUBS 3 31 return_to_host
  323 + // All interrupts go via their handlers
  324 + IRQ_STUBS 32 127 deliver_to_host
  325 + // 'Cept system calls coming from userspace
  326 + // Are to go to the Guest, never the Host.
  327 + IRQ_STUB 128 return_to_host
  328 + IRQ_STUBS 129 255 deliver_to_host
154 329  
155   -/* We ignore NMI and return. */
  330 +// The NMI, what a fabulous beast
  331 +// Which swoops in and stops us no matter that
  332 +// We're suspended between heaven and hell,
  333 +// (Or more likely between the Host and Guest)
  334 +// When in it comes! We are dazed and confused
  335 +// So we do the simplest thing which one can.
  336 +// Though we've pushed the trap number and zero
  337 +// We discard them, return, and hope we live.
156 338 handle_nmi:
157 339 addl $8, %esp
158 340 iret
159 341  
  342 +// We are done; all that's left is Mastery
  343 +// And "make Mastery" is a journey long
  344 +// Designed to make your fingers itch to code.
  345 +
  346 +// Here ends the text, the file and poem.
160 347 ENTRY(end_switcher_text)