Commit f8f0fdcd authored by Rusty Russell's avatar Rusty Russell Committed by Linus Torvalds

lguest: documentation VI: Switcher

Documentation: The Switcher
Signed-off-by: default avatarRusty Russell <rusty@rustcorp.com.au>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent bff672e6
...@@ -393,46 +393,89 @@ static void set_ts(void) ...@@ -393,46 +393,89 @@ static void set_ts(void)
write_cr0(cr0|8); write_cr0(cr0|8);
} }
/*S:010
* We are getting close to the Switcher.
*
* Remember that each CPU has two pages which are visible to the Guest when it
* runs on that CPU. This has to contain the state for that Guest: we copy the
* state in just before we run the Guest.
*
* Each Guest has "changed" flags which indicate what has changed in the Guest
* since it last ran. We saw this set in interrupts_and_traps.c and
* segments.c.
*/
static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
{ {
/* Copying all this data can be quite expensive. We usually run the
* same Guest we ran last time (and that Guest hasn't run anywhere else
* meanwhile). If that's not the case, we pretend everything in the
* Guest has changed. */
if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
__get_cpu_var(last_guest) = lg; __get_cpu_var(last_guest) = lg;
lg->last_pages = pages; lg->last_pages = pages;
lg->changed = CHANGED_ALL; lg->changed = CHANGED_ALL;
} }
/* These are pretty cheap, so we do them unconditionally. */ /* These copies are pretty cheap, so we do them unconditionally: */
/* Save the current Host top-level page directory. */
pages->state.host_cr3 = __pa(current->mm->pgd); pages->state.host_cr3 = __pa(current->mm->pgd);
/* Set up the Guest's page tables to see this CPU's pages (and no
* other CPU's pages). */
map_switcher_in_guest(lg, pages); map_switcher_in_guest(lg, pages);
/* Set up the two "TSS" members which tell the CPU what stack to use
* for traps which do directly into the Guest (ie. traps at privilege
* level 1). */
pages->state.guest_tss.esp1 = lg->esp1; pages->state.guest_tss.esp1 = lg->esp1;
pages->state.guest_tss.ss1 = lg->ss1; pages->state.guest_tss.ss1 = lg->ss1;
/* Copy direct trap entries. */ /* Copy direct-to-Guest trap entries. */
if (lg->changed & CHANGED_IDT) if (lg->changed & CHANGED_IDT)
copy_traps(lg, pages->state.guest_idt, default_idt_entries); copy_traps(lg, pages->state.guest_idt, default_idt_entries);
/* Copy all GDT entries but the TSS. */ /* Copy all GDT entries which the Guest can change. */
if (lg->changed & CHANGED_GDT) if (lg->changed & CHANGED_GDT)
copy_gdt(lg, pages->state.guest_gdt); copy_gdt(lg, pages->state.guest_gdt);
/* If only the TLS entries have changed, copy them. */ /* If only the TLS entries have changed, copy them. */
else if (lg->changed & CHANGED_GDT_TLS) else if (lg->changed & CHANGED_GDT_TLS)
copy_gdt_tls(lg, pages->state.guest_gdt); copy_gdt_tls(lg, pages->state.guest_gdt);
/* Mark the Guest as unchanged for next time. */
lg->changed = 0; lg->changed = 0;
} }
/* Finally: the code to actually call into the Switcher to run the Guest. */
static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
{ {
/* This is a dummy value we need for GCC's sake. */
unsigned int clobber; unsigned int clobber;
/* Copy the guest-specific information into this CPU's "struct
* lguest_pages". */
copy_in_guest_info(lg, pages); copy_in_guest_info(lg, pages);
/* Put eflags on stack, lcall does rest: suitable for iret return. */ /* Now: we push the "eflags" register on the stack, then do an "lcall".
* This is how we change from using the kernel code segment to using
* the dedicated lguest code segment, as well as jumping into the
* Switcher.
*
* The lcall also pushes the old code segment (KERNEL_CS) onto the
* stack, then the address of this call. This stack layout happens to
* exactly match the stack of an interrupt... */
asm volatile("pushf; lcall *lguest_entry" asm volatile("pushf; lcall *lguest_entry"
/* This is how we tell GCC that %eax ("a") and %ebx ("b")
* are changed by this routine. The "=" means output. */
: "=a"(clobber), "=b"(clobber) : "=a"(clobber), "=b"(clobber)
/* %eax contains the pages pointer. ("0" refers to the
* 0-th argument above, ie "a"). %ebx contains the
* physical address of the Guest's top-level page
* directory. */
: "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
/* We tell gcc that all these registers could change,
* which means we don't have to save and restore them in
* the Switcher. */
: "memory", "%edx", "%ecx", "%edi", "%esi"); : "memory", "%edx", "%ecx", "%edi", "%esi");
} }
/*:*/
/*H:030 Let's jump straight to the the main loop which runs the Guest. /*H:030 Let's jump straight to the the main loop which runs the Guest.
* Remember, this is called by the Launcher reading /dev/lguest, and we keep * Remember, this is called by the Launcher reading /dev/lguest, and we keep
......
...@@ -6,41 +6,131 @@ ...@@ -6,41 +6,131 @@
* are feeling invigorated and refreshed then the next, more challenging stage * are feeling invigorated and refreshed then the next, more challenging stage
* can be found in "make Guest". :*/ * can be found in "make Guest". :*/
/*S:100
* Welcome to the Switcher itself!
*
* This file contains the low-level code which changes the CPU to run the Guest
* code, and returns to the Host when something happens. Understand this, and
* you understand the heart of our journey.
*
* Because this is in assembler rather than C, our tale switches from prose to
* verse. First I tried limericks:
*
* There once was an eax reg,
* To which our pointer was fed,
* It needed an add,
* Which asm-offsets.h had
* But this limerick is hurting my head.
*
* Next I tried haikus, but fitting the required reference to the seasons in
* every stanza was quickly becoming tiresome:
*
* The %eax reg
* Holds "struct lguest_pages" now:
* Cherry blossoms fall.
*
* Then I started with Heroic Verse, but the rhyming requirement leeched away
* the content density and led to some uniquely awful oblique rhymes:
*
* These constants are coming from struct offsets
* For use within the asm switcher text.
*
* Finally, I settled for something between heroic hexameter, and normal prose
* with inappropriate linebreaks. Anyway, it aint no Shakespeare.
*/
// Not all kernel headers work from assembler
// But these ones are needed: the ENTRY() define
// And constants extracted from struct offsets
// To avoid magic numbers and breakage:
// Should they change the compiler can't save us
// Down here in the depths of assembler code.
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/asm-offsets.h> #include <asm/asm-offsets.h>
#include "lg.h" #include "lg.h"
// We mark the start of the code to copy
// It's placed in .text tho it's never run here
// You'll see the trick macro at the end
// Which interleaves data and text to effect.
.text .text
ENTRY(start_switcher_text) ENTRY(start_switcher_text)
/* %eax points to lguest pages for this CPU. %ebx contains cr3 value. // When we reach switch_to_guest we have just left
All normal registers can be clobbered! */ // The safe and comforting shores of C code
// %eax has the "struct lguest_pages" to use
// Where we save state and still see it from the Guest
// And %ebx holds the Guest shadow pagetable:
// Once set we have truly left Host behind.
ENTRY(switch_to_guest) ENTRY(switch_to_guest)
/* Save host segments on host stack. */ // We told gcc all its regs could fade,
// Clobbered by our journey into the Guest
// We could have saved them, if we tried
// But time is our master and cycles count.
// Segment registers must be saved for the Host
// We push them on the Host stack for later
pushl %es pushl %es
pushl %ds pushl %ds
pushl %gs pushl %gs
pushl %fs pushl %fs
/* With CONFIG_FRAME_POINTER, gcc doesn't let us clobber this! */ // But the compiler is fickle, and heeds
// No warning of %ebp clobbers
// When frame pointers are used. That register
// Must be saved and restored or chaos strikes.
pushl %ebp pushl %ebp
/* Save host stack. */ // The Host's stack is done, now save it away
// In our "struct lguest_pages" at offset
// Distilled into asm-offsets.h
movl %esp, LGUEST_PAGES_host_sp(%eax) movl %esp, LGUEST_PAGES_host_sp(%eax)
/* Switch to guest stack: if we get NMI we expect to be there. */
// All saved and there's now five steps before us:
// Stack, GDT, IDT, TSS
// And last of all the page tables are flipped.
// Yet beware that our stack pointer must be
// Always valid lest an NMI hits
// %edx does the duty here as we juggle
// %eax is lguest_pages: our stack lies within.
movl %eax, %edx movl %eax, %edx
addl $LGUEST_PAGES_regs, %edx addl $LGUEST_PAGES_regs, %edx
movl %edx, %esp movl %edx, %esp
/* Switch to guest's GDT, IDT. */
// The Guest's GDT we so carefully
// Placed in the "struct lguest_pages" before
lgdt LGUEST_PAGES_guest_gdt_desc(%eax) lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
// The Guest's IDT we did partially
// Move to the "struct lguest_pages" as well.
lidt LGUEST_PAGES_guest_idt_desc(%eax) lidt LGUEST_PAGES_guest_idt_desc(%eax)
/* Switch to guest's TSS while GDT still writable. */
// The TSS entry which controls traps
// Must be loaded up with "ltr" now:
// For after we switch over our page tables
// It (as the rest) will be writable no more.
// (The GDT entry TSS needs
// Changes type when we load it: damn Intel!)
movl $(GDT_ENTRY_TSS*8), %edx movl $(GDT_ENTRY_TSS*8), %edx
ltr %dx ltr %dx
/* Set host's TSS GDT entry to available (clear byte 5 bit 2). */
// Look back now, before we take this last step!
// The Host's TSS entry was also marked used;
// Let's clear it again, ere we return.
// The GDT descriptor of the Host
// Points to the table after two "size" bytes
movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
// Clear the type field of "used" (byte 5, bit 2)
andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
/* Switch to guest page tables: lguest_pages->state now read-only. */
// Once our page table's switched, the Guest is live!
// The Host fades as we run this final step.
// Our "struct lguest_pages" is now read-only.
movl %ebx, %cr3 movl %ebx, %cr3
/* Restore guest regs */
// The page table change did one tricky thing:
// The Guest's register page has been mapped
// Writable onto our %esp (stack) --
// We can simply pop off all Guest regs.
popl %ebx popl %ebx
popl %ecx popl %ecx
popl %edx popl %edx
...@@ -52,12 +142,27 @@ ENTRY(switch_to_guest) ...@@ -52,12 +142,27 @@ ENTRY(switch_to_guest)
popl %fs popl %fs
popl %ds popl %ds
popl %es popl %es
/* Skip error code and trap number */
// Near the base of the stack lurk two strange fields
// Which we fill as we exit the Guest
// These are the trap number and its error
// We can simply step past them on our way.
addl $8, %esp addl $8, %esp
// The last five stack slots hold return address
// And everything needed to change privilege
// Into the Guest privilege level of 1,
// And the stack where the Guest had last left it.
// Interrupts are turned back on: we are Guest.
iret iret
// There are two paths where we switch to the Host
// So we put the routine in a macro.
// We are on our way home, back to the Host
// Interrupted out of the Guest, we come here.
#define SWITCH_TO_HOST \ #define SWITCH_TO_HOST \
/* Save guest state */ \ /* We save the Guest state: all registers first \
* Laid out just as "struct lguest_regs" defines */ \
pushl %es; \ pushl %es; \
pushl %ds; \ pushl %ds; \
pushl %fs; \ pushl %fs; \
...@@ -69,58 +174,119 @@ ENTRY(switch_to_guest) ...@@ -69,58 +174,119 @@ ENTRY(switch_to_guest)
pushl %edx; \ pushl %edx; \
pushl %ecx; \ pushl %ecx; \
pushl %ebx; \ pushl %ebx; \
/* Load lguest ds segment for convenience. */ \ /* Our stack and our code are using segments \
* Set in the TSS and IDT \
* Yet if we were to touch data we'd use \
* Whatever data segment the Guest had. \
* Load the lguest ds segment for now. */ \
movl $(LGUEST_DS), %eax; \ movl $(LGUEST_DS), %eax; \
movl %eax, %ds; \ movl %eax, %ds; \
/* Figure out where we are, based on stack (at top of regs). */ \ /* So where are we? Which CPU, which struct? \
* The stack is our clue: our TSS sets \
* It at the end of "struct lguest_pages" \
* And we then pushed and pushed and pushed Guest regs: \
* Now stack points atop the "struct lguest_regs". \
* Subtract that offset, and we find our struct. */ \
movl %esp, %eax; \ movl %esp, %eax; \
subl $LGUEST_PAGES_regs, %eax; \ subl $LGUEST_PAGES_regs, %eax; \
/* Put trap number in %ebx before we switch cr3 and lose it. */ \ /* Save our trap number: the switch will obscure it \
* (The Guest regs are not mapped here in the Host) \
* %ebx holds it safe for deliver_to_host */ \
movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
/* Switch to host page tables (host GDT, IDT and stack are in host \ /* The Host GDT, IDT and stack! \
mem, so need this first) */ \ * All these lie safely hidden from the Guest: \
* We must return to the Host page tables \
* (Hence that was saved in struct lguest_pages) */ \
movl LGUEST_PAGES_host_cr3(%eax), %edx; \ movl LGUEST_PAGES_host_cr3(%eax), %edx; \
movl %edx, %cr3; \ movl %edx, %cr3; \
/* Set guest's TSS to available (clear byte 5 bit 2). */ \ /* As before, when we looked back at the Host \
* As we left and marked TSS unused \
* So must we now for the Guest left behind. */ \
andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \ andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
/* Switch to host's GDT & IDT. */ \ /* Switch to Host's GDT, IDT. */ \
lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
lidt LGUEST_PAGES_host_idt_desc(%eax); \ lidt LGUEST_PAGES_host_idt_desc(%eax); \
/* Switch to host's stack. */ \ /* Restore the Host's stack where it's saved regs lie */ \
movl LGUEST_PAGES_host_sp(%eax), %esp; \ movl LGUEST_PAGES_host_sp(%eax), %esp; \
/* Switch to host's TSS */ \ /* Last the TSS: our Host is complete */ \
movl $(GDT_ENTRY_TSS*8), %edx; \ movl $(GDT_ENTRY_TSS*8), %edx; \
ltr %dx; \ ltr %dx; \
/* Restore now the regs saved right at the first. */ \
popl %ebp; \ popl %ebp; \
popl %fs; \ popl %fs; \
popl %gs; \ popl %gs; \
popl %ds; \ popl %ds; \
popl %es popl %es
/* Return to run_guest_once. */ // Here's where we come when the Guest has just trapped:
// (Which trap we'll see has been pushed on the stack).
// We need only switch back, and the Host will decode
// Why we came home, and what needs to be done.
return_to_host: return_to_host:
SWITCH_TO_HOST SWITCH_TO_HOST
iret iret
// An interrupt, with some cause external
// Has ajerked us rudely from the Guest's code
// Again we must return home to the Host
deliver_to_host: deliver_to_host:
SWITCH_TO_HOST SWITCH_TO_HOST
/* Decode IDT and jump to hosts' irq handler. When that does iret, it // But now we must go home via that place
* will return to run_guest_once. This is a feature. */ // Where that interrupt was supposed to go
// Had we not been ensconced, running the Guest.
// Here we see the cleverness of our stack:
// The Host stack is formed like an interrupt
// With EIP, CS and EFLAGS layered.
// Interrupt handlers end with "iret"
// And that will take us home at long long last.
// But first we must find the handler to call!
// The IDT descriptor for the Host
// Has two bytes for size, and four for address:
// %edx will hold it for us for now.
movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
// We now know the table address we need,
// And saved the trap's number inside %ebx.
// Yet the pointer to the handler is smeared
// Across the bits of the table entry.
// What oracle can tell us how to extract
// From such a convoluted encoding?
// I consulted gcc, and it gave
// These instructions, which I gladly credit:
leal (%edx,%ebx,8), %eax leal (%edx,%ebx,8), %eax
movzwl (%eax),%edx movzwl (%eax),%edx
movl 4(%eax), %eax movl 4(%eax), %eax
xorw %ax, %ax xorw %ax, %ax
orl %eax, %edx orl %eax, %edx
// Now the address of the handler's in %edx
// We call it now: its "iret" takes us home.
jmp *%edx jmp *%edx
/* Real hardware interrupts are delivered straight to the host. Others // Every interrupt can come to us here
cause us to return to run_guest_once so it can decide what to do. Note // But we must truly tell each apart.
that some of these are overridden by the guest to deliver directly, and // They number two hundred and fifty six
never enter here (see load_guest_idt_entry). */ // And each must land in a different spot,
// Push its number on stack, and join the stream.
// And worse, a mere six of the traps stand apart
// And push on their stack an addition:
// An error number, thirty two bits long
// So we punish the other two fifty
// And make them push a zero so they match.
// Yet two fifty six entries is long
// And all will look most the same as the last
// So we create a macro which can make
// As many entries as we need to fill.
// Note the change to .data then .text:
// We plant the address of each entry
// Into a (data) table for the Host
// To know where each Guest interrupt should go.
.macro IRQ_STUB N TARGET .macro IRQ_STUB N TARGET
.data; .long 1f; .text; 1: .data; .long 1f; .text; 1:
/* Make an error number for most traps, which don't have one. */ // Trap eight, ten through fourteen and seventeen
// Supply an error number. Else zero.
.if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
pushl $0 pushl $0
.endif .endif
...@@ -129,6 +295,8 @@ deliver_to_host: ...@@ -129,6 +295,8 @@ deliver_to_host:
ALIGN ALIGN
.endm .endm
// This macro creates numerous entries
// Using GAS macros which out-power C's.
.macro IRQ_STUBS FIRST LAST TARGET .macro IRQ_STUBS FIRST LAST TARGET
irq=\FIRST irq=\FIRST
.rept \LAST-\FIRST+1 .rept \LAST-\FIRST+1
...@@ -137,24 +305,43 @@ deliver_to_host: ...@@ -137,24 +305,43 @@ deliver_to_host:
.endr .endr
.endm .endm
/* We intercept every interrupt, because we may need to switch back to // Here's the marker for our pointer table
* host. Unfortunately we can't tell them apart except by entry // Laid in the data section just before
* point, so we need 256 entry points. // Each macro places the address of code
*/ // Forming an array: each one points to text
// Which handles interrupt in its turn.
.data .data
.global default_idt_entries .global default_idt_entries
default_idt_entries: default_idt_entries:
.text .text
IRQ_STUBS 0 1 return_to_host /* First two traps */ // The first two traps go straight back to the Host
IRQ_STUB 2 handle_nmi /* NMI */ IRQ_STUBS 0 1 return_to_host
IRQ_STUBS 3 31 return_to_host /* Rest of traps */ // We'll say nothing, yet, about NMI
IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */ IRQ_STUB 2 handle_nmi
IRQ_STUB 128 return_to_host /* System call (overridden) */ // Other traps also return to the Host
IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */ IRQ_STUBS 3 31 return_to_host
// All interrupts go via their handlers
/* We ignore NMI and return. */ IRQ_STUBS 32 127 deliver_to_host
// 'Cept system calls coming from userspace
// Are to go to the Guest, never the Host.
IRQ_STUB 128 return_to_host
IRQ_STUBS 129 255 deliver_to_host
// The NMI, what a fabulous beast
// Which swoops in and stops us no matter that
// We're suspended between heaven and hell,
// (Or more likely between the Host and Guest)
// When in it comes! We are dazed and confused
// So we do the simplest thing which one can.
// Though we've pushed the trap number and zero
// We discard them, return, and hope we live.
handle_nmi: handle_nmi:
addl $8, %esp addl $8, %esp
iret iret
// We are done; all that's left is Mastery
// And "make Mastery" is a journey long
// Designed to make your fingers itch to code.
// Here ends the text, the file and poem.
ENTRY(end_switcher_text) ENTRY(end_switcher_text)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment