#include <linux/linkage.h>
#include <linux/lguest.h>
#include <asm/lguest_hcall.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/processor-flags.h>

/*G:020
 * Our story starts with the kernel booting into startup_32 in
 * arch/x86/kernel/head_32.S.  It expects a boot header, which is created by
 * the bootloader (the Launcher in our case).
 *
 * The startup_32 function does very little: it clears the uninitialized global
 * C variables which we expect to be zero (ie. BSS) and then copies the boot
 * header and kernel command line somewhere safe.  Finally it checks the
 * 'hardware_subarch' field.  This was introduced in 2.6.24 for lguest and Xen:
 * if it's set to '1' (lguest's assigned number), then it calls us here.
 *
 * WARNING: be very careful here!  We're running at addresses equal to physical
 * addresses (around 0), not above PAGE_OFFSET as most code expects
 * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any
 * data without remembering to subtract __PAGE_OFFSET!
 *
 * The .section line puts this code in .init.text so it will be discarded after
 * boot.
 */
.section .init.text, "ax", @progbits
ENTRY(lguest_entry)
	/*
	 * We make the "initialization" hypercall now to tell the Host about
	 * us, and also find out where it put our page tables.
	 */
	movl $LHCALL_LGUEST_INIT, %eax
	movl $lguest_data - __PAGE_OFFSET, %ebx
	int $LGUEST_TRAP_ENTRY

	/* Set up the initial stack so we can run C code. */
	movl $(init_thread_union+THREAD_SIZE),%esp

	/* Jumps are relative: we're running __PAGE_OFFSET too low. */
	jmp lguest_init+__PAGE_OFFSET

/*G:055
 * We create a macro which puts the assembler code between lgstart_ and lgend_
 * markers.  These templates are put in the .text section: they can't be
 * discarded after boot as we may need to patch modules, too.
 */
.text
#define LGUEST_PATCH(name, insns...)			\
	lgstart_##name:	insns; lgend_##name:;		\
	.globl lgstart_##name; .globl lgend_##name

LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)

/*G:033
 * But using those wrappers is inefficient (we'll see why that doesn't matter
 * for save_fl and irq_disable later).  If we write our routines carefully in
 * assembler, we can avoid clobbering any registers and avoid jumping through
 * the wrapper functions.
 *
 * I skipped over our first piece of assembler, but this one is worth studying
 * in a bit more detail so I'll describe in easy stages.  First, the routine to
 * enable interrupts:
 */
ENTRY(lg_irq_enable)
	/*
	 * The reverse of irq_disable, this sets lguest_data.irq_enabled to
	 * X86_EFLAGS_IF (ie. "Interrupts enabled").
	 */
	movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
	/*
	 * But now we need to check if the Host wants to know: there might have
	 * been interrupts waiting to be delivered, in which case it will have
	 * set lguest_data.irq_pending to X86_EFLAGS_IF.  If it's not zero, we
	 * jump to send_interrupts, otherwise we're done.
	 *
	 * This must be a compare against zero, not "testl $0, ...": ANDing
	 * with a zero mask always sets ZF, so the jnz below could never be
	 * taken and pending interrupts would never be flushed.
	 */
	cmpl $0, lguest_data+LGUEST_DATA_irq_pending
	jnz send_interrupts
	/*
	 * One cool thing about x86 is that you can do many things without using
	 * a register.  In this case, the normal path hasn't needed to save or
	 * restore any registers at all!
	 */
	ret
send_interrupts:
	/*
	 * OK, now we need a register: eax is used for the hypercall number,
	 * which is LHCALL_SEND_INTERRUPTS.
	 *
	 * We used not to bother with this pending detection at all, which was
	 * much simpler.  Sooner or later the Host would realize it had to
	 * send us an interrupt.  But that turns out to make performance 7
	 * times worse on a simple tcp benchmark.  So now we do this the hard
	 * way.
	 */
	pushl %eax
	movl $LHCALL_SEND_INTERRUPTS, %eax
	/*
	 * This is a vmcall instruction (same thing that KVM uses).  Older
	 * assembler versions might not know the "vmcall" instruction, so we
	 * create one manually here.
	 */
	.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
	/* Put eax back the way we found it. */
	popl %eax
	ret

/*
 * Finally, the "popf" or "restore flags" routine.  The %eax register holds the
 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
 * enabling interrupts again, if it's 0 we're leaving them off.
 */
ENTRY(lg_restore_fl)
	/* This is just "lguest_data.irq_enabled = flags;" */
	movl %eax, lguest_data+LGUEST_DATA_irq_enabled
	/*
	 * Now, if the %eax value has enabled interrupts and
	 * lguest_data.irq_pending is set, we want to tell the Host so it can
	 * deliver any outstanding interrupts.  Fortunately, both values will
	 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
	 * instruction will AND them together for us.  If both are set, we
	 * jump to send_interrupts.
	 */
	testl lguest_data+LGUEST_DATA_irq_pending, %eax
	jnz send_interrupts
	/* Again, the normal path has used no extra registers.  Clever, huh? */
	ret
/*:*/

/* These demark the EIP range where host should never deliver interrupts. */
.global lguest_noirq_start
.global lguest_noirq_end

/*M:004
 * When the Host reflects a trap or injects an interrupt into the Guest, it
 * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
 * so the Guest iret logic does the right thing when restoring it.  However,
 * when the Host sets the Guest up for direct traps, such as system calls, the
 * processor is the one to push eflags onto the stack, and the interrupt bit
 * will be 1 (in reality, interrupts are always enabled in the Guest).
 *
 * This turns out to be harmless: the only trap which should happen under Linux
 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
 * regions), which has to be reflected through the Host anyway.  If another
 * trap *does* go off when interrupts are disabled, the Guest will panic, and
 * we'll never get to this iret!
:*/

/*G:045
 * There is one final paravirt_op that the Guest implements, and glancing at it
 * you can see why I left it to last.  It's *cool*!  It's in *assembler*!
 *
 * The "iret" instruction is used to return from an interrupt or trap.  The
 * stack looks like this:
 *   old address
 *   old code segment & privilege level
 *   old processor flags ("eflags")
 *
 * The "iret" instruction pops those values off the stack and restores them all
 * at once.  The only problem is that eflags includes the Interrupt Flag which
 * the Guest can't change: the CPU will simply ignore it when we do an "iret".
 * So we have to copy eflags from the stack to lguest_data.irq_enabled before
 * we do the "iret".
 *
 * There are two problems with this: firstly, we need to use a register to do
 * the copy and secondly, the whole thing needs to be atomic.  The first
 * problem is easy to solve: push %eax on the stack so we can use it, and then
 * restore it at the end just before the real "iret".
 *
 * The second is harder: copying eflags to lguest_data.irq_enabled will turn
 * interrupts on before we're finished, so we could be interrupted before we
 * return to userspace or wherever.  Our solution to this is to surround the
 * code with lguest_noirq_start: and lguest_noirq_end: labels.  We tell the
 * Host that it is *never* to interrupt us there, even if interrupts seem to be
 * enabled.
 */
ENTRY(lguest_iret)
	pushl	%eax
	/* After the push above, the saved eflags sits 12 bytes up the stack. */
	movl	12(%esp), %eax
lguest_noirq_start:
	/*
	 * Note the %ss: segment prefix here.  Normal data accesses use the
	 * "ds" segment, but that will have already been restored for whatever
	 * we're returning to (such as userspace): we can't trust it.  The %ss:
	 * prefix makes sure we use the stack segment, which is still valid.
	 */
	movl	%eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
	popl	%eax
	iret
lguest_noirq_end: