/******************************************************************************1* xen.h2*3* Guest OS interface to Xen.4*5* Copyright (c) 2004, K A Fraser6*/78#ifndef __XEN_PUBLIC_XEN_H__9#define __XEN_PUBLIC_XEN_H__1011#include <asm/xen/interface.h>12#include <asm/pvclock-abi.h>1314/*15* XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).16*/1718/*19* x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5.20* EAX = return value21* (argument registers may be clobbered on return)22* x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6.23* RAX = return value24* (argument registers not clobbered on return; RCX, R11 are)25*/26#define __HYPERVISOR_set_trap_table 027#define __HYPERVISOR_mmu_update 128#define __HYPERVISOR_set_gdt 229#define __HYPERVISOR_stack_switch 330#define __HYPERVISOR_set_callbacks 431#define __HYPERVISOR_fpu_taskswitch 532#define __HYPERVISOR_sched_op_compat 633#define __HYPERVISOR_dom0_op 734#define __HYPERVISOR_set_debugreg 835#define __HYPERVISOR_get_debugreg 936#define __HYPERVISOR_update_descriptor 1037#define __HYPERVISOR_memory_op 1238#define __HYPERVISOR_multicall 1339#define __HYPERVISOR_update_va_mapping 1440#define __HYPERVISOR_set_timer_op 1541#define __HYPERVISOR_event_channel_op_compat 1642#define __HYPERVISOR_xen_version 1743#define __HYPERVISOR_console_io 1844#define __HYPERVISOR_physdev_op_compat 1945#define __HYPERVISOR_grant_table_op 2046#define __HYPERVISOR_vm_assist 2147#define __HYPERVISOR_update_va_mapping_otherdomain 2248#define __HYPERVISOR_iret 23 /* x86 only */49#define __HYPERVISOR_vcpu_op 2450#define __HYPERVISOR_set_segment_base 25 /* x86/64 only */51#define __HYPERVISOR_mmuext_op 2652#define __HYPERVISOR_acm_op 2753#define __HYPERVISOR_nmi_op 2854#define __HYPERVISOR_sched_op 2955#define __HYPERVISOR_callback_op 3056#define __HYPERVISOR_xenoprof_op 3157#define __HYPERVISOR_event_channel_op 3258#define __HYPERVISOR_physdev_op 3359#define __HYPERVISOR_hvm_op 3460#define __HYPERVISOR_tmem_op 386162/* Architecture-specific hypercall definitions. */63#define __HYPERVISOR_arch_0 4864#define __HYPERVISOR_arch_1 4965#define __HYPERVISOR_arch_2 5066#define __HYPERVISOR_arch_3 5167#define __HYPERVISOR_arch_4 5268#define __HYPERVISOR_arch_5 5369#define __HYPERVISOR_arch_6 5470#define __HYPERVISOR_arch_7 557172/*73* VIRTUAL INTERRUPTS74*75* Virtual interrupts that a guest OS may receive from Xen.76*/77#define VIRQ_TIMER 0 /* Timebase update, and/or requested timeout. */78#define VIRQ_DEBUG 1 /* Request guest to dump debug info. */79#define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */80#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */81#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */8283/* Architecture-specific VIRQ definitions. */84#define VIRQ_ARCH_0 1685#define VIRQ_ARCH_1 1786#define VIRQ_ARCH_2 1887#define VIRQ_ARCH_3 1988#define VIRQ_ARCH_4 2089#define VIRQ_ARCH_5 2190#define VIRQ_ARCH_6 2291#define VIRQ_ARCH_7 239293#define NR_VIRQS 2494/*95* MMU-UPDATE REQUESTS96*97* HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.98* A foreigndom (FD) can be specified (or DOMID_SELF for none).99* Where the FD has some effect, it is described below.100* ptr[1:0] specifies the appropriate MMU_* command.101*102* ptr[1:0] == MMU_NORMAL_PT_UPDATE:103* Updates an entry in a page table. If updating an L1 table, and the new104* table entry is valid/present, the mapped frame must belong to the FD, if105* an FD has been specified. If attempting to map an I/O page then the106* caller assumes the privilege of the FD.107* FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.108* FD == DOMID_XEN: Map restricted areas of Xen's heap space.109* ptr[:2] -- Machine address of the page-table entry to modify.110* val -- Value to write.111*112* ptr[1:0] == MMU_MACHPHYS_UPDATE:113* Updates an entry in the machine->pseudo-physical mapping table.114* ptr[:2] -- Machine address within the frame whose mapping to modify.115* The frame must belong to the FD, if one is specified.116* val -- Value to write into the mapping entry.117*118* ptr[1:0] == MMU_PT_UPDATE_PRESERVE_AD:119* As MMU_NORMAL_PT_UPDATE above, but A/D bits currently in the PTE are ORed120* with those in @val.121*/122#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */123#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */124#define MMU_PT_UPDATE_PRESERVE_AD 2 /* atomically: *ptr = val | (*ptr&(A|D)) */125126/*127* MMU EXTENDED OPERATIONS128*129* HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.130* A foreigndom (FD) can be specified (or DOMID_SELF for none).131* Where the FD has some effect, it is described below.132*133* cmd: MMUEXT_(UN)PIN_*_TABLE134* mfn: Machine frame number to be (un)pinned as a p.t. page.135* The frame must belong to the FD, if one is specified.136*137* cmd: MMUEXT_NEW_BASEPTR138* mfn: Machine frame number of new page-table base to install in MMU.139*140* cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]141* mfn: Machine frame number of new page-table base to install in MMU142* when in user space.143*144* cmd: MMUEXT_TLB_FLUSH_LOCAL145* No additional arguments. Flushes local TLB.146*147* cmd: MMUEXT_INVLPG_LOCAL148* linear_addr: Linear address to be flushed from the local TLB.149*150* cmd: MMUEXT_TLB_FLUSH_MULTI151* vcpumask: Pointer to bitmap of VCPUs to be flushed.152*153* cmd: MMUEXT_INVLPG_MULTI154* linear_addr: Linear address to be flushed.155* vcpumask: Pointer to bitmap of VCPUs to be flushed.156*157* cmd: MMUEXT_TLB_FLUSH_ALL158* No additional arguments. Flushes all VCPUs' TLBs.159*160* cmd: MMUEXT_INVLPG_ALL161* linear_addr: Linear address to be flushed from all VCPUs' TLBs.162*163* cmd: MMUEXT_FLUSH_CACHE164* No additional arguments. Writes back and flushes cache contents.165*166* cmd: MMUEXT_SET_LDT167* linear_addr: Linear address of LDT base (NB. must be page-aligned).168* nr_ents: Number of entries in LDT.169*/170#define MMUEXT_PIN_L1_TABLE 0171#define MMUEXT_PIN_L2_TABLE 1172#define MMUEXT_PIN_L3_TABLE 2173#define MMUEXT_PIN_L4_TABLE 3174#define MMUEXT_UNPIN_TABLE 4175#define MMUEXT_NEW_BASEPTR 5176#define MMUEXT_TLB_FLUSH_LOCAL 6177#define MMUEXT_INVLPG_LOCAL 7178#define MMUEXT_TLB_FLUSH_MULTI 8179#define MMUEXT_INVLPG_MULTI 9180#define MMUEXT_TLB_FLUSH_ALL 10181#define MMUEXT_INVLPG_ALL 11182#define MMUEXT_FLUSH_CACHE 12183#define MMUEXT_SET_LDT 13184#define MMUEXT_NEW_USER_BASEPTR 15185186#ifndef __ASSEMBLY__187struct mmuext_op {188unsigned int cmd;189union {190/* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */191unsigned long mfn;192/* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */193unsigned long linear_addr;194} arg1;195union {196/* SET_LDT */197unsigned int nr_ents;198/* TLB_FLUSH_MULTI, INVLPG_MULTI */199void *vcpumask;200} arg2;201};202DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);203#endif204205/* These are passed as 'flags' to update_va_mapping. They can be ORed. */206/* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap. */207/* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer. */208#define UVMF_NONE (0UL<<0) /* No flushing at all. */209#define UVMF_TLB_FLUSH (1UL<<0) /* Flush entire TLB(s). */210#define UVMF_INVLPG (2UL<<0) /* Flush only one entry. */211#define UVMF_FLUSHTYPE_MASK (3UL<<0)212#define UVMF_MULTI (0UL<<2) /* Flush subset of TLBs. */213#define UVMF_LOCAL (0UL<<2) /* Flush local TLB. */214#define UVMF_ALL (1UL<<2) /* Flush all TLBs. */215216/*217* Commands to HYPERVISOR_console_io().218*/219#define CONSOLEIO_write 0220#define CONSOLEIO_read 1221222/*223* Commands to HYPERVISOR_vm_assist().224*/225#define VMASST_CMD_enable 0226#define VMASST_CMD_disable 1227#define VMASST_TYPE_4gb_segments 0228#define VMASST_TYPE_4gb_segments_notify 1229#define VMASST_TYPE_writable_pagetables 2230#define VMASST_TYPE_pae_extended_cr3 3231#define MAX_VMASST_TYPE 3232233#ifndef __ASSEMBLY__234235typedef uint16_t domid_t;236237/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */238#define DOMID_FIRST_RESERVED (0x7FF0U)239240/* DOMID_SELF is used in certain contexts to refer to oneself. */241#define DOMID_SELF (0x7FF0U)242243/*244* DOMID_IO is used to restrict page-table updates to mapping I/O memory.245* Although no Foreign Domain need be specified to map I/O pages, DOMID_IO246* is useful to ensure that no mappings to the OS's own heap are accidentally247* installed. (e.g., in Linux this could cause havoc as reference counts248* aren't adjusted on the I/O-mapping code path).249* This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can250* be specified by any calling domain.251*/252#define DOMID_IO (0x7FF1U)253254/*255* DOMID_XEN is used to allow privileged domains to map restricted parts of256* Xen's heap space (e.g., the machine_to_phys table).257* This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if258* the caller is privileged.259*/260#define DOMID_XEN (0x7FF2U)261262/*263* Send an array of these to HYPERVISOR_mmu_update().264* NB. The fields are natural pointer/address size for this architecture.265*/266struct mmu_update {267uint64_t ptr; /* Machine address of PTE. */268uint64_t val; /* New contents of PTE. */269};270DEFINE_GUEST_HANDLE_STRUCT(mmu_update);271272/*273* Send an array of these to HYPERVISOR_multicall().274* NB. The fields are natural register size for this architecture.275*/276struct multicall_entry {277unsigned long op;278long result;279unsigned long args[6];280};281DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);282283/*284* Event channel endpoints per domain:285* 1024 if a long is 32 bits; 4096 if a long is 64 bits.286*/287#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)288289struct vcpu_time_info {290/*291* Updates to the following values are preceded and followed292* by an increment of 'version'. The guest can therefore293* detect updates by looking for changes to 'version'. If the294* least-significant bit of the version number is set then an295* update is in progress and the guest must wait to read a296* consistent set of values. The correct way to interact with297* the version number is similar to Linux's seqlock: see the298* implementations of read_seqbegin/read_seqretry.299*/300uint32_t version;301uint32_t pad0;302uint64_t tsc_timestamp; /* TSC at last update of time vals. */303uint64_t system_time; /* Time, in nanosecs, since boot. */304/*305* Current system time:306* system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul307* CPU frequency (Hz):308* ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift309*/310uint32_t tsc_to_system_mul;311int8_t tsc_shift;312int8_t pad1[3];313}; /* 32 bytes */314315struct vcpu_info {316/*317* 'evtchn_upcall_pending' is written non-zero by Xen to indicate318* a pending notification for a particular VCPU. It is then cleared319* by the guest OS /before/ checking for pending work, thus avoiding320* a set-and-check race. Note that the mask is only accessed by Xen321* on the CPU that is currently hosting the VCPU. This means that the322* pending and mask flags can be updated by the guest without special323* synchronisation (i.e., no need for the x86 LOCK prefix).324* This may seem suboptimal because if the pending flag is set by325* a different CPU then an IPI may be scheduled even when the mask326* is set. However, note:327* 1. The task of 'interrupt holdoff' is covered by the per-event-328* channel mask bits. A 'noisy' event that is continually being329* triggered can be masked at source at this very precise330* granularity.331* 2. The main purpose of the per-VCPU mask is therefore to restrict332* reentrant execution: whether for concurrency control, or to333* prevent unbounded stack usage. Whatever the purpose, we expect334* that the mask will be asserted only for short periods at a time,335* and so the likelihood of a 'spurious' IPI is suitably small.336* The mask is read before making an event upcall to the guest: a337* non-zero mask therefore guarantees that the VCPU will not receive338* an upcall activation. The mask is cleared when the VCPU requests339* to block: this avoids wakeup-waiting races.340*/341uint8_t evtchn_upcall_pending;342uint8_t evtchn_upcall_mask;343unsigned long evtchn_pending_sel;344struct arch_vcpu_info arch;345struct pvclock_vcpu_time_info time;346}; /* 64 bytes (x86) */347348/*349* Xen/kernel shared data -- pointer provided in start_info.350* NB. We expect that this struct is smaller than a page.351*/352struct shared_info {353struct vcpu_info vcpu_info[MAX_VIRT_CPUS];354355/*356* A domain can create "event channels" on which it can send and receive357* asynchronous event notifications. There are three classes of event that358* are delivered by this mechanism:359* 1. Bi-directional inter- and intra-domain connections. Domains must360* arrange out-of-band to set up a connection (usually by allocating361* an unbound 'listener' port and avertising that via a storage service362* such as xenstore).363* 2. Physical interrupts. A domain with suitable hardware-access364* privileges can bind an event-channel port to a physical interrupt365* source.366* 3. Virtual interrupts ('events'). A domain can bind an event-channel367* port to a virtual interrupt source, such as the virtual-timer368* device or the emergency console.369*370* Event channels are addressed by a "port index". Each channel is371* associated with two bits of information:372* 1. PENDING -- notifies the domain that there is a pending notification373* to be processed. This bit is cleared by the guest.374* 2. MASK -- if this bit is clear then a 0->1 transition of PENDING375* will cause an asynchronous upcall to be scheduled. This bit is only376* updated by the guest. It is read-only within Xen. If a channel377* becomes pending while the channel is masked then the 'edge' is lost378* (i.e., when the channel is unmasked, the guest must manually handle379* pending notifications as no upcall will be scheduled by Xen).380*381* To expedite scanning of pending notifications, any 0->1 pending382* transition on an unmasked channel causes a corresponding bit in a383* per-vcpu selector word to be set. Each bit in the selector covers a384* 'C long' in the PENDING bitfield array.385*/386unsigned long evtchn_pending[sizeof(unsigned long) * 8];387unsigned long evtchn_mask[sizeof(unsigned long) * 8];388389/*390* Wallclock time: updated only by control software. Guests should base391* their gettimeofday() syscall on this wallclock-base value.392*/393struct pvclock_wall_clock wc;394395struct arch_shared_info arch;396397};398399/*400* Start-of-day memory layout for the initial domain (DOM0):401* 1. The domain is started within contiguous virtual-memory region.402* 2. The contiguous region begins and ends on an aligned 4MB boundary.403* 3. The region start corresponds to the load address of the OS image.404* If the load address is not 4MB aligned then the address is rounded down.405* 4. This the order of bootstrap elements in the initial virtual region:406* a. relocated kernel image407* b. initial ram disk [mod_start, mod_len]408* c. list of allocated page frames [mfn_list, nr_pages]409* d. start_info_t structure [register ESI (x86)]410* e. bootstrap page tables [pt_base, CR3 (x86)]411* f. bootstrap stack [register ESP (x86)]412* 5. Bootstrap elements are packed together, but each is 4kB-aligned.413* 6. The initial ram disk may be omitted.414* 7. The list of page frames forms a contiguous 'pseudo-physical' memory415* layout for the domain. In particular, the bootstrap virtual-memory416* region is a 1:1 mapping to the first section of the pseudo-physical map.417* 8. All bootstrap elements are mapped read-writable for the guest OS. The418* only exception is the bootstrap page table, which is mapped read-only.419* 9. There is guaranteed to be at least 512kB padding after the final420* bootstrap element. If necessary, the bootstrap virtual region is421* extended by an extra 4MB to ensure this.422*/423424#define MAX_GUEST_CMDLINE 1024425struct start_info {426/* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */427char magic[32]; /* "xen-<version>-<platform>". */428unsigned long nr_pages; /* Total pages allocated to this domain. */429unsigned long shared_info; /* MACHINE address of shared info struct. */430uint32_t flags; /* SIF_xxx flags. */431unsigned long store_mfn; /* MACHINE page number of shared page. */432uint32_t store_evtchn; /* Event channel for store communication. */433union {434struct {435unsigned long mfn; /* MACHINE page number of console page. */436uint32_t evtchn; /* Event channel for console page. */437} domU;438struct {439uint32_t info_off; /* Offset of console_info struct. */440uint32_t info_size; /* Size of console_info struct from start.*/441} dom0;442} console;443/* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */444unsigned long pt_base; /* VIRTUAL address of page directory. */445unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */446unsigned long mfn_list; /* VIRTUAL address of page-frame list. */447unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */448unsigned long mod_len; /* Size (bytes) of pre-loaded module. */449int8_t cmd_line[MAX_GUEST_CMDLINE];450};451452/* These flags are passed in the 'flags' field of start_info_t. */453#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */454#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */455456typedef uint64_t cpumap_t;457458typedef uint8_t xen_domain_handle_t[16];459460/* Turn a plain number into a C unsigned long constant. */461#define __mk_unsigned_long(x) x ## UL462#define mk_unsigned_long(x) __mk_unsigned_long(x)463464#define TMEM_SPEC_VERSION 1465466struct tmem_op {467uint32_t cmd;468int32_t pool_id;469union {470struct { /* for cmd == TMEM_NEW_POOL */471uint64_t uuid[2];472uint32_t flags;473} new;474struct {475uint64_t oid[3];476uint32_t index;477uint32_t tmem_offset;478uint32_t pfn_offset;479uint32_t len;480GUEST_HANDLE(void) gmfn; /* guest machine page frame */481} gen;482} u;483};484485#else /* __ASSEMBLY__ */486487/* In assembly code we cannot use C numeric constant suffixes. */488#define mk_unsigned_long(x) x489490#endif /* !__ASSEMBLY__ */491492#endif /* __XEN_PUBLIC_XEN_H__ */493494495