// SPDX-License-Identifier: GPL-2.01/*2* Device driver to expose SGX enclave memory to KVM guests.3*4* Copyright(c) 2021 Intel Corporation.5*/67#include <linux/kvm_types.h>8#include <linux/miscdevice.h>9#include <linux/mm.h>10#include <linux/mman.h>11#include <linux/sched/mm.h>12#include <linux/sched/signal.h>13#include <linux/slab.h>14#include <linux/xarray.h>15#include <asm/sgx.h>16#include <uapi/asm/sgx.h>1718#include "encls.h"19#include "sgx.h"2021struct sgx_vepc {22struct xarray page_array;23struct mutex lock;24};2526/*27* Temporary SECS pages that cannot be EREMOVE'd due to having child in other28* virtual EPC instances, and the lock to protect it.29*/30static struct mutex zombie_secs_pages_lock;31static struct list_head zombie_secs_pages;3233static int __sgx_vepc_fault(struct sgx_vepc *vepc,34struct vm_area_struct *vma, unsigned long addr)35{36struct sgx_epc_page *epc_page;37unsigned long index, pfn;38int ret;3940WARN_ON(!mutex_is_locked(&vepc->lock));4142/* Calculate index of EPC page in virtual EPC's page_array */43index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);4445epc_page = xa_load(&vepc->page_array, index);46if (epc_page)47return 0;4849epc_page = sgx_alloc_epc_page(vepc, false);50if (IS_ERR(epc_page))51return PTR_ERR(epc_page);5253ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));54if (ret)55goto err_free;5657pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));5859ret = vmf_insert_pfn(vma, addr, pfn);60if (ret != VM_FAULT_NOPAGE) {61ret = -EFAULT;62goto err_delete;63}6465return 0;6667err_delete:68xa_erase(&vepc->page_array, index);69err_free:70sgx_free_epc_page(epc_page);71return ret;72}7374static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)75{76struct vm_area_struct *vma = vmf->vma;77struct sgx_vepc *vepc = vma->vm_private_data;78int ret;7980mutex_lock(&vepc->lock);81ret = __sgx_vepc_fault(vepc, vma, vmf->address);82mutex_unlock(&vepc->lock);8384if (!ret)85return VM_FAULT_NOPAGE;8687if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {88mmap_read_unlock(vma->vm_mm);89return VM_FAULT_RETRY;90}9192return VM_FAULT_SIGBUS;93}9495static const struct vm_operations_struct sgx_vepc_vm_ops = {96.fault = sgx_vepc_fault,97};9899static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)100{101struct sgx_vepc *vepc = file->private_data;102103if (!(vma->vm_flags & VM_SHARED))104return -EINVAL;105106vma->vm_ops = &sgx_vepc_vm_ops;107/* Don't copy VMA in fork() */108vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);109vma->vm_private_data = vepc;110111return 0;112}113114static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)115{116/*117* Take a previously guest-owned EPC page and return it to the118* general EPC page pool.119*120* Guests can not be trusted to have left this page in a good121* state, so run EREMOVE on the page unconditionally. In the122* case that a guest properly EREMOVE'd this page, a superfluous123* EREMOVE is harmless.124*/125return __eremove(sgx_get_epc_virt_addr(epc_page));126}127128static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)129{130int ret = sgx_vepc_remove_page(epc_page);131if (ret) {132/*133* Only SGX_CHILD_PRESENT is expected, which is because of134* EREMOVE'ing an SECS still with child, in which case it can135* be handled by EREMOVE'ing the SECS again after all pages in136* virtual EPC have been EREMOVE'd. See comments in below in137* sgx_vepc_release().138*139* The user of virtual EPC (KVM) needs to guarantee there's no140* logical processor is still running in the enclave in guest,141* otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be142* handled here.143*/144WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,145ret, ret);146return ret;147}148149sgx_free_epc_page(epc_page);150return 0;151}152153static long sgx_vepc_remove_all(struct sgx_vepc *vepc)154{155struct sgx_epc_page *entry;156unsigned long index;157long failures = 0;158159xa_for_each(&vepc->page_array, index, entry) {160int ret = sgx_vepc_remove_page(entry);161if (ret) {162if (ret == SGX_CHILD_PRESENT) {163/* The page is a SECS, userspace will retry. */164failures++;165} else {166/*167* Report errors due to #GP or SGX_ENCLAVE_ACT; do not168* WARN, as userspace can induce said failures by169* calling the ioctl concurrently on multiple vEPCs or170* while one or more CPUs is running the enclave. Only171* a #PF on EREMOVE indicates a kernel/hardware issue.172*/173WARN_ON_ONCE(encls_faulted(ret) &&174ENCLS_TRAPNR(ret) != X86_TRAP_GP);175return -EBUSY;176}177}178cond_resched();179}180181/*182* Return the number of SECS pages that failed to be removed, so183* userspace knows that it has to retry.184*/185return failures;186}187188static int sgx_vepc_release(struct inode *inode, struct file *file)189{190struct sgx_vepc *vepc = file->private_data;191struct sgx_epc_page *epc_page, *tmp, *entry;192unsigned long index;193194LIST_HEAD(secs_pages);195196xa_for_each(&vepc->page_array, index, entry) {197/*198* Remove all normal, child pages. sgx_vepc_free_page()199* will fail if EREMOVE fails, but this is OK and expected on200* SECS pages. Those can only be EREMOVE'd *after* all their201* child pages. Retries below will clean them up.202*/203if (sgx_vepc_free_page(entry))204continue;205206xa_erase(&vepc->page_array, index);207cond_resched();208}209210/*211* Retry EREMOVE'ing pages. This will clean up any SECS pages that212* only had children in this 'epc' area.213*/214xa_for_each(&vepc->page_array, index, entry) {215epc_page = entry;216/*217* An EREMOVE failure here means that the SECS page still218* has children. But, since all children in this 'sgx_vepc'219* have been removed, the SECS page must have a child on220* another instance.221*/222if (sgx_vepc_free_page(epc_page))223list_add_tail(&epc_page->list, &secs_pages);224225xa_erase(&vepc->page_array, index);226cond_resched();227}228229/*230* SECS pages are "pinned" by child pages, and "unpinned" once all231* children have been EREMOVE'd. A child page in this instance232* may have pinned an SECS page encountered in an earlier release(),233* creating a zombie. Since some children were EREMOVE'd above,234* try to EREMOVE all zombies in the hopes that one was unpinned.235*/236mutex_lock(&zombie_secs_pages_lock);237list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {238/*239* Speculatively remove the page from the list of zombies,240* if the page is successfully EREMOVE'd it will be added to241* the list of free pages. If EREMOVE fails, throw the page242* on the local list, which will be spliced on at the end.243*/244list_del(&epc_page->list);245246if (sgx_vepc_free_page(epc_page))247list_add_tail(&epc_page->list, &secs_pages);248cond_resched();249}250251if (!list_empty(&secs_pages))252list_splice_tail(&secs_pages, &zombie_secs_pages);253mutex_unlock(&zombie_secs_pages_lock);254255xa_destroy(&vepc->page_array);256kfree(vepc);257258sgx_dec_usage_count();259return 0;260}261262static int __sgx_vepc_open(struct inode *inode, struct file *file)263{264struct sgx_vepc *vepc;265266vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);267if (!vepc)268return -ENOMEM;269mutex_init(&vepc->lock);270xa_init(&vepc->page_array);271272file->private_data = vepc;273274return 0;275}276277static int sgx_vepc_open(struct inode *inode, struct file *file)278{279int ret;280281ret = sgx_inc_usage_count();282if (ret)283return ret;284285ret = __sgx_vepc_open(inode, file);286if (ret) {287sgx_dec_usage_count();288return ret;289}290291return 0;292}293294static long sgx_vepc_ioctl(struct file *file,295unsigned int cmd, unsigned long arg)296{297struct sgx_vepc *vepc = file->private_data;298299switch (cmd) {300case SGX_IOC_VEPC_REMOVE_ALL:301if (arg)302return -EINVAL;303return sgx_vepc_remove_all(vepc);304305default:306return -ENOTTY;307}308}309310static const struct file_operations sgx_vepc_fops = {311.owner = THIS_MODULE,312.open = sgx_vepc_open,313.unlocked_ioctl = sgx_vepc_ioctl,314.compat_ioctl = sgx_vepc_ioctl,315.release = sgx_vepc_release,316.mmap = sgx_vepc_mmap,317};318319static struct miscdevice sgx_vepc_dev = {320.minor = MISC_DYNAMIC_MINOR,321.name = "sgx_vepc",322.nodename = "sgx_vepc",323.fops = &sgx_vepc_fops,324};325326int __init sgx_vepc_init(void)327{328/* SGX virtualization requires KVM to work */329if (!cpu_feature_enabled(X86_FEATURE_VMX))330return -ENODEV;331332INIT_LIST_HEAD(&zombie_secs_pages);333mutex_init(&zombie_secs_pages_lock);334335return misc_register(&sgx_vepc_dev);336}337338/**339* sgx_virt_ecreate() - Run ECREATE on behalf of guest340* @pageinfo: Pointer to PAGEINFO structure341* @secs: Userspace pointer to SECS page342* @trapnr: trap number injected to guest in case of ECREATE error343*344* Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose345* of enforcing policies of guest's enclaves, and return the trap number346* which should be injected to guest in case of any ECREATE error.347*348* Return:349* - 0: ECREATE was successful.350* - <0: on error.351*/352int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,353int *trapnr)354{355int ret;356357/*358* @secs is an untrusted, userspace-provided address. It comes from359* KVM and is assumed to be a valid pointer which points somewhere in360* userspace. This can fault and call SGX or other fault handlers when361* userspace mapping @secs doesn't exist.362*363* Add a WARN() to make sure @secs is already valid userspace pointer364* from caller (KVM), who should already have handled invalid pointer365* case (for instance, made by malicious guest). All other checks,366* such as alignment of @secs, are deferred to ENCLS itself.367*/368if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))369return -EINVAL;370371__uaccess_begin();372ret = __ecreate(pageinfo, (void *)secs);373__uaccess_end();374375if (encls_faulted(ret)) {376*trapnr = ENCLS_TRAPNR(ret);377return -EFAULT;378}379380/* ECREATE doesn't return an error code, it faults or succeeds. */381WARN_ON_ONCE(ret);382return 0;383}384EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate);385386static int __sgx_virt_einit(void __user *sigstruct, void __user *token,387void __user *secs)388{389int ret;390391/*392* Make sure all userspace pointers from caller (KVM) are valid.393* All other checks deferred to ENCLS itself. Also see comment394* for @secs in sgx_virt_ecreate().395*/396#define SGX_EINITTOKEN_SIZE 304397if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||398!access_ok(token, SGX_EINITTOKEN_SIZE) ||399!access_ok(secs, PAGE_SIZE)))400return -EINVAL;401402__uaccess_begin();403ret = __einit((void *)sigstruct, (void *)token, (void *)secs);404__uaccess_end();405406return ret;407}408409/**410* sgx_virt_einit() - Run EINIT on behalf of guest411* @sigstruct: Userspace pointer to SIGSTRUCT structure412* @token: Userspace pointer to EINITTOKEN structure413* @secs: Userspace pointer to SECS page414* @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values415* @trapnr: trap number injected to guest in case of EINIT error416*417* Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available418* in host, SGX driver may rewrite the hardware values at wish, therefore KVM419* needs to update hardware values to guest's virtual MSR values in order to420* ensure EINIT is executed with expected hardware values.421*422* Return:423* - 0: EINIT was successful.424* - <0: on error.425*/426int sgx_virt_einit(void __user *sigstruct, void __user *token,427void __user *secs, u64 *lepubkeyhash, int *trapnr)428{429int ret;430431if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {432ret = __sgx_virt_einit(sigstruct, token, secs);433} else {434preempt_disable();435436sgx_update_lepubkeyhash(lepubkeyhash);437438ret = __sgx_virt_einit(sigstruct, token, secs);439preempt_enable();440}441442/* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */443if (ret == -EINVAL)444return ret;445446if (encls_faulted(ret)) {447*trapnr = ENCLS_TRAPNR(ret);448return -EFAULT;449}450451return ret;452}453EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit);454455456