// SPDX-License-Identifier: GPL-2.01/*2* Device driver to expose SGX enclave memory to KVM guests.3*4* Copyright(c) 2021 Intel Corporation.5*/67#include <linux/miscdevice.h>8#include <linux/mm.h>9#include <linux/mman.h>10#include <linux/sched/mm.h>11#include <linux/sched/signal.h>12#include <linux/slab.h>13#include <linux/xarray.h>14#include <asm/sgx.h>15#include <uapi/asm/sgx.h>1617#include "encls.h"18#include "sgx.h"1920struct sgx_vepc {21struct xarray page_array;22struct mutex lock;23};2425/*26* Temporary SECS pages that cannot be EREMOVE'd due to having child in other27* virtual EPC instances, and the lock to protect it.28*/29static struct mutex zombie_secs_pages_lock;30static struct list_head zombie_secs_pages;3132static int __sgx_vepc_fault(struct sgx_vepc *vepc,33struct vm_area_struct *vma, unsigned long addr)34{35struct sgx_epc_page *epc_page;36unsigned long index, pfn;37int ret;3839WARN_ON(!mutex_is_locked(&vepc->lock));4041/* Calculate index of EPC page in virtual EPC's page_array */42index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);4344epc_page = xa_load(&vepc->page_array, index);45if (epc_page)46return 0;4748epc_page = sgx_alloc_epc_page(vepc, false);49if (IS_ERR(epc_page))50return PTR_ERR(epc_page);5152ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));53if (ret)54goto err_free;5556pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));5758ret = vmf_insert_pfn(vma, addr, pfn);59if (ret != VM_FAULT_NOPAGE) {60ret = -EFAULT;61goto err_delete;62}6364return 0;6566err_delete:67xa_erase(&vepc->page_array, index);68err_free:69sgx_free_epc_page(epc_page);70return ret;71}7273static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)74{75struct vm_area_struct *vma = vmf->vma;76struct sgx_vepc *vepc = vma->vm_private_data;77int ret;7879mutex_lock(&vepc->lock);80ret = __sgx_vepc_fault(vepc, vma, vmf->address);81mutex_unlock(&vepc->lock);8283if (!ret)84return VM_FAULT_NOPAGE;8586if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {87mmap_read_unlock(vma->vm_mm);88return VM_FAULT_RETRY;89}9091return VM_FAULT_SIGBUS;92}9394static const struct vm_operations_struct sgx_vepc_vm_ops = {95.fault = sgx_vepc_fault,96};9798static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)99{100struct sgx_vepc *vepc = file->private_data;101102if (!(vma->vm_flags & VM_SHARED))103return -EINVAL;104105vma->vm_ops = &sgx_vepc_vm_ops;106/* Don't copy VMA in fork() */107vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);108vma->vm_private_data = vepc;109110return 0;111}112113static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)114{115/*116* Take a previously guest-owned EPC page and return it to the117* general EPC page pool.118*119* Guests can not be trusted to have left this page in a good120* state, so run EREMOVE on the page unconditionally. In the121* case that a guest properly EREMOVE'd this page, a superfluous122* EREMOVE is harmless.123*/124return __eremove(sgx_get_epc_virt_addr(epc_page));125}126127static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)128{129int ret = sgx_vepc_remove_page(epc_page);130if (ret) {131/*132* Only SGX_CHILD_PRESENT is expected, which is because of133* EREMOVE'ing an SECS still with child, in which case it can134* be handled by EREMOVE'ing the SECS again after all pages in135* virtual EPC have been EREMOVE'd. See comments in below in136* sgx_vepc_release().137*138* The user of virtual EPC (KVM) needs to guarantee there's no139* logical processor is still running in the enclave in guest,140* otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be141* handled here.142*/143WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,144ret, ret);145return ret;146}147148sgx_free_epc_page(epc_page);149return 0;150}151152static long sgx_vepc_remove_all(struct sgx_vepc *vepc)153{154struct sgx_epc_page *entry;155unsigned long index;156long failures = 0;157158xa_for_each(&vepc->page_array, index, entry) {159int ret = sgx_vepc_remove_page(entry);160if (ret) {161if (ret == SGX_CHILD_PRESENT) {162/* The page is a SECS, userspace will retry. */163failures++;164} else {165/*166* Report errors due to #GP or SGX_ENCLAVE_ACT; do not167* WARN, as userspace can induce said failures by168* calling the ioctl concurrently on multiple vEPCs or169* while one or more CPUs is running the enclave. Only170* a #PF on EREMOVE indicates a kernel/hardware issue.171*/172WARN_ON_ONCE(encls_faulted(ret) &&173ENCLS_TRAPNR(ret) != X86_TRAP_GP);174return -EBUSY;175}176}177cond_resched();178}179180/*181* Return the number of SECS pages that failed to be removed, so182* userspace knows that it has to retry.183*/184return failures;185}186187static int sgx_vepc_release(struct inode *inode, struct file *file)188{189struct sgx_vepc *vepc = file->private_data;190struct sgx_epc_page *epc_page, *tmp, *entry;191unsigned long index;192193LIST_HEAD(secs_pages);194195xa_for_each(&vepc->page_array, index, entry) {196/*197* Remove all normal, child pages. sgx_vepc_free_page()198* will fail if EREMOVE fails, but this is OK and expected on199* SECS pages. Those can only be EREMOVE'd *after* all their200* child pages. Retries below will clean them up.201*/202if (sgx_vepc_free_page(entry))203continue;204205xa_erase(&vepc->page_array, index);206cond_resched();207}208209/*210* Retry EREMOVE'ing pages. This will clean up any SECS pages that211* only had children in this 'epc' area.212*/213xa_for_each(&vepc->page_array, index, entry) {214epc_page = entry;215/*216* An EREMOVE failure here means that the SECS page still217* has children. But, since all children in this 'sgx_vepc'218* have been removed, the SECS page must have a child on219* another instance.220*/221if (sgx_vepc_free_page(epc_page))222list_add_tail(&epc_page->list, &secs_pages);223224xa_erase(&vepc->page_array, index);225cond_resched();226}227228/*229* SECS pages are "pinned" by child pages, and "unpinned" once all230* children have been EREMOVE'd. A child page in this instance231* may have pinned an SECS page encountered in an earlier release(),232* creating a zombie. Since some children were EREMOVE'd above,233* try to EREMOVE all zombies in the hopes that one was unpinned.234*/235mutex_lock(&zombie_secs_pages_lock);236list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {237/*238* Speculatively remove the page from the list of zombies,239* if the page is successfully EREMOVE'd it will be added to240* the list of free pages. If EREMOVE fails, throw the page241* on the local list, which will be spliced on at the end.242*/243list_del(&epc_page->list);244245if (sgx_vepc_free_page(epc_page))246list_add_tail(&epc_page->list, &secs_pages);247cond_resched();248}249250if (!list_empty(&secs_pages))251list_splice_tail(&secs_pages, &zombie_secs_pages);252mutex_unlock(&zombie_secs_pages_lock);253254xa_destroy(&vepc->page_array);255kfree(vepc);256257return 0;258}259260static int sgx_vepc_open(struct inode *inode, struct file *file)261{262struct sgx_vepc *vepc;263264vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);265if (!vepc)266return -ENOMEM;267mutex_init(&vepc->lock);268xa_init(&vepc->page_array);269270file->private_data = vepc;271272return 0;273}274275static long sgx_vepc_ioctl(struct file *file,276unsigned int cmd, unsigned long arg)277{278struct sgx_vepc *vepc = file->private_data;279280switch (cmd) {281case SGX_IOC_VEPC_REMOVE_ALL:282if (arg)283return -EINVAL;284return sgx_vepc_remove_all(vepc);285286default:287return -ENOTTY;288}289}290291static const struct file_operations sgx_vepc_fops = {292.owner = THIS_MODULE,293.open = sgx_vepc_open,294.unlocked_ioctl = sgx_vepc_ioctl,295.compat_ioctl = sgx_vepc_ioctl,296.release = sgx_vepc_release,297.mmap = sgx_vepc_mmap,298};299300static struct miscdevice sgx_vepc_dev = {301.minor = MISC_DYNAMIC_MINOR,302.name = "sgx_vepc",303.nodename = "sgx_vepc",304.fops = &sgx_vepc_fops,305};306307int __init sgx_vepc_init(void)308{309/* SGX virtualization requires KVM to work */310if (!cpu_feature_enabled(X86_FEATURE_VMX))311return -ENODEV;312313INIT_LIST_HEAD(&zombie_secs_pages);314mutex_init(&zombie_secs_pages_lock);315316return misc_register(&sgx_vepc_dev);317}318319/**320* sgx_virt_ecreate() - Run ECREATE on behalf of guest321* @pageinfo: Pointer to PAGEINFO structure322* @secs: Userspace pointer to SECS page323* @trapnr: trap number injected to guest in case of ECREATE error324*325* Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose326* of enforcing policies of guest's enclaves, and return the trap number327* which should be injected to guest in case of any ECREATE error.328*329* Return:330* - 0: ECREATE was successful.331* - <0: on error.332*/333int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,334int *trapnr)335{336int ret;337338/*339* @secs is an untrusted, userspace-provided address. It comes from340* KVM and is assumed to be a valid pointer which points somewhere in341* userspace. This can fault and call SGX or other fault handlers when342* userspace mapping @secs doesn't exist.343*344* Add a WARN() to make sure @secs is already valid userspace pointer345* from caller (KVM), who should already have handled invalid pointer346* case (for instance, made by malicious guest). All other checks,347* such as alignment of @secs, are deferred to ENCLS itself.348*/349if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))350return -EINVAL;351352__uaccess_begin();353ret = __ecreate(pageinfo, (void *)secs);354__uaccess_end();355356if (encls_faulted(ret)) {357*trapnr = ENCLS_TRAPNR(ret);358return -EFAULT;359}360361/* ECREATE doesn't return an error code, it faults or succeeds. */362WARN_ON_ONCE(ret);363return 0;364}365EXPORT_SYMBOL_GPL(sgx_virt_ecreate);366367static int __sgx_virt_einit(void __user *sigstruct, void __user *token,368void __user *secs)369{370int ret;371372/*373* Make sure all userspace pointers from caller (KVM) are valid.374* All other checks deferred to ENCLS itself. Also see comment375* for @secs in sgx_virt_ecreate().376*/377#define SGX_EINITTOKEN_SIZE 304378if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||379!access_ok(token, SGX_EINITTOKEN_SIZE) ||380!access_ok(secs, PAGE_SIZE)))381return -EINVAL;382383__uaccess_begin();384ret = __einit((void *)sigstruct, (void *)token, (void *)secs);385__uaccess_end();386387return ret;388}389390/**391* sgx_virt_einit() - Run EINIT on behalf of guest392* @sigstruct: Userspace pointer to SIGSTRUCT structure393* @token: Userspace pointer to EINITTOKEN structure394* @secs: Userspace pointer to SECS page395* @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values396* @trapnr: trap number injected to guest in case of EINIT error397*398* Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available399* in host, SGX driver may rewrite the hardware values at wish, therefore KVM400* needs to update hardware values to guest's virtual MSR values in order to401* ensure EINIT is executed with expected hardware values.402*403* Return:404* - 0: EINIT was successful.405* - <0: on error.406*/407int sgx_virt_einit(void __user *sigstruct, void __user *token,408void __user *secs, u64 *lepubkeyhash, int *trapnr)409{410int ret;411412if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {413ret = __sgx_virt_einit(sigstruct, token, secs);414} else {415preempt_disable();416417sgx_update_lepubkeyhash(lepubkeyhash);418419ret = __sgx_virt_einit(sigstruct, token, secs);420preempt_enable();421}422423/* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */424if (ret == -EINVAL)425return ret;426427if (encls_faulted(ret)) {428*trapnr = ENCLS_TRAPNR(ret);429return -EFAULT;430}431432return ret;433}434EXPORT_SYMBOL_GPL(sgx_virt_einit);435436437