Path: arch/x86/boot/compressed/ident_map_64.c
// SPDX-License-Identifier: GPL-2.0
/*
 * This code is used on x86_64 to create page table identity mappings on
 * demand by building up a new set of page tables (or appending to the
 * existing ones), and then switching over to them when ready.
 *
 * Copyright (C) 2015-2016 Yinghai Lu
 * Copyright (C) 2016 Kees Cook
 */

/* No MITIGATION_PAGE_TABLE_ISOLATION support needed either: */
#undef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

#include "error.h"
#include "misc.h"

/* These actually do the work of building the kernel identity maps. */
#include <linux/pgtable.h>
#include <asm/cmpxchg.h>
#include <asm/trap_pf.h>
#include <asm/trapnr.h>
#include <asm/init.h>
/* Use the static base for this part of the boot process */
#undef __PAGE_OFFSET
#define __PAGE_OFFSET __PAGE_OFFSET_BASE
#include "../../mm/ident_map.c"

#define _SETUP
#include <asm/setup.h>	/* For COMMAND_LINE_SIZE */
#undef _SETUP

extern unsigned long get_cmd_line_ptr(void);

/* Used by PAGE_KERN* macros: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;

/* Used to track our page table allocation area. */
struct alloc_pgt_data {
	unsigned char *pgt_buf;
	unsigned long pgt_buf_size;
	unsigned long pgt_buf_offset;
};

/*
 * Allocates space for a page table entry, using struct alloc_pgt_data
 * above. Besides the local callers, this is used as the allocation
 * callback in mapping_info below.
 */
static void *alloc_pgt_page(void *context)
{
	struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
	unsigned char *entry;

	/* Validate there is space available for a new page. */
	if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
		debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
		debug_putaddr(pages->pgt_buf_offset);
		debug_putaddr(pages->pgt_buf_size);
		return NULL;
	}

	/* Consumed more tables than expected? */
	if (pages->pgt_buf_offset == BOOT_PGT_SIZE_WARN) {
		debug_putstr("pgt_buf running low in " __FILE__ "\n");
		debug_putstr("Need to raise BOOT_PGT_SIZE?\n");
		debug_putaddr(pages->pgt_buf_offset);
		debug_putaddr(pages->pgt_buf_size);
	}

	entry = pages->pgt_buf + pages->pgt_buf_offset;
	pages->pgt_buf_offset += PAGE_SIZE;

	return entry;
}

/* Used to track our allocated page tables. */
static struct alloc_pgt_data pgt_data;

/* The top level page table entry pointer. */
static unsigned long top_level_pgt;

phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;

/*
 * Mapping information structure passed to kernel_ident_mapping_init().
 * Due to relocation, pointers must be assigned at run time not build time.
 */
static struct x86_mapping_info mapping_info;

/*
 * Adds the specified range to the identity mappings.
 */
void kernel_add_identity_map(unsigned long start, unsigned long end)
{
	int ret;

	/* Align boundary to 2M. */
	start = round_down(start, PMD_SIZE);
	end = round_up(end, PMD_SIZE);
	if (start >= end)
		return;

	/* Build the mapping. */
	ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end);
	if (ret)
		error("Error: kernel_ident_mapping_init() failed\n");
}
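/*
 * Example (illustrative only, with a made-up address range): mappings are
 * built at PMD granularity, so kernel_add_identity_map() widens a request
 * to the surrounding 2M boundaries before handing it to
 * kernel_ident_mapping_init(). A call such as
 *
 *	kernel_add_identity_map(0x1234567, 0x1236789);
 *
 * therefore identity-maps the whole region [0x1200000, 0x1400000), not just
 * the pages containing the two addresses.
 */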
/* Locates and clears a region for a new top level page table. */
void initialize_identity_maps(void *rmode)
{
	unsigned long cmdline;
	struct setup_data *sd;

	/* Exclude the encryption mask from __PHYSICAL_MASK */
	physical_mask &= ~sme_me_mask;

	/* Init mapping_info with run-time function/buffer pointers. */
	mapping_info.alloc_pgt_page = alloc_pgt_page;
	mapping_info.context = &pgt_data;
	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
	mapping_info.kernpg_flag = _KERNPG_TABLE;

	/*
	 * It should be impossible for this not to already be true,
	 * but since calling this a second time would rewind the other
	 * counters, let's just make sure this is reset too.
	 */
	pgt_data.pgt_buf_offset = 0;

	/*
	 * If we came here via startup_32(), cr3 will be _pgtable already
	 * and we must append to the existing area instead of entirely
	 * overwriting it.
	 *
	 * With 5-level paging, we use '_pgtable' to allocate the p4d page table,
	 * the top-level page table is allocated separately.
	 *
	 * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
	 * cases. On 4-level paging it's equal to 'top_level_pgt'.
	 */
	top_level_pgt = read_cr3_pa();
	if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
		pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
	} else {
		pgt_data.pgt_buf = _pgtable;
		pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
		top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
	}

	/*
	 * New page-table is set up - map the kernel image, boot_params and the
	 * command line. The uncompressed kernel requires boot_params and the
	 * command line to be mapped in the identity mapping. Map them
	 * explicitly here in case the compressed kernel does not touch them,
	 * or does not touch all the pages covering them.
	 */
	kernel_add_identity_map((unsigned long)_head, (unsigned long)_end);
	boot_params_ptr = rmode;
	kernel_add_identity_map((unsigned long)boot_params_ptr,
				(unsigned long)(boot_params_ptr + 1));
	cmdline = get_cmd_line_ptr();
	kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);

	/*
	 * Also map the setup_data entries passed via boot_params in case they
	 * need to be accessed by uncompressed kernel via the identity mapping.
	 */
	sd = (struct setup_data *)boot_params_ptr->hdr.setup_data;
	while (sd) {
		unsigned long sd_addr = (unsigned long)sd;

		kernel_add_identity_map(sd_addr, sd_addr + sizeof(*sd) + sd->len);
		sd = (struct setup_data *)sd->next;
	}

	sev_prep_identity_maps(top_level_pgt);

	/* Load the new page-table. */
	write_cr3(top_level_pgt);

	/*
	 * Now that the required page table mappings are established and a
	 * GHCB can be used, check for SNP guest/HV feature compatibility.
	 */
	snp_check_features();
}

static pte_t *split_large_pmd(struct x86_mapping_info *info,
			      pmd_t *pmdp, unsigned long __address)
{
	unsigned long page_flags;
	unsigned long address;
	pte_t *pte;
	pmd_t pmd;
	int i;

	pte = (pte_t *)info->alloc_pgt_page(info->context);
	if (!pte)
		return NULL;

	address = __address & PMD_MASK;
	/* No large page - clear PSE flag */
	page_flags = info->page_flag & ~_PAGE_PSE;

	/* Populate the PTEs */
	for (i = 0; i < PTRS_PER_PMD; i++) {
		set_pte(&pte[i], __pte(address | page_flags));
		address += PAGE_SIZE;
	}

	/*
	 * Ideally we need to clear the large PMD first and do a TLB
	 * flush before we write the new PMD. But the 2M range of the
	 * PMD might contain the code we execute and/or the stack
	 * we are on, so we can't do that. But that should be safe here
	 * because we are going from large to small mappings and we are
	 * also the only user of the page-table, so there is no chance
	 * of a TLB multihit.
	 */
	pmd = __pmd((unsigned long)pte | info->kernpg_flag);
	set_pmd(pmdp, pmd);
	/* Flush TLB to establish the new PMD */
	write_cr3(top_level_pgt);

	return pte + pte_index(__address);
}
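/*
 * Sizing note (informational): with 4K pages, PTRS_PER_PMD is 512, so the
 * loop in split_large_pmd() turns one 2M mapping into 512 small mappings
 * covering the same range (512 * 4096 bytes = 2 MiB). The returned pointer
 * is the PTE slot covering the original address, ready for the caller to
 * modify.
 */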
static void clflush_page(unsigned long address)
{
	unsigned int flush_size;
	char *cl, *start, *end;

	/*
	 * Hardcode cl-size to 64 - CPUID can't be used here because that might
	 * cause another #VC exception and the GHCB is not ready to use yet.
	 */
	flush_size = 64;
	start = (char *)(address & PAGE_MASK);
	end = start + PAGE_SIZE;

	/*
	 * First make sure there are no pending writes on the cache-lines to
	 * flush.
	 */
	asm volatile("mfence" : : : "memory");

	for (cl = start; cl != end; cl += flush_size)
		clflush(cl);
}

static int set_clr_page_flags(struct x86_mapping_info *info,
			      unsigned long address,
			      pteval_t set, pteval_t clr)
{
	pgd_t *pgdp = (pgd_t *)top_level_pgt;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep, pte;

	/*
	 * First make sure there is a PMD mapping for 'address'.
	 * It should already exist, but keep things generic.
	 *
	 * To map the page just read from it and fault it in if there is no
	 * mapping yet. kernel_add_identity_map() can't be called here because
	 * that would unconditionally map the address on PMD level, destroying
	 * any PTE-level mappings that might already exist. Use assembly here
	 * so the access won't be optimized away.
	 */
	asm volatile("mov %[address], %%r9"
		     :: [address] "g" (*(unsigned long *)address)
		     : "r9", "memory");

	/*
	 * The page is mapped at least with PMD size - so skip checks and walk
	 * directly to the PMD.
	 */
	p4dp = p4d_offset(pgdp, address);
	pudp = pud_offset(p4dp, address);
	pmdp = pmd_offset(pudp, address);

	if (pmd_leaf(*pmdp))
		ptep = split_large_pmd(info, pmdp, address);
	else
		ptep = pte_offset_kernel(pmdp, address);

	if (!ptep)
		return -ENOMEM;

	/*
	 * Changing encryption attributes of a page requires to flush it from
	 * the caches.
	 */
	if ((set | clr) & _PAGE_ENC) {
		clflush_page(address);

		/*
		 * If the encryption attribute is being cleared, change the page state
		 * to shared in the RMP table.
		 */
		if (clr)
			snp_set_page_shared(__pa(address & PAGE_MASK));
	}

	/* Update PTE */
	pte = *ptep;
	pte = pte_set_flags(pte, set);
	pte = pte_clear_flags(pte, clr);
	set_pte(ptep, pte);

	/*
	 * If the encryption attribute is being set, then change the page state to
	 * private in the RMP entry. The page state change must be done after the PTE
	 * is updated.
	 */
	if (set & _PAGE_ENC)
		snp_set_page_private(__pa(address & PAGE_MASK));

	/* Flush TLB after changing encryption attribute */
	write_cr3(top_level_pgt);

	return 0;
}

int set_page_decrypted(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC);
}

int set_page_encrypted(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0);
}

int set_page_non_present(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT);
}
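/*
 * Illustrative usage (a sketch, not a call site in this file): code that
 * needs to share a page with the hypervisor, e.g. the SEV-ES GHCB page set
 * up elsewhere in the decompressor, would do something like
 *
 *	if (set_page_decrypted(page_address))
 *		error("Failed to make page shared\n");
 *
 * and later restore the encrypted state with set_page_encrypted(). Here
 * page_address is a hypothetical page-aligned linear address.
 */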
static void do_pf_error(const char *msg, unsigned long error_code,
			unsigned long address, unsigned long ip)
{
	error_putstr(msg);

	error_putstr("\nError Code: ");
	error_puthex(error_code);
	error_putstr("\nCR2: 0x");
	error_puthex(address);
	error_putstr("\nRIP relative to _head: 0x");
	error_puthex(ip - (unsigned long)_head);
	error_putstr("\n");

	error("Stopping.\n");
}

void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address = native_read_cr2();
	unsigned long end;
	bool ghcb_fault;

	ghcb_fault = sev_es_check_ghcb_fault(address);

	address &= PMD_MASK;
	end = address + PMD_SIZE;

	/*
	 * Check for unexpected error codes. Unexpected are:
	 *	- Faults on present pages
	 *	- User faults
	 *	- Reserved bits set
	 */
	if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD))
		do_pf_error("Unexpected page-fault:", error_code, address, regs->ip);
	else if (ghcb_fault)
		do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip);

	/*
	 * Error code is sane - now identity map the 2M region around
	 * the faulting address.
	 */
	kernel_add_identity_map(address, end);
}

void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code)
{
	spurious_nmi_count++;
}
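/*
 * Worked example (hypothetical address): if the decompressor touches
 * 0x7f3c1234 before it is mapped, the CPU raises #PF, do_boot_page_fault()
 * rounds the faulting address down to the 2M boundary and identity-maps
 * [0x7f200000, 0x7f400000), and the faulting instruction is then restarted.
 */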