Path: blob/master/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Resource Director Technology (RDT)
 *
 * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
 *
 * Copyright (C) 2018 Intel Corporation
 *
 * Author: Reinette Chatre <[email protected]>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cacheflush.h>
#include <linux/cpu.h>
#include <linux/perf_event.h>
#include <linux/pm_qos.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/perf_event.h>
#include <asm/msr.h>

#include "../../events/perf_event.h" /* For X86_CONFIG() */
#include "internal.h"

#define CREATE_TRACE_POINTS

#include "pseudo_lock_trace.h"

/*
 * The bits needed to disable hardware prefetching varies based on the
 * platform. During initialization we will discover which bits to use.
 */
static u64 prefetch_disable_bits;

/**
 * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
 *                                          platforms
 * @void: It takes no parameters.
 *
 * Capture the list of platforms that have been validated to support
 * pseudo-locking. This includes testing to ensure pseudo-locked regions
 * with low cache miss rates can be created under variety of load conditions
 * as well as that these pseudo-locked regions can maintain their low cache
 * miss rates under variety of load conditions for significant lengths of time.
 *
 * After a platform has been validated to support pseudo-locking its
 * hardware prefetch disable bits are included here as they are documented
 * in the SDM.
 *
 * When adding a platform here also add support for its cache events to
 * resctrl_arch_measure_l*_residency()
 *
 * Return:
 * If platform is supported, the bits to disable hardware prefetchers, 0
 * if platform is not supported.
 */
u64 resctrl_arch_get_prefetch_disable_bits(void)
{
	prefetch_disable_bits = 0;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
	    boot_cpu_data.x86 != 6)
		return 0;

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL_X:
		/*
		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
		 * as:
		 * 0    L2 Hardware Prefetcher Disable (R/W)
		 * 1    L2 Adjacent Cache Line Prefetcher Disable (R/W)
		 * 2    DCU Hardware Prefetcher Disable (R/W)
		 * 3    DCU IP Prefetcher Disable (R/W)
		 * 63:4 Reserved
		 */
		prefetch_disable_bits = 0xF;
		break;
	case INTEL_ATOM_GOLDMONT:
	case INTEL_ATOM_GOLDMONT_PLUS:
		/*
		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
		 * as:
		 * 0    L2 Hardware Prefetcher Disable (R/W)
		 * 1    Reserved
		 * 2    DCU Hardware Prefetcher Disable (R/W)
		 * 63:3 Reserved
		 */
		prefetch_disable_bits = 0x5;
		break;
	}

	return prefetch_disable_bits;
}

/**
 * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
 * @_plr: the pseudo-lock region descriptor
 *
 * This is the core pseudo-locking flow.
 *
 * First we ensure that the kernel memory cannot be found in the cache.
 * Then, while taking care that there will be as little interference as
 * possible, the memory to be loaded is accessed while core is running
 * with class of service set to the bitmask of the pseudo-locked region.
 * After this is complete no future CAT allocations will be allowed to
 * overlap with this bitmask.
 *
 * Local register variables are utilized to ensure that the memory region
 * to be locked is the only memory access made during the critical locking
 * loop.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_pseudo_lock_fn(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	u32 rmid_p, closid_p;
	unsigned long i;
	u64 saved_msr;
#ifdef CONFIG_KASAN
	/*
	 * The registers used for local register variables are also used
	 * when KASAN is active. When KASAN is active we use a regular
	 * variable to ensure we always use a valid pointer, but the cost
	 * is that this variable will enter the cache through evicting the
	 * memory we are trying to lock into the cache. Thus expect lower
	 * pseudo-locking success rate when KASAN is active.
	 */
	unsigned int line_size;
	unsigned int size;
	void *mem_r;
#else
	register unsigned int line_size asm("esi");
	register unsigned int size asm("edi");
	register void *mem_r asm(_ASM_BX);
#endif /* CONFIG_KASAN */

	/*
	 * Make sure none of the allocated memory is cached. If it is we
	 * will get a cache hit in below loop from outside of pseudo-locked
	 * region.
	 * wbinvd (as opposed to clflush/clflushopt) is required to
	 * increase likelihood that allocated cache portion will be filled
	 * with associated memory.
	 */
	wbinvd();

	/*
	 * Always called with interrupts enabled. By disabling interrupts
	 * ensure that we will not be preempted during this critical section.
	 */
	local_irq_disable();

	/*
	 * Call wrmsr and rdmsr as directly as possible to avoid tracing
	 * clobbering local register variables or affecting cache accesses.
	 *
	 * Disable the hardware prefetcher so that when the end of the memory
	 * being pseudo-locked is reached the hardware will not read beyond
	 * the buffer and evict pseudo-locked memory read earlier from the
	 * cache.
	 */
	saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
	native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
	closid_p = this_cpu_read(pqr_state.cur_closid);
	rmid_p = this_cpu_read(pqr_state.cur_rmid);
	mem_r = plr->kmem;
	size = plr->size;
	line_size = plr->line_size;
	/*
	 * Critical section begin: start by writing the closid associated
	 * with the capacity bitmask of the cache region being
	 * pseudo-locked followed by reading of kernel memory to load it
	 * into the cache.
	 */
	native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);

	/*
	 * Cache was flushed earlier. Now access kernel memory to read it
	 * into cache region associated with just activated plr->closid.
	 * Loop over data twice:
	 * - In first loop the cache region is shared with the page walker
	 *   as it populates the paging structure caches (including TLB).
	 * - In the second loop the paging structure caches are used and
	 *   cache region is populated with the memory being referenced.
	 */
	for (i = 0; i < size; i += PAGE_SIZE) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	for (i = 0; i < size; i += line_size) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	/*
	 * Critical section end: restore closid with capacity bitmask that
	 * does not overlap with pseudo-locked region.
	 */
	native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);

	/* Re-enable the hardware prefetcher(s) */
	wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr);
	local_irq_enable();

	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/**
 * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
 *                                      pseudo-locked memory
 * @_plr: pseudo-lock region to measure
 *
 * There is no deterministic way to test if a memory region is cached. One
 * way is to measure how long it takes to read the memory, the speed of
 * access is a good way to learn how close to the cpu the data was. Even
 * more, if the prefetcher is disabled and the memory is read at a stride
 * of half the cache line, then a cache miss will be easy to spot since the
 * read of the first half would be significantly slower than the read of
 * the second half.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_measure_cycles_lat_fn(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	u32 saved_low, saved_high;
	unsigned long i;
	u64 start, end;
	void *mem_r;

	local_irq_disable();
	/*
	 * Disable hardware prefetchers.
	 */
	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
	mem_r = READ_ONCE(plr->kmem);
	/*
	 * Dummy execute of the time measurement to load the needed
	 * instructions into the L1 instruction cache.
	 */
	start = rdtsc_ordered();
	for (i = 0; i < plr->size; i += 32) {
		start = rdtsc_ordered();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
		end = rdtsc_ordered();
		trace_pseudo_lock_mem_latency((u32)(end - start));
	}
	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	local_irq_enable();
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/*
 * Create a perf_event_attr for the hit and miss perf events that will
 * be used during the performance measurement. A perf_event maintains
 * a pointer to its perf_event_attr so a unique attribute structure is
 * created for each perf_event.
 *
 * The actual configuration of the event is set right before use in order
 * to use the X86_CONFIG macro.
 */
static struct perf_event_attr perf_miss_attr = {
	.type		= PERF_TYPE_RAW,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 0,
	.exclude_user	= 1,
};

static struct perf_event_attr perf_hit_attr = {
	.type		= PERF_TYPE_RAW,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 0,
	.exclude_user	= 1,
};

struct residency_counts {
	u64 miss_before, hits_before;
	u64 miss_after, hits_after;
};

static int measure_residency_fn(struct perf_event_attr *miss_attr,
				struct perf_event_attr *hit_attr,
				struct pseudo_lock_region *plr,
				struct residency_counts *counts)
{
	u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
	struct perf_event *miss_event, *hit_event;
	int hit_pmcnum, miss_pmcnum;
	u32 saved_low, saved_high;
	unsigned int line_size;
	unsigned int size;
	unsigned long i;
	void *mem_r;
	u64 tmp;

	miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
						      NULL, NULL, NULL);
	if (IS_ERR(miss_event))
		goto out;

	hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
						     NULL, NULL, NULL);
	if (IS_ERR(hit_event))
		goto out_miss;

	local_irq_disable();
	/*
	 * Check any possible error state of events used by performing
	 * one local read.
	 */
	if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}
	if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}

	/*
	 * Disable hardware prefetchers.
	 */
	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);

	/* Initialize rest of local variables */
	/*
	 * Performance event has been validated right before this with
	 * interrupts disabled - it is thus safe to read the counter index.
	 */
	miss_pmcnum = x86_perf_rdpmc_index(miss_event);
	hit_pmcnum = x86_perf_rdpmc_index(hit_event);
	line_size = READ_ONCE(plr->line_size);
	mem_r = READ_ONCE(plr->kmem);
	size = READ_ONCE(plr->size);

	/*
	 * Read counter variables twice - first to load the instructions
	 * used in L1 cache, second to capture accurate value that does not
	 * include cache misses incurred because of instruction loads.
	 */
	hits_before = rdpmc(hit_pmcnum);
	miss_before = rdpmc(miss_pmcnum);
	/*
	 * From SDM: Performing back-to-back fast reads are not guaranteed
	 * to be monotonic.
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	hits_before = rdpmc(hit_pmcnum);
	miss_before = rdpmc(miss_pmcnum);
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	for (i = 0; i < size; i += line_size) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	hits_after = rdpmc(hit_pmcnum);
	miss_after = rdpmc(miss_pmcnum);
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	/* Re-enable hardware prefetchers */
	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	local_irq_enable();
out_hit:
	perf_event_release_kernel(hit_event);
out_miss:
	perf_event_release_kernel(miss_event);
out:
	/*
	 * All counts will be zero on failure.
	 */
	counts->miss_before = miss_before;
	counts->hits_before = hits_before;
	counts->miss_after = miss_after;
	counts->hits_after = hits_after;
	return 0;
}

int resctrl_arch_measure_l2_residency(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	 * Non-architectural event for the Goldmont Microarchitecture
	 * from Intel x86 Architecture Software Developer Manual (SDM):
	 * MEM_LOAD_UOPS_RETIRED D1H (event number)
	 * Umask values:
	 *     L2_HIT   02H
	 *     L2_MISS  10H
	 */
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_ATOM_GOLDMONT:
	case INTEL_ATOM_GOLDMONT_PLUS:
		perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
						   .umask = 0x10);
		perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
						  .umask = 0x2);
		break;
	default:
		goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	 * If a failure prevented the measurements from succeeding
	 * tracepoints will still be written and all counts will be zero.
	 */
	trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
			     counts.miss_after - counts.miss_before);
out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

int resctrl_arch_measure_l3_residency(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	 * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
	 * has two "no fix" errata associated with it: BDM35 and BDM100. On
	 * this platform the following events are used instead:
	 * LONGEST_LAT_CACHE 2EH (Documented in SDM)
	 *     REFERENCE 4FH
	 *     MISS      41H
	 */

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL_X:
		/* On BDW the hit event counts references, not hits */
		perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
						  .umask = 0x4f);
		perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
						   .umask = 0x41);
		break;
	default:
		goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	 * If a failure prevented the measurements from succeeding
	 * tracepoints will still be written and all counts will be zero.
	 */

	counts.miss_after -= counts.miss_before;
	if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
		/*
		 * On BDW references and misses are counted, need to adjust.
		 * Sometimes the "hits" counter is a bit more than the
		 * references, for example, x references but x + 1 hits.
		 * To not report invalid hit values in this case we treat
		 * that as misses equal to references.
		 */
		/* First compute the number of cache references measured */
		counts.hits_after -= counts.hits_before;
		/* Next convert references to cache hits */
		counts.hits_after -= min(counts.miss_after, counts.hits_after);
	} else {
		counts.hits_after -= counts.hits_before;
	}

	trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}