Path: blob/master/arch/x86/kernel/cpu/resctrl/monitor.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <[email protected]>
 *
 * This replaces the perf-based cqm.c, but we reuse a lot of
 * code and data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel(R) x86 Architecture
 * Software Developer Manual, June 2016, volume 3, section 17.17.
 */

#define pr_fmt(fmt) "resctrl: " fmt

#include <linux/cpu.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/msr.h>

#include "internal.h"

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

/*
 * Global to indicate which monitoring events are enabled.
 */
unsigned int rdt_mon_features;

#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))

static int snc_nodes_per_l3_cache = 1;

/*
 * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified for better use in code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so that no correction
 *    is done for that case.
 * 2. The MBM total and local correction table is indexed by a core count,
 *    which is equal to (x86_cache_max_rmid + 1) / 8 - 1 and ranges from
 *    0 up to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so the
 *    corrected value can be calculated quickly by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
static const struct mbm_correction_factor_table {
        u32 rmidthreshold;
        u64 cf;
} mbm_cf_table[] __initconst = {
        {7, CF(1.000000)},
        {15, CF(1.000000)},
        {15, CF(0.969650)},
        {31, CF(1.000000)},
        {31, CF(1.066667)},
        {31, CF(0.969650)},
        {47, CF(1.142857)},
        {63, CF(1.000000)},
        {63, CF(1.185115)},
        {63, CF(1.066553)},
        {79, CF(1.454545)},
        {95, CF(1.000000)},
        {95, CF(1.230769)},
        {95, CF(1.142857)},
        {95, CF(1.066667)},
        {127, CF(1.000000)},
        {127, CF(1.254863)},
        {127, CF(1.185255)},
        {151, CF(1.000000)},
        {127, CF(1.066667)},
        {167, CF(1.000000)},
        {159, CF(1.454334)},
        {183, CF(1.000000)},
        {127, CF(0.969744)},
        {191, CF(1.280246)},
        {191, CF(1.230921)},
        {215, CF(1.000000)},
        {191, CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
        /* Correct MBM value. */
        if (rmid > mbm_cf_rmidthreshold)
                val = (val * mbm_cf) >> 20;

        return val;
}
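/*
 * Worked example of the fixed-point correction above (illustrative only,
 * with assumed values): CF(1.066667) evaluates to 1118481, so an MBM
 * count of 1000000 for an RMID above the threshold is corrected to
 * (1000000 * 1118481) >> 20 == 1066666, i.e. scaled by ~1.066667 using
 * only an integer multiply and a shift, with no floating point at run time.
 */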
/*
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
 * needed. The physical RMID is the same as the logical RMID.
 *
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
 * Resource Director Technology Architecture Specification" for a full
 * description of RMID sharing mode).
 *
 * In RMID sharing mode there are fewer "logical RMID" values available
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
 * each SNC node.
 *
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
 *
 * Data is collected independently on each SNC node and can be retrieved
 * using the "physical RMID" value computed by this function and loaded
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
 *
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
 * cache. So a "physical RMID" may be read from any CPU that shares
 * the L3 cache with the desired SNC node, not just from a CPU in
 * the specific SNC node.
 */
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
        struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

        if (snc_nodes_per_l3_cache == 1)
                return lrmid;

        return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
}
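/*
 * Illustrative example of the translation above (assumed values, not
 * taken from any particular platform): on an SNC-2 system with
 * r->num_rmid == 64, a CPU in the second SNC node of its socket
 * (cpu_to_node(cpu) % 2 == 1) using logical RMID 5 reads its counters
 * through physical RMID 5 + 1 * 64 == 69.
 */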
static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
        u64 msr_val;

        /*
         * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
         * with a valid event code for a supported resource type and the bits
         * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with a valid RMID,
         * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
         * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
         * are error bits.
         */
        wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
        rdmsrq(MSR_IA32_QM_CTR, msr_val);

        if (msr_val & RMID_VAL_ERROR)
                return -EIO;
        if (msr_val & RMID_VAL_UNAVAIL)
                return -EINVAL;

        *val = msr_val;
        return 0;
}

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
                                                 u32 rmid,
                                                 enum resctrl_event_id eventid)
{
        switch (eventid) {
        case QOS_L3_OCCUP_EVENT_ID:
                return NULL;
        case QOS_L3_MBM_TOTAL_EVENT_ID:
                return &hw_dom->arch_mbm_total[rmid];
        case QOS_L3_MBM_LOCAL_EVENT_ID:
                return &hw_dom->arch_mbm_local[rmid];
        default:
                /* Never expect to get here */
                WARN_ON_ONCE(1);
                return NULL;
        }
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
                             u32 unused, u32 rmid,
                             enum resctrl_event_id eventid)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        int cpu = cpumask_any(&d->hdr.cpu_mask);
        struct arch_mbm_state *am;
        u32 prmid;

        am = get_arch_mbm_state(hw_dom, rmid, eventid);
        if (am) {
                memset(am, 0, sizeof(*am));

                prmid = logical_rmid_to_physical_rmid(cpu, rmid);
                /* Record any initial, non-zero count value. */
                __rmid_read_phys(prmid, eventid, &am->prev_msr);
        }
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);

        if (resctrl_arch_is_mbm_total_enabled())
                memset(hw_dom->arch_mbm_total, 0,
                       sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);

        if (resctrl_arch_is_mbm_local_enabled())
                memset(hw_dom->arch_mbm_local, 0,
                       sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}

static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
        u64 shift = 64 - width, chunks;

        chunks = (cur_msr << shift) - (prev_msr << shift);
        return chunks >> shift;
}
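/*
 * Worked example of the wraparound handling above (illustrative, with an
 * assumed 24-bit counter width): shift == 40, so for prev_msr == 0xfffffe
 * and cur_msr == 0x1 the subtraction of the shifted values wraps modulo
 * 2^64 to 0x30000000000, and shifting back down yields 3 chunks, exactly
 * the distance the counter advanced across the 24-bit wrap.
 */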
int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
                           u32 unused, u32 rmid, enum resctrl_event_id eventid,
                           u64 *val, void *ignored)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
        int cpu = cpumask_any(&d->hdr.cpu_mask);
        struct arch_mbm_state *am;
        u64 msr_val, chunks;
        u32 prmid;
        int ret;

        resctrl_arch_rmid_read_context_check();

        prmid = logical_rmid_to_physical_rmid(cpu, rmid);
        ret = __rmid_read_phys(prmid, eventid, &msr_val);
        if (ret)
                return ret;

        am = get_arch_mbm_state(hw_dom, rmid, eventid);
        if (am) {
                am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
                                                 hw_res->mbm_width);
                chunks = get_corrected_mbm_count(rmid, am->chunks);
                am->prev_msr = msr_val;
        } else {
                chunks = msr_val;
        }

        *val = chunks * hw_res->mon_scale;

        return 0;
}

/*
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
 * which indicates that RMIDs are configured in legacy mode.
 * This mode is incompatible with Linux resctrl semantics
 * as RMIDs are partitioned between SNC nodes, which requires
 * a user to know which RMID is allocated to a task.
 * Clearing bit 0 reconfigures the RMID counters for use
 * in RMID sharing mode. This mode is better for Linux.
 * The RMID space is divided between all SNC nodes with the
 * RMIDs renumbered to start from zero in each node when
 * counting operations from tasks. Code to read the counters
 * must adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
        if (snc_nodes_per_l3_cache > 1)
                msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}

/* CPU models that support MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
        X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
        X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
        X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
        X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
        X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
        {}
};

/*
 * There isn't a simple hardware bit that indicates whether a CPU is running
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
 * the same NUMA node as CPU0.
 * It is not possible to accurately determine SNC state if the system is
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
 * to L3 caches. Detection is still accurate if the system is booted with
 * hyperthreading disabled, since that does not affect the ratio.
 */
static __init int snc_get_config(void)
{
        struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
        const cpumask_t *node0_cpumask;
        int cpus_per_node, cpus_per_l3;
        int ret;

        if (!x86_match_cpu(snc_cpu_ids) || !ci)
                return 1;

        cpus_read_lock();
        if (num_online_cpus() != num_present_cpus())
                pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
        cpus_read_unlock();

        node0_cpumask = cpumask_of_node(cpu_to_node(0));

        cpus_per_node = cpumask_weight(node0_cpumask);
        cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);

        if (!cpus_per_node || !cpus_per_l3)
                return 1;

        ret = cpus_per_l3 / cpus_per_node;

        /* Sanity check: only valid results are 1, 2, 3, 4, 6 */
        switch (ret) {
        case 1:
                break;
        case 2 ... 4:
        case 6:
                pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
                rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
                break;
        default:
                pr_warn("Ignore improbable SNC node count %d\n", ret);
                ret = 1;
                break;
        }

        return ret;
}

int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
        unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
        unsigned int threshold;

        snc_nodes_per_l3_cache = snc_get_config();

        resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
        hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
        r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
        hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

        if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
                hw_res->mbm_width += mbm_offset;
        else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
                pr_warn("Ignoring impossible MBM counter offset\n");

        /*
         * A reasonable upper limit on the max threshold is the number
         * of lines tagged per RMID if all RMIDs have the same number of
         * lines tagged in the LLC.
         *
         * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
         */
        threshold = resctrl_rmid_realloc_limit / r->num_rmid;

        /*
         * Because num_rmid may not be a power of two, round the value
         * to the nearest multiple of hw_res->mon_scale so it matches a
         * value the hardware will measure. mon_scale may not be a power of 2.
         */
        resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

        if (rdt_cpu_has(X86_FEATURE_BMEC)) {
                u32 eax, ebx, ecx, edx;

                /* Detect list of bandwidth sources that can be tracked */
                cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
                r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
        }

        r->mon_capable = true;

        return 0;
}

void __init intel_rdt_mbm_apply_quirk(void)
{
        int cf_index;

        cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
        if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
                pr_info("No MBM correction factor available\n");
                return;
        }

        mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
        mbm_cf = mbm_cf_table[cf_index].cf;
}
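/*
 * Illustrative example of the quirk lookup above (assumed value): a CPU
 * reporting x86_cache_max_rmid == 191 gives cf_index == 192 / 8 - 1 == 23,
 * which selects {127, CF(0.969744)} from mbm_cf_table, so MBM counts for
 * RMIDs above 127 are scaled by ~0.969744.
 */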