Path: arch/x86/kernel/cpu/resctrl/monitor.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <[email protected]>
 *
 * This replaces the perf-based cqm.c, but we reuse a lot of
 * code and data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel(R) x86 Architecture
 * Software Developer Manual, June 2016, volume 3, section 17.17.
 */

#define pr_fmt(fmt)     "resctrl: " fmt

#include <linux/cpu.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/msr.h>

#include "internal.h"

/*
 * Global boolean for rdt_monitor, which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

#define CF(cf)  ((unsigned long)(1048576 * (cf) + 0.5))

static int snc_nodes_per_l3_cache = 1;

/*
 * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified for better code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so that no correction
 *    is done for that case.
 * 2. The MBM total and local correction table is indexed by a core count
 *    equal to (x86_cache_max_rmid + 1) / 8 - 1, ranging from 0 up to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
 *    to calculate the corrected value by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
static const struct mbm_correction_factor_table {
        u32 rmidthreshold;
        u64 cf;
} mbm_cf_table[] __initconst = {
        {7,     CF(1.000000)},
        {15,    CF(1.000000)},
        {15,    CF(0.969650)},
        {31,    CF(1.000000)},
        {31,    CF(1.066667)},
        {31,    CF(0.969650)},
        {47,    CF(1.142857)},
        {63,    CF(1.000000)},
        {63,    CF(1.185115)},
        {63,    CF(1.066553)},
        {79,    CF(1.454545)},
        {95,    CF(1.000000)},
        {95,    CF(1.230769)},
        {95,    CF(1.142857)},
        {95,    CF(1.066667)},
        {127,   CF(1.000000)},
        {127,   CF(1.254863)},
        {127,   CF(1.185255)},
        {151,   CF(1.000000)},
        {127,   CF(1.066667)},
        {167,   CF(1.000000)},
        {159,   CF(1.454334)},
        {183,   CF(1.000000)},
        {127,   CF(0.969744)},
        {191,   CF(1.280246)},
        {191,   CF(1.230921)},
        {215,   CF(1.000000)},
        {191,   CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;

static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
        /* Correct MBM value. */
        if (rmid > mbm_cf_rmidthreshold)
                val = (val * mbm_cf) >> 20;

        return val;
}

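/*
 * The correction above is plain 20-bit fixed-point arithmetic: CF(x) stores
 * x scaled by 2^20 (rounded), so "(val * mbm_cf) >> 20" multiplies val by
 * roughly x using only an integer multiply and a shift. For example,
 * CF(1.142857) == 1198372, and a raw count of 1048576 chunks corrects to
 * (1048576 * 1198372) >> 20 == 1198372, i.e. the count scaled by ~1.142857.
 */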

/*
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
 * needed. The physical RMID is the same as the logical RMID.
 *
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
 * Resource Director Technology Architecture Specification" for a full
 * description of RMID sharing mode).
 *
 * In RMID sharing mode there are fewer "logical RMID" values available
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
 * each SNC node.
 *
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
 *
 * Data is collected independently on each SNC node and can be retrieved
 * using the "physical RMID" value computed by this function and loaded
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
 *
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
 * cache. So a "physical RMID" may be read from any CPU that shares
 * the L3 cache with the desired SNC node, not just from a CPU in
 * the specific SNC node.
 */
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
        struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

        if (snc_nodes_per_l3_cache == 1)
                return lrmid;

        return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid;
}

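/*
 * For example, on a hypothetical SNC-2 system (snc_nodes_per_l3_cache == 2)
 * with r->mon.num_rmid == 128 per node, logical RMID 5 used on a CPU whose
 * NUMA node gives "cpu_to_node(cpu) % 2 == 1" is counted in physical
 * RMID 5 + 1 * 128 == 133.
 */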

static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
        u64 msr_val;

        /*
         * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
         * with a valid event code for supported resource type and the bits
         * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
         * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
         * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
         * are error bits.
         */
        wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
        rdmsrq(MSR_IA32_QM_CTR, msr_val);

        if (msr_val & RMID_VAL_ERROR)
                return -EIO;
        if (msr_val & RMID_VAL_UNAVAIL)
                return -EINVAL;

        *val = msr_val;
        return 0;
}

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
                                                 u32 rmid,
                                                 enum resctrl_event_id eventid)
{
        struct arch_mbm_state *state;

        if (!resctrl_is_mbm_event(eventid))
                return NULL;

        state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)];

        return state ? &state[rmid] : NULL;
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
                             u32 unused, u32 rmid,
                             enum resctrl_event_id eventid)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        int cpu = cpumask_any(&d->hdr.cpu_mask);
        struct arch_mbm_state *am;
        u32 prmid;

        am = get_arch_mbm_state(hw_dom, rmid, eventid);
        if (am) {
                memset(am, 0, sizeof(*am));

                prmid = logical_rmid_to_physical_rmid(cpu, rmid);
                /* Record any initial, non-zero count value. */
                __rmid_read_phys(prmid, eventid, &am->prev_msr);
        }
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        enum resctrl_event_id eventid;
        int idx;

        for_each_mbm_event_id(eventid) {
                if (!resctrl_is_mon_event_enabled(eventid))
                        continue;
                idx = MBM_STATE_IDX(eventid);
                memset(hw_dom->arch_mbm_states[idx], 0,
                       sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid);
        }
}

static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
        u64 shift = 64 - width, chunks;

        chunks = (cur_msr << shift) - (prev_msr << shift);
        return chunks >> shift;
}

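/*
 * mbm_overflow_count() relies on unsigned wraparound: shifting both MSR
 * values up by (64 - width) bits before subtracting yields the delta
 * modulo 2^width. For instance, with a 24-bit counter (shift == 40),
 * prev_msr == 0xfffffe and cur_msr == 0x000004 gives 6 chunks, so a
 * single counter wrap between two reads is accounted for correctly.
 */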

static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d,
                             u32 rmid, enum resctrl_event_id eventid, u64 msr_val)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
        struct arch_mbm_state *am;
        u64 chunks;

        am = get_arch_mbm_state(hw_dom, rmid, eventid);
        if (am) {
                am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
                                                 hw_res->mbm_width);
                chunks = get_corrected_mbm_count(rmid, am->chunks);
                am->prev_msr = msr_val;
        } else {
                chunks = msr_val;
        }

        return chunks * hw_res->mon_scale;
}

int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
                           u32 unused, u32 rmid, enum resctrl_event_id eventid,
                           u64 *val, void *ignored)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        int cpu = cpumask_any(&d->hdr.cpu_mask);
        struct arch_mbm_state *am;
        u64 msr_val;
        u32 prmid;
        int ret;

        resctrl_arch_rmid_read_context_check();

        prmid = logical_rmid_to_physical_rmid(cpu, rmid);
        ret = __rmid_read_phys(prmid, eventid, &msr_val);

        if (!ret) {
                *val = get_corrected_val(r, d, rmid, eventid, msr_val);
        } else if (ret == -EINVAL) {
                am = get_arch_mbm_state(hw_dom, rmid, eventid);
                if (am)
                        am->prev_msr = 0;
        }

        return ret;
}

static int __cntr_id_read(u32 cntr_id, u64 *val)
{
        u64 msr_val;

        /*
         * QM_EVTSEL Register definition:
         * =======================================================
         * Bits    Mnemonic        Description
         * =======================================================
         * 63:44   --              Reserved
         * 43:32   RMID            RMID or counter ID in ABMC mode
         *                         when reading an MBM event
         * 31      ExtendedEvtID   Extended Event Identifier
         * 30:8    --              Reserved
         * 7:0     EvtID           Event Identifier
         * =======================================================
         * The contents of a specific counter can be read by setting
         * QM_EVTSEL.ExtendedEvtID (= 1) and QM_EVTSEL.EvtID = L3CacheABMC (= 1),
         * and setting QM_EVTSEL.RMID to the desired counter ID. Reading the
         * QM_CTR then returns the contents of the specified counter. The
         * RMID_VAL_ERROR bit is set if the counter configuration is invalid,
         * or if an invalid counter ID is set in the QM_EVTSEL.RMID field.
         * The RMID_VAL_UNAVAIL bit is set if the counter data is unavailable.
         */
        wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id);
        rdmsrl(MSR_IA32_QM_CTR, msr_val);

        if (msr_val & RMID_VAL_ERROR)
                return -EIO;
        if (msr_val & RMID_VAL_UNAVAIL)
                return -EINVAL;

        *val = msr_val;
        return 0;
}

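/*
 * For example, reading assignable counter 3 programs QM_EVTSEL with
 * ExtendedEvtID == 1, EvtID == L3CacheABMC (1) and the RMID field set to 3;
 * the subsequent QM_CTR read returns that counter's raw value, which
 * resctrl_arch_cntr_read() below converts to bytes via mon_scale.
 */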

void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
                             u32 unused, u32 rmid, int cntr_id,
                             enum resctrl_event_id eventid)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        struct arch_mbm_state *am;

        am = get_arch_mbm_state(hw_dom, rmid, eventid);
        if (am) {
                memset(am, 0, sizeof(*am));

                /* Record any initial, non-zero count value. */
                __cntr_id_read(cntr_id, &am->prev_msr);
        }
}

int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
                           u32 unused, u32 rmid, int cntr_id,
                           enum resctrl_event_id eventid, u64 *val)
{
        u64 msr_val;
        int ret;

        ret = __cntr_id_read(cntr_id, &msr_val);
        if (ret)
                return ret;

        *val = get_corrected_val(r, d, rmid, eventid, msr_val);

        return 0;
}

/*
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1, which indicates
 * that RMIDs are configured in legacy mode. This mode is incompatible with
 * Linux resctrl semantics as RMIDs are partitioned between SNC nodes, which
 * requires a user to know which RMID is allocated to a task.
 * Clearing bit 0 reconfigures the RMID counters for use in RMID sharing
 * mode. This mode is better for Linux. The RMID space is divided between
 * all SNC nodes with the RMIDs renumbered to start from zero in each node
 * when counting operations from tasks. Code to read the counters must
 * adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
        if (snc_nodes_per_l3_cache > 1)
                msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}

/* CPU models that support MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
        X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
        X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
        X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
        X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
        X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
        X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0),
        {}
};

/*
 * There isn't a simple hardware bit that indicates whether a CPU is running
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
 * the same NUMA node as CPU0.
 * It is not possible to accurately determine SNC state if the system is
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
 * to L3 caches. It will be OK if the system is booted with hyperthreading
 * disabled (since this doesn't affect the ratio).
 */
static __init int snc_get_config(void)
{
        struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
        const cpumask_t *node0_cpumask;
        int cpus_per_node, cpus_per_l3;
        int ret;

        if (!x86_match_cpu(snc_cpu_ids) || !ci)
                return 1;

        cpus_read_lock();
        if (num_online_cpus() != num_present_cpus())
                pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
        cpus_read_unlock();

        node0_cpumask = cpumask_of_node(cpu_to_node(0));

        cpus_per_node = cpumask_weight(node0_cpumask);
        cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);

        if (!cpus_per_node || !cpus_per_l3)
                return 1;

        ret = cpus_per_l3 / cpus_per_node;

        /* sanity check: Only valid results are 1, 2, 3, 4, 6 */
        switch (ret) {
        case 1:
                break;
        case 2 ... 4:
        case 6:
                pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
                rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
                break;
        default:
                pr_warn("Ignore improbable SNC node count %d\n", ret);
                ret = 1;
                break;
        }

        return ret;
}

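/*
 * For instance, on a hypothetical part with two SNC nodes per L3 cache,
 * 48 CPUs sharing CPU0's L3 and 24 CPUs in CPU0's NUMA node,
 * cpus_per_l3 / cpus_per_node evaluates to 2 and the L3 monitoring scope
 * is narrowed to RESCTRL_L3_NODE.
 */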

int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
        unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
        unsigned int threshold;
        u32 eax, ebx, ecx, edx;

        snc_nodes_per_l3_cache = snc_get_config();

        resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
        hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
        r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
        hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

        if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
                hw_res->mbm_width += mbm_offset;
        else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
                pr_warn("Ignoring impossible MBM counter offset\n");

        /*
         * A reasonable upper limit on the max threshold is the number
         * of lines tagged per RMID if all RMIDs have the same number of
         * lines tagged in the LLC.
         *
         * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
         */
        threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid;

        /*
         * Because num_rmid may not be a power of two, round the value
         * to the nearest multiple of hw_res->mon_scale so it matches a
         * value the hardware will measure. mon_scale may not be a power of 2.
         */
        resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

        if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) {
                /* Detect list of bandwidth sources that can be tracked */
                cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
                r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
        }

        /*
         * resctrl assumes a system that supports assignable counters can
         * switch to "default" mode. Ensure that there is a "default" mode
         * to switch to. This enforces a dependency between the independent
         * X86_FEATURE_ABMC and X86_FEATURE_CQM_MBM_TOTAL/X86_FEATURE_CQM_MBM_LOCAL
         * hardware features.
         */
        if (rdt_cpu_has(X86_FEATURE_ABMC) &&
            (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL) ||
             rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))) {
                r->mon.mbm_cntr_assignable = true;
                cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
                r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
                hw_res->mbm_cntr_assign_enabled = true;
        }

        r->mon_capable = true;

        return 0;
}

void __init intel_rdt_mbm_apply_quirk(void)
{
        int cf_index;

        cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
        if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
                pr_info("No MBM correction factor available\n");
                return;
        }

        mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
        mbm_cf = mbm_cf_table[cf_index].cf;
}

static void resctrl_abmc_set_one_amd(void *arg)
{
        bool *enable = arg;

        if (*enable)
                msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
        else
                msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
}

/*
 * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs
 * associated with all monitor domains.
 */
static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable)
{
        struct rdt_mon_domain *d;

        lockdep_assert_cpus_held();

        list_for_each_entry(d, &r->mon_domains, hdr.list) {
                on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd,
                                 &enable, 1);
                resctrl_arch_reset_rmid_all(r, d);
        }
}

int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
{
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);

        if (r->mon.mbm_cntr_assignable &&
            hw_res->mbm_cntr_assign_enabled != enable) {
                _resctrl_abmc_enable(r, enable);
                hw_res->mbm_cntr_assign_enabled = enable;
        }

        return 0;
}

bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
{
        return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled;
}

static void resctrl_abmc_config_one_amd(void *info)
{
        union l3_qos_abmc_cfg *abmc_cfg = info;

        wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
}

/*
 * Send an IPI to the domain to assign the counter to an (RMID, event) pair.
 */
void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
                              enum resctrl_event_id evtid, u32 rmid, u32 closid,
                              u32 cntr_id, bool assign)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        union l3_qos_abmc_cfg abmc_cfg = { 0 };
        struct arch_mbm_state *am;

        abmc_cfg.split.cfg_en = 1;
        abmc_cfg.split.cntr_en = assign ? 1 : 0;
        abmc_cfg.split.cntr_id = cntr_id;
        abmc_cfg.split.bw_src = rmid;
        if (assign)
                abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid);

        smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1);

        /*
         * The hardware counter is reset (because cfg_en == 1) so there is no
         * need to record initial non-zero counts.
         */
        am = get_arch_mbm_state(hw_dom, rmid, evtid);
        if (am)
                memset(am, 0, sizeof(*am));
}

void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r)
{
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);

        resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled);
}