GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kernel/cpu/resctrl/monitor.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <[email protected]>
 *
 * This replaces the cqm.c based on perf but we reuse a lot of
 * code and data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel (R) x86 Architecture
 * Software Developer Manual June 2016, volume 3, section 17.17.
 */

#define pr_fmt(fmt) "resctrl: " fmt

#include <linux/cpu.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/msr.h>

#include "internal.h"

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))

static int snc_nodes_per_l3_cache = 1;

/*
 * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified for better code readability:
 *
 * 1. Threshold 0 is changed to (rmid count - 1) so that no correction
 *    is done for that case.
 * 2. The MBM total and local correction table is indexed by
 *    (x86_cache_max_rmid + 1) / 8 - 1, which ranges from 0 to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so the
 *    corrected value can be computed with a shift:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
static const struct mbm_correction_factor_table {
        u32 rmidthreshold;
        u64 cf;
} mbm_cf_table[] __initconst = {
        {7, CF(1.000000)},
        {15, CF(1.000000)},
        {15, CF(0.969650)},
        {31, CF(1.000000)},
        {31, CF(1.066667)},
        {31, CF(0.969650)},
        {47, CF(1.142857)},
        {63, CF(1.000000)},
        {63, CF(1.185115)},
        {63, CF(1.066553)},
        {79, CF(1.454545)},
        {95, CF(1.000000)},
        {95, CF(1.230769)},
        {95, CF(1.142857)},
        {95, CF(1.066667)},
        {127, CF(1.000000)},
        {127, CF(1.254863)},
        {127, CF(1.185255)},
        {151, CF(1.000000)},
        {127, CF(1.066667)},
        {167, CF(1.000000)},
        {159, CF(1.454334)},
        {183, CF(1.000000)},
        {127, CF(0.969744)},
        {191, CF(1.280246)},
        {191, CF(1.230921)},
        {215, CF(1.000000)},
        {191, CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;

static u64 mbm_cf __read_mostly;

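/*
 * Worked example of the fixed-point correction (values follow from the
 * CF() definition above): CF(1.185115) = 1242683, so a raw count of
 * 1000000 chunks becomes (1000000 * 1242683) >> 20 = 1185114, i.e. the
 * count scaled by roughly 1.185115.
 */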
static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
        /* Correct MBM value. */
        if (rmid > mbm_cf_rmidthreshold)
                val = (val * mbm_cf) >> 20;

        return val;
}

/*
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
 * needed. The physical RMID is the same as the logical RMID.
 *
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
 * Resource Director Technology Architecture Specification" for a full
 * description of RMID sharing mode).
 *
 * In RMID sharing mode there are fewer "logical RMID" values available
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
 * each SNC node.
 *
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
 *
 * Data is collected independently on each SNC node and can be retrieved
 * using the "physical RMID" value computed by this function and loaded
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
 *
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
 * cache. So a "physical RMID" may be read from any CPU that shares
 * the L3 cache with the desired SNC node, not just from a CPU in
 * the specific SNC node.
 */
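/*
 * Illustration (hypothetical numbers): with two SNC nodes per L3 cache
 * and r->mon.num_rmid == 128, a CPU in the second SNC node of its L3
 * (an odd NUMA node id on such a system) translates logical RMID 5 to
 * physical RMID 5 + 1 * 128 = 133.
 */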
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
        struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

        if (snc_nodes_per_l3_cache == 1)
                return lrmid;

        return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid;
}

static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
        u64 msr_val;

        /*
         * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
         * with a valid event code for a supported resource type and the bits
         * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with a valid RMID,
         * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
         * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
         * are error bits.
         */
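        /*
         * wrmsr() takes the low and high 32 bits separately: @eventid lands
         * in IA32_QM_EVTSEL[7:0] (EvtID) and @prmid in the high dword, i.e.
         * the IA32_QM_EVTSEL[41:32] RMID field.
         */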
        wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
        rdmsrq(MSR_IA32_QM_CTR, msr_val);

        if (msr_val & RMID_VAL_ERROR)
                return -EIO;
        if (msr_val & RMID_VAL_UNAVAIL)
                return -EINVAL;

        *val = msr_val;
        return 0;
}

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
                                                 u32 rmid,
                                                 enum resctrl_event_id eventid)
{
        struct arch_mbm_state *state;

        if (!resctrl_is_mbm_event(eventid))
                return NULL;

        state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)];

        return state ? &state[rmid] : NULL;
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
                             u32 unused, u32 rmid,
                             enum resctrl_event_id eventid)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        int cpu = cpumask_any(&d->hdr.cpu_mask);
        struct arch_mbm_state *am;
        u32 prmid;

        am = get_arch_mbm_state(hw_dom, rmid, eventid);
        if (am) {
                memset(am, 0, sizeof(*am));

                prmid = logical_rmid_to_physical_rmid(cpu, rmid);
                /* Record any initial, non-zero count value. */
                __rmid_read_phys(prmid, eventid, &am->prev_msr);
        }
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        enum resctrl_event_id eventid;
        int idx;

        for_each_mbm_event_id(eventid) {
                if (!resctrl_is_mon_event_enabled(eventid))
                        continue;
                idx = MBM_STATE_IDX(eventid);
                memset(hw_dom->arch_mbm_states[idx], 0,
                       sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid);
        }
}

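/*
 * Return the number of chunks the counter advanced since @prev_msr,
 * computed modulo 2^width so that a counter wraparound is handled
 * correctly. For example, with a 24-bit counter, prev_msr = 0xfffffe
 * and cur_msr = 0x000001 yield 3 chunks.
 */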
static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
        u64 shift = 64 - width, chunks;

        chunks = (cur_msr << shift) - (prev_msr << shift);
        return chunks >> shift;
}

static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d,
                             u32 rmid, enum resctrl_event_id eventid, u64 msr_val)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
        struct arch_mbm_state *am;
        u64 chunks;

        am = get_arch_mbm_state(hw_dom, rmid, eventid);
        if (am) {
                am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
                                                 hw_res->mbm_width);
                chunks = get_corrected_mbm_count(rmid, am->chunks);
                am->prev_msr = msr_val;
        } else {
                chunks = msr_val;
        }

        return chunks * hw_res->mon_scale;
}

int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
                           u32 unused, u32 rmid, enum resctrl_event_id eventid,
                           u64 *val, void *ignored)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        int cpu = cpumask_any(&d->hdr.cpu_mask);
        struct arch_mbm_state *am;
        u64 msr_val;
        u32 prmid;
        int ret;

        resctrl_arch_rmid_read_context_check();

        prmid = logical_rmid_to_physical_rmid(cpu, rmid);
        ret = __rmid_read_phys(prmid, eventid, &msr_val);

        if (!ret) {
                *val = get_corrected_val(r, d, rmid, eventid, msr_val);
        } else if (ret == -EINVAL) {
                am = get_arch_mbm_state(hw_dom, rmid, eventid);
                if (am)
                        am->prev_msr = 0;
        }

        return ret;
}

static int __cntr_id_read(u32 cntr_id, u64 *val)
{
        u64 msr_val;

        /*
         * QM_EVTSEL Register definition:
         * =======================================================
         * Bits    Mnemonic        Description
         * =======================================================
         * 63:44   --              Reserved
         * 43:32   RMID            RMID or counter ID in ABMC mode
         *                         when reading an MBM event
         * 31      ExtendedEvtID   Extended Event Identifier
         * 30:8    --              Reserved
         * 7:0     EvtID           Event Identifier
         * =======================================================
         * The contents of a specific counter can be read by setting
         * QM_EVTSEL.ExtendedEvtID (=1), QM_EVTSEL.EvtID = L3CacheABMC (=1)
         * and QM_EVTSEL.RMID to the desired counter ID. Reading the QM_CTR
         * then returns the contents of the specified counter. The
         * RMID_VAL_ERROR bit is set if the counter configuration is invalid,
         * or if an invalid counter ID is set in the QM_EVTSEL.RMID field.
         * The RMID_VAL_UNAVAIL bit is set if the counter data is unavailable.
         */
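        /*
         * As in __rmid_read_phys(): the low dword of the wrmsr() selects the
         * (extended) event and the high dword carries the counter ID in the
         * RMID field.
         */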
        wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id);
        rdmsrq(MSR_IA32_QM_CTR, msr_val);

        if (msr_val & RMID_VAL_ERROR)
                return -EIO;
        if (msr_val & RMID_VAL_UNAVAIL)
                return -EINVAL;

        *val = msr_val;
        return 0;
}

void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
                             u32 unused, u32 rmid, int cntr_id,
                             enum resctrl_event_id eventid)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        struct arch_mbm_state *am;

        am = get_arch_mbm_state(hw_dom, rmid, eventid);
        if (am) {
                memset(am, 0, sizeof(*am));

                /* Record any initial, non-zero count value. */
                __cntr_id_read(cntr_id, &am->prev_msr);
        }
}

int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
                           u32 unused, u32 rmid, int cntr_id,
                           enum resctrl_event_id eventid, u64 *val)
{
        u64 msr_val;
        int ret;

        ret = __cntr_id_read(cntr_id, &msr_val);
        if (ret)
                return ret;

        *val = get_corrected_val(r, d, rmid, eventid, msr_val);

        return 0;
}

/*
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
 * which indicates that RMIDs are configured in legacy mode.
 * This mode is incompatible with Linux resctrl semantics
 * as RMIDs are partitioned between SNC nodes, which requires
 * a user to know which RMID is allocated to a task.
 * Clearing bit 0 reconfigures the RMID counters for use
 * in RMID sharing mode. This mode is better for Linux.
 * The RMID space is divided between all SNC nodes with the
 * RMIDs renumbered to start from zero in each node when
 * counting operations from tasks. Code to read the counters
 * must adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
        if (snc_nodes_per_l3_cache > 1)
                msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}

/* CPU models that support MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
        X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
        X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
        X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
        X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
        X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
        X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0),
        {}
};

/*
 * There isn't a simple hardware bit that indicates whether a CPU is running
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
 * the same NUMA node as CPU0.
 * It is not possible to accurately determine SNC state if the system is
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
 * to L3 caches. It will be OK if the system is booted with hyperthreading
 * disabled (since this doesn't affect the ratio).
 */
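/*
 * Illustration (hypothetical numbers): if 112 CPUs share CPU0's L3 cache
 * but only 56 CPUs are in CPU0's NUMA node, the ratio is 2 and the system
 * is assumed to be running with two SNC nodes per L3 cache.
 */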
static __init int snc_get_config(void)
{
        struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
        const cpumask_t *node0_cpumask;
        int cpus_per_node, cpus_per_l3;
        int ret;

        if (!x86_match_cpu(snc_cpu_ids) || !ci)
                return 1;

        cpus_read_lock();
        if (num_online_cpus() != num_present_cpus())
                pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
        cpus_read_unlock();

        node0_cpumask = cpumask_of_node(cpu_to_node(0));

        cpus_per_node = cpumask_weight(node0_cpumask);
        cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);

        if (!cpus_per_node || !cpus_per_l3)
                return 1;

        ret = cpus_per_l3 / cpus_per_node;

        /* sanity check: Only valid results are 1, 2, 3, 4, 6 */
        switch (ret) {
        case 1:
                break;
        case 2 ... 4:
        case 6:
                pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
                rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
                break;
        default:
                pr_warn("Ignore improbable SNC node count %d\n", ret);
                ret = 1;
                break;
        }

        return ret;
}

int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
        unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
        unsigned int threshold;
        u32 eax, ebx, ecx, edx;

        snc_nodes_per_l3_cache = snc_get_config();

        resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
        hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
        r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
        hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

        if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
                hw_res->mbm_width += mbm_offset;
        else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
                pr_warn("Ignoring impossible MBM counter offset\n");

        /*
         * A reasonable upper limit on the max threshold is the number
         * of lines tagged per RMID if all RMIDs have the same number of
         * lines tagged in the LLC.
         *
         * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
         */
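        /*
         * With those example numbers: 35 * 1024 * 1024 / 56 = 655360 bytes
         * (640 KiB) per RMID, i.e. 1/56th (~1.8%) of the cache.
         */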
        threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid;

        /*
         * Because num_rmid may not be a power of two, round the value
         * to the nearest multiple of hw_res->mon_scale so it matches a
         * value the hardware will measure. mon_scale may not be a power of 2.
         */
        resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

        if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) {
                /* Detect list of bandwidth sources that can be tracked */
                cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
                r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
        }

        /*
         * resctrl assumes a system that supports assignable counters can
         * switch to "default" mode. Ensure that there is a "default" mode
         * to switch to. This enforces a dependency between the independent
         * X86_FEATURE_ABMC and X86_FEATURE_CQM_MBM_TOTAL/X86_FEATURE_CQM_MBM_LOCAL
         * hardware features.
         */
        if (rdt_cpu_has(X86_FEATURE_ABMC) &&
            (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL) ||
             rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))) {
                r->mon.mbm_cntr_assignable = true;
                cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
                r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
                hw_res->mbm_cntr_assign_enabled = true;
        }

        r->mon_capable = true;

        return 0;
}

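/*
 * Example of the quirk index calculation below: a CPU reporting
 * x86_cache_max_rmid = 175 gives cf_index = 176 / 8 - 1 = 21, which
 * selects {159, CF(1.454334)}, so counts for RMIDs above 159 are scaled
 * by ~1.454334.
 */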
void __init intel_rdt_mbm_apply_quirk(void)
{
        int cf_index;

        cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
        if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
                pr_info("No MBM correction factor available\n");
                return;
        }

        mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
        mbm_cf = mbm_cf_table[cf_index].cf;
}

static void resctrl_abmc_set_one_amd(void *arg)
{
        bool *enable = arg;

        if (*enable)
                msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
        else
                msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
}

/*
 * Enabling or disabling ABMC requires updating the L3_QOS_EXT_CFG MSR on
 * all CPUs associated with all monitor domains.
 */
static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable)
{
        struct rdt_mon_domain *d;

        lockdep_assert_cpus_held();

        list_for_each_entry(d, &r->mon_domains, hdr.list) {
                on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd,
                                 &enable, 1);
                resctrl_arch_reset_rmid_all(r, d);
        }
}

int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
{
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);

        if (r->mon.mbm_cntr_assignable &&
            hw_res->mbm_cntr_assign_enabled != enable) {
                _resctrl_abmc_enable(r, enable);
                hw_res->mbm_cntr_assign_enabled = enable;
        }

        return 0;
}

bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
{
        return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled;
}

static void resctrl_abmc_config_one_amd(void *info)
{
        union l3_qos_abmc_cfg *abmc_cfg = info;

        wrmsrq(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
}

/*
 * Send an IPI to the domain to assign the counter to an (RMID, event) pair.
 */
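/*
 * For example (hypothetical values), assigning counter 3 to track an MBM
 * event for RMID 10 programs L3_QOS_ABMC_CFG with cntr_id = 3, bw_src = 10,
 * cntr_en = 1 and the event's bw_type configuration.
 */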
void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
                              enum resctrl_event_id evtid, u32 rmid, u32 closid,
                              u32 cntr_id, bool assign)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        union l3_qos_abmc_cfg abmc_cfg = { 0 };
        struct arch_mbm_state *am;

        abmc_cfg.split.cfg_en = 1;
        abmc_cfg.split.cntr_en = assign ? 1 : 0;
        abmc_cfg.split.cntr_id = cntr_id;
        abmc_cfg.split.bw_src = rmid;
        if (assign)
                abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid);

        smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1);

        /*
         * The hardware counter is reset (because cfg_en == 1) so there is no
         * need to record initial non-zero counts.
         */
        am = get_arch_mbm_state(hw_dom, rmid, evtid);
        if (am)
                memset(am, 0, sizeof(*am));
}

void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r)
{
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);

        resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled);
}