GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kernel/cpu/resctrl/monitor.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <[email protected]>
 *
 * This replaces the perf-based cqm.c, but reuses a lot of code and
 * data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel (R) x86
 * Architecture Software Developer Manual, June 2016, volume 3,
 * section 17.17.
 */

#define pr_fmt(fmt)	"resctrl: " fmt

#include <linux/cpu.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/msr.h>

#include "internal.h"

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

/*
 * Global to indicate which monitoring events are enabled.
 */
unsigned int rdt_mon_features;

#define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))

static int snc_nodes_per_l3_cache = 1;

/*
 * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified to simplify the code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so that no correction
 *    is done for that case.
 * 2. The MBM total and local correction table is indexed by a core count
 *    value equal to (x86_cache_max_rmid + 1) / 8 - 1, ranging from 0 up to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
 *    to calculate the corrected value by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
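/*
 * Illustrative example of the fixed-point math (numbers chosen for
 * demonstration): CF(1.142857) evaluates to
 * (unsigned long)(1048576 * 1.142857 + 0.5) == 1198372. A raw count of
 * 1000 is then corrected as (1000 * 1198372) >> 20 == 1142, i.e. scaled
 * by ~1.142857 using integer arithmetic only.
 */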
static const struct mbm_correction_factor_table {
	u32 rmidthreshold;
	u64 cf;
} mbm_cf_table[] __initconst = {
	{7,	CF(1.000000)},
	{15,	CF(1.000000)},
	{15,	CF(0.969650)},
	{31,	CF(1.000000)},
	{31,	CF(1.066667)},
	{31,	CF(0.969650)},
	{47,	CF(1.142857)},
	{63,	CF(1.000000)},
	{63,	CF(1.185115)},
	{63,	CF(1.066553)},
	{79,	CF(1.454545)},
	{95,	CF(1.000000)},
	{95,	CF(1.230769)},
	{95,	CF(1.142857)},
	{95,	CF(1.066667)},
	{127,	CF(1.000000)},
	{127,	CF(1.254863)},
	{127,	CF(1.185255)},
	{151,	CF(1.000000)},
	{127,	CF(1.066667)},
	{167,	CF(1.000000)},
	{159,	CF(1.454334)},
	{183,	CF(1.000000)},
	{127,	CF(0.969744)},
	{191,	CF(1.280246)},
	{191,	CF(1.230921)},
	{215,	CF(1.000000)},
	{191,	CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;

static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
	/* Correct MBM value. */
	if (rmid > mbm_cf_rmidthreshold)
		val = (val * mbm_cf) >> 20;

	return val;
}

/*
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
 * needed. The physical RMID is the same as the logical RMID.
 *
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
 * Resource Director Technology Architecture Specification" for a full
 * description of RMID sharing mode).
 *
 * In RMID sharing mode there are fewer "logical RMID" values available
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
 * each SNC node.
 *
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
 *
 * Data is collected independently on each SNC node and can be retrieved
 * using the "physical RMID" value computed by this function and loaded
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
 *
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
 * cache. So a "physical RMID" may be read from any CPU that shares
 * the L3 cache with the desired SNC node, not just from a CPU in
 * the specific SNC node.
 */
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

	if (snc_nodes_per_l3_cache == 1)
		return lrmid;

	return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
}

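/*
 * Illustrative example (an assumed SNC-2 topology, not taken from this
 * file): with snc_nodes_per_l3_cache == 2 and r->num_rmid == 128, a CPU
 * whose NUMA node number is odd maps logical RMID 5 to physical RMID
 * 5 + 1 * 128 == 133; on an even-numbered node the RMID is unchanged.
 */
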
static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;

	/*
	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
	 * with a valid event code for a supported resource type and the bits
	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with a valid RMID,
	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
	 * are error bits.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
	rdmsrq(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}

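/*
 * Note on the wrmsr() in __rmid_read_phys() above: the low 32 bits of
 * the MSR write carry eventid (IA32_QM_EVTSEL.EvtID, bits 7:0) and the
 * high 32 bits carry prmid, which places the RMID at bits 41:32 as
 * described in the SDM excerpt.
 */
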
static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
						 u32 rmid,
						 enum resctrl_event_id eventid)
{
	switch (eventid) {
	case QOS_L3_OCCUP_EVENT_ID:
		return NULL;
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &hw_dom->arch_mbm_total[rmid];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &hw_dom->arch_mbm_local[rmid];
	default:
		/* Never expect to get here */
		WARN_ON_ONCE(1);
		return NULL;
	}
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 unused, u32 rmid,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u32 prmid;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		prmid = logical_rmid_to_physical_rmid(cpu, rmid);
		/* Record any initial, non-zero count value. */
		__rmid_read_phys(prmid, eventid, &am->prev_msr);
	}
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);

	if (resctrl_arch_is_mbm_total_enabled())
		memset(hw_dom->arch_mbm_total, 0,
		       sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);

	if (resctrl_arch_is_mbm_local_enabled())
		memset(hw_dom->arch_mbm_local, 0,
		       sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}

static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
	u64 shift = 64 - width, chunks;

	chunks = (cur_msr << shift) - (prev_msr << shift);
	return chunks >> shift;
}

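/*
 * Illustrative example for mbm_overflow_count() (values assumed): for a
 * 24-bit counter, width == 24 so shift == 40. If prev_msr == 0xfffffe
 * and the hardware counter wraps to cur_msr == 0x4, then in 64-bit
 * arithmetic (cur_msr << 40) - (prev_msr << 40) == 6 << 40, and shifting
 * back down by 40 recovers the 6 chunks that elapsed across the wrap.
 */
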
int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
			   u64 *val, void *ignored)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u64 msr_val, chunks;
	u32 prmid;
	int ret;

	resctrl_arch_rmid_read_context_check();

	prmid = logical_rmid_to_physical_rmid(cpu, rmid);
	ret = __rmid_read_phys(prmid, eventid, &msr_val);
	if (ret)
		return ret;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
						 hw_res->mbm_width);
		chunks = get_corrected_mbm_count(rmid, am->chunks);
		am->prev_msr = msr_val;
	} else {
		chunks = msr_val;
	}

	*val = chunks * hw_res->mon_scale;

	return 0;
}

/*
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
 * which indicates that RMIDs are configured in legacy mode.
 * This mode is incompatible with Linux resctrl semantics
 * as RMIDs are partitioned between SNC nodes, which requires
 * a user to know which RMID is allocated to a task.
 * Clearing bit 0 reconfigures the RMID counters for use
 * in RMID sharing mode. This mode is better for Linux.
 * The RMID space is divided between all SNC nodes with the
 * RMIDs renumbered to start from zero in each node when
 * counting operations from tasks. Code to read the counters
 * must adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	if (snc_nodes_per_l3_cache > 1)
		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}

/* CPU models that support MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
	X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
	{}
};

/*
 * There isn't a simple hardware bit that indicates whether a CPU is running
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
 * the same NUMA node as CPU0.
 * It is not possible to accurately determine SNC state if the system is
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
 * to L3 caches. It will be OK if the system is booted with hyperthreading
 * disabled (since this doesn't affect the ratio).
 */
static __init int snc_get_config(void)
{
	struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
	const cpumask_t *node0_cpumask;
	int cpus_per_node, cpus_per_l3;
	int ret;

	if (!x86_match_cpu(snc_cpu_ids) || !ci)
		return 1;

	cpus_read_lock();
	if (num_online_cpus() != num_present_cpus())
		pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
	cpus_read_unlock();

	node0_cpumask = cpumask_of_node(cpu_to_node(0));

	cpus_per_node = cpumask_weight(node0_cpumask);
	cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);

	if (!cpus_per_node || !cpus_per_l3)
		return 1;

	ret = cpus_per_l3 / cpus_per_node;

	/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
	switch (ret) {
	case 1:
		break;
	case 2 ... 4:
	case 6:
		pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
		rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
		break;
	default:
		pr_warn("Ignoring improbable SNC node count %d\n", ret);
		ret = 1;
		break;
	}

	return ret;
}

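/*
 * Illustrative example (topology assumed): on an SNC-2 part where 56
 * CPUs share CPU0's L3 cache but only 28 of them are in CPU0's NUMA
 * node, cpus_per_l3 / cpus_per_node == 56 / 28 == 2, so SNC mode with
 * two nodes per L3 cache is reported.
 */
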
int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	unsigned int threshold;

	snc_nodes_per_l3_cache = snc_get_config();

	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
	r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

	if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
		hw_res->mbm_width += mbm_offset;
	else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
		pr_warn("Ignoring impossible MBM counter offset\n");

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	threshold = resctrl_rmid_realloc_limit / r->num_rmid;

	/*
	 * Because num_rmid may not be a power of two, round the value
	 * to the nearest multiple of hw_res->mon_scale so it matches a
	 * value the hardware will measure. mon_scale may not be a power of 2.
	 */
	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

	if (rdt_cpu_has(X86_FEATURE_BMEC)) {
		u32 eax, ebx, ecx, edx;

		/* Detect list of bandwidth sources that can be tracked */
		cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
		r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
	}

	r->mon_capable = true;

	return 0;
}

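/*
 * Working the example from the comment in rdt_get_mon_l3_config(): a
 * 35MB LLC gives resctrl_rmid_realloc_limit == 35 * 1024 * 1024 ==
 * 36700160 bytes; with 56 RMIDs, threshold == 36700160 / 56 == 655360
 * bytes, i.e. 1/56 of the cache, or ~1.8%.
 */
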
void __init intel_rdt_mbm_apply_quirk(void)
{
	int cf_index;

	cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
	if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
		pr_info("No MBM correction factor available\n");
		return;
	}

	mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
	mbm_cf = mbm_cf_table[cf_index].cf;
}

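/*
 * Illustrative example (an assumed RMID count): with
 * boot_cpu_data.x86_cache_max_rmid == 223 (224 RMIDs), cf_index ==
 * 224 / 8 - 1 == 27, selecting the last entry {191, CF(1.143118)}, so
 * MBM counts for RMIDs above 191 are scaled by ~1.143118.
 */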