Path: arch/x86/kernel/cpu/resctrl/monitor.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <[email protected]>
 *
 * This replaces the perf-based cqm.c, but we reuse a lot of
 * code and data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel(R) x86 Architecture
 * Software Developer Manual, June 2016, volume 3, section 17.17.
 */

#define pr_fmt(fmt)     "resctrl: " fmt

#include <linux/cpu.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/msr.h>

#include "internal.h"

/*
 * Global boolean for rdt_monitor, which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

#define CF(cf)  ((unsigned long)(1048576 * (cf) + 0.5))

static int snc_nodes_per_l3_cache = 1;

/*
 * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified for better code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so that no correction
 *    is done for that case.
 * 2. The MBM total and local correction table is indexed by a core count
 *    equal to (x86_cache_max_rmid + 1) / 8 - 1, ranging from 0 up to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
 *    to calculate the corrected value by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
static const struct mbm_correction_factor_table {
        u32 rmidthreshold;
        u64 cf;
} mbm_cf_table[] __initconst = {
        {7,     CF(1.000000)},
        {15,    CF(1.000000)},
        {15,    CF(0.969650)},
        {31,    CF(1.000000)},
        {31,    CF(1.066667)},
        {31,    CF(0.969650)},
        {47,    CF(1.142857)},
        {63,    CF(1.000000)},
        {63,    CF(1.185115)},
        {63,    CF(1.066553)},
        {79,    CF(1.454545)},
        {95,    CF(1.000000)},
        {95,    CF(1.230769)},
        {95,    CF(1.142857)},
        {95,    CF(1.066667)},
        {127,   CF(1.000000)},
        {127,   CF(1.254863)},
        {127,   CF(1.185255)},
        {151,   CF(1.000000)},
        {127,   CF(1.066667)},
        {167,   CF(1.000000)},
        {159,   CF(1.454334)},
        {183,   CF(1.000000)},
        {127,   CF(0.969744)},
        {191,   CF(1.280246)},
        {191,   CF(1.230921)},
        {215,   CF(1.000000)},
        {191,   CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;

static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
        /* Correct MBM value. */
        if (rmid > mbm_cf_rmidthreshold)
                val = (val * mbm_cf) >> 20;

        return val;
}

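/*
 * The correction above is plain 20-bit fixed-point arithmetic: CF(x) stores
 * x scaled by 2^20 (rounded), so "(val * mbm_cf) >> 20" multiplies val by
 * roughly x using only an integer multiply and a shift. For example,
 * CF(1.142857) == 1198372, and a raw count of 1048576 chunks corrects to
 * (1048576 * 1198372) >> 20 == 1198372, i.e. the count scaled by ~1.142857.
 */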

/*
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
 * needed. The physical RMID is the same as the logical RMID.
 *
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
 * Resource Director Technology Architecture Specification" for a full
 * description of RMID sharing mode).
 *
 * In RMID sharing mode there are fewer "logical RMID" values available
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
 * each SNC node.
 *
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
 *
 * Data is collected independently on each SNC node and can be retrieved
 * using the "physical RMID" value computed by this function and loaded
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
 *
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
 * cache. So a "physical RMID" may be read from any CPU that shares
 * the L3 cache with the desired SNC node, not just from a CPU in
 * the specific SNC node.
 */
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
        struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

        if (snc_nodes_per_l3_cache == 1)
                return lrmid;

        return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid;
}

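/*
 * For example, on a hypothetical SNC-2 system (snc_nodes_per_l3_cache == 2)
 * with r->mon.num_rmid == 128 per node, logical RMID 5 used on a CPU whose
 * NUMA node gives "cpu_to_node(cpu) % 2 == 1" is counted in physical
 * RMID 5 + 1 * 128 == 133.
 */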

static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
        u64 msr_val;

        /*
         * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
         * with a valid event code for supported resource type and the bits
         * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
         * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
         * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
         * are error bits.
         */
        wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
        rdmsrq(MSR_IA32_QM_CTR, msr_val);

        if (msr_val & RMID_VAL_ERROR)
                return -EIO;
        if (msr_val & RMID_VAL_UNAVAIL)
                return -EINVAL;

        *val = msr_val;
        return 0;
}

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
                                                 u32 rmid,
                                                 enum resctrl_event_id eventid)
{
        struct arch_mbm_state *state;

        if (!resctrl_is_mbm_event(eventid))
                return NULL;

        state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)];

        return state ? &state[rmid] : NULL;
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
                             u32 unused, u32 rmid,
                             enum resctrl_event_id eventid)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        int cpu = cpumask_any(&d->hdr.cpu_mask);
        struct arch_mbm_state *am;
        u32 prmid;

        am = get_arch_mbm_state(hw_dom, rmid, eventid);
        if (am) {
                memset(am, 0, sizeof(*am));

                prmid = logical_rmid_to_physical_rmid(cpu, rmid);
                /* Record any initial, non-zero count value. */
                __rmid_read_phys(prmid, eventid, &am->prev_msr);
        }
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        enum resctrl_event_id eventid;
        int idx;

        for_each_mbm_event_id(eventid) {
                if (!resctrl_is_mon_event_enabled(eventid))
                        continue;
                idx = MBM_STATE_IDX(eventid);
                memset(hw_dom->arch_mbm_states[idx], 0,
                       sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid);
        }
}

static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
        u64 shift = 64 - width, chunks;

        chunks = (cur_msr << shift) - (prev_msr << shift);
        return chunks >> shift;
}

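/*
 * mbm_overflow_count() relies on unsigned wraparound: shifting both MSR
 * values up by (64 - width) bits before subtracting yields the delta
 * modulo 2^width. For instance, with a 24-bit counter (shift == 40),
 * prev_msr == 0xfffffe and cur_msr == 0x000004 gives 6 chunks, so a
 * single counter wrap between two reads is accounted for correctly.
 */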

static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d,
                             u32 rmid, enum resctrl_event_id eventid, u64 msr_val)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
        struct arch_mbm_state *am;
        u64 chunks;

        am = get_arch_mbm_state(hw_dom, rmid, eventid);
        if (am) {
                am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
                                                 hw_res->mbm_width);
                chunks = get_corrected_mbm_count(rmid, am->chunks);
                am->prev_msr = msr_val;
        } else {
                chunks = msr_val;
        }

        return chunks * hw_res->mon_scale;
}

int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
                           u32 unused, u32 rmid, enum resctrl_event_id eventid,
                           u64 *val, void *ignored)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        int cpu = cpumask_any(&d->hdr.cpu_mask);
        struct arch_mbm_state *am;
        u64 msr_val;
        u32 prmid;
        int ret;

        resctrl_arch_rmid_read_context_check();

        prmid = logical_rmid_to_physical_rmid(cpu, rmid);
        ret = __rmid_read_phys(prmid, eventid, &msr_val);

        if (!ret) {
                *val = get_corrected_val(r, d, rmid, eventid, msr_val);
        } else if (ret == -EINVAL) {
                am = get_arch_mbm_state(hw_dom, rmid, eventid);
                if (am)
                        am->prev_msr = 0;
        }

        return ret;
}

static int __cntr_id_read(u32 cntr_id, u64 *val)
{
        u64 msr_val;

        /*
         * QM_EVTSEL Register definition:
         * =======================================================
         * Bits    Mnemonic        Description
         * =======================================================
         * 63:44   --              Reserved
         * 43:32   RMID            RMID or counter ID in ABMC mode
         *                         when reading an MBM event
         * 31      ExtendedEvtID   Extended Event Identifier
         * 30:8    --              Reserved
         * 7:0     EvtID           Event Identifier
         * =======================================================
         * The contents of a specific counter can be read by setting
         * QM_EVTSEL.ExtendedEvtID (= 1) and QM_EVTSEL.EvtID = L3CacheABMC (= 1),
         * and setting QM_EVTSEL.RMID to the desired counter ID. Reading the
         * QM_CTR then returns the contents of the specified counter. The
         * RMID_VAL_ERROR bit is set if the counter configuration is invalid,
         * or if an invalid counter ID is set in the QM_EVTSEL.RMID field.
         * The RMID_VAL_UNAVAIL bit is set if the counter data is unavailable.
         */
        wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id);
        rdmsrl(MSR_IA32_QM_CTR, msr_val);

        if (msr_val & RMID_VAL_ERROR)
                return -EIO;
        if (msr_val & RMID_VAL_UNAVAIL)
                return -EINVAL;

        *val = msr_val;
        return 0;
}

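/*
 * For example, reading assignable counter 3 programs QM_EVTSEL with
 * ExtendedEvtID == 1, EvtID == L3CacheABMC (1) and the RMID field set to 3;
 * the subsequent QM_CTR read returns that counter's raw value, which
 * resctrl_arch_cntr_read() below converts to bytes via mon_scale.
 */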

void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
                             u32 unused, u32 rmid, int cntr_id,
                             enum resctrl_event_id eventid)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        struct arch_mbm_state *am;

        am = get_arch_mbm_state(hw_dom, rmid, eventid);
        if (am) {
                memset(am, 0, sizeof(*am));

                /* Record any initial, non-zero count value. */
                __cntr_id_read(cntr_id, &am->prev_msr);
        }
}

int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
                           u32 unused, u32 rmid, int cntr_id,
                           enum resctrl_event_id eventid, u64 *val)
{
        u64 msr_val;
        int ret;

        ret = __cntr_id_read(cntr_id, &msr_val);
        if (ret)
                return ret;

        *val = get_corrected_val(r, d, rmid, eventid, msr_val);

        return 0;
}

/*
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1, which indicates
 * that RMIDs are configured in legacy mode. This mode is incompatible with
 * Linux resctrl semantics as RMIDs are partitioned between SNC nodes, which
 * requires a user to know which RMID is allocated to a task.
 * Clearing bit 0 reconfigures the RMID counters for use in RMID sharing
 * mode. This mode is better for Linux. The RMID space is divided between
 * all SNC nodes with the RMIDs renumbered to start from zero in each node
 * when counting operations from tasks. Code to read the counters must
 * adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
        if (snc_nodes_per_l3_cache > 1)
                msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}

/* CPU models that support MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
        X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
        X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
        X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
        X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
        X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
        X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0),
        {}
};

/*
 * There isn't a simple hardware bit that indicates whether a CPU is running
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
 * the same NUMA node as CPU0.
 * It is not possible to accurately determine SNC state if the system is
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
 * to L3 caches. It will be OK if the system is booted with hyperthreading
 * disabled (since this doesn't affect the ratio).
 */
static __init int snc_get_config(void)
{
        struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
        const cpumask_t *node0_cpumask;
        int cpus_per_node, cpus_per_l3;
        int ret;

        if (!x86_match_cpu(snc_cpu_ids) || !ci)
                return 1;

        cpus_read_lock();
        if (num_online_cpus() != num_present_cpus())
                pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
        cpus_read_unlock();

        node0_cpumask = cpumask_of_node(cpu_to_node(0));

        cpus_per_node = cpumask_weight(node0_cpumask);
        cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);

        if (!cpus_per_node || !cpus_per_l3)
                return 1;

        ret = cpus_per_l3 / cpus_per_node;

        /* sanity check: Only valid results are 1, 2, 3, 4, 6 */
        switch (ret) {
        case 1:
                break;
        case 2 ... 4:
        case 6:
                pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
                rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
                break;
        default:
                pr_warn("Ignore improbable SNC node count %d\n", ret);
                ret = 1;
                break;
        }

        return ret;
}

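/*
 * For instance, on a hypothetical part with two SNC nodes per L3 cache,
 * 48 CPUs sharing CPU0's L3 and 24 CPUs in CPU0's NUMA node,
 * cpus_per_l3 / cpus_per_node evaluates to 2 and the L3 monitoring scope
 * is narrowed to RESCTRL_L3_NODE.
 */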

int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
        unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
        unsigned int threshold;
        u32 eax, ebx, ecx, edx;

        snc_nodes_per_l3_cache = snc_get_config();

        resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
        hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
        r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
        hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

        if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
                hw_res->mbm_width += mbm_offset;
        else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
                pr_warn("Ignoring impossible MBM counter offset\n");

        /*
         * A reasonable upper limit on the max threshold is the number
         * of lines tagged per RMID if all RMIDs have the same number of
         * lines tagged in the LLC.
         *
         * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
         */
        threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid;

        /*
         * Because num_rmid may not be a power of two, round the value
         * to the nearest multiple of hw_res->mon_scale so it matches a
         * value the hardware will measure. mon_scale may not be a power of 2.
         */
        resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

        if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) {
                /* Detect list of bandwidth sources that can be tracked */
                cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
                r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
        }

        /*
         * resctrl assumes a system that supports assignable counters can
         * switch to "default" mode. Ensure that there is a "default" mode
         * to switch to. This enforces a dependency between the independent
         * X86_FEATURE_ABMC and X86_FEATURE_CQM_MBM_TOTAL/X86_FEATURE_CQM_MBM_LOCAL
         * hardware features.
         */
        if (rdt_cpu_has(X86_FEATURE_ABMC) &&
            (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL) ||
             rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))) {
                r->mon.mbm_cntr_assignable = true;
                cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
                r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
                hw_res->mbm_cntr_assign_enabled = true;
        }

        r->mon_capable = true;

        return 0;
}

void __init intel_rdt_mbm_apply_quirk(void)
{
        int cf_index;

        cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
        if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
                pr_info("No MBM correction factor available\n");
                return;
        }

        mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
        mbm_cf = mbm_cf_table[cf_index].cf;
}

static void resctrl_abmc_set_one_amd(void *arg)
{
        bool *enable = arg;

        if (*enable)
                msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
        else
                msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
}

/*
 * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs
 * associated with all monitor domains.
 */
static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable)
{
        struct rdt_mon_domain *d;

        lockdep_assert_cpus_held();

        list_for_each_entry(d, &r->mon_domains, hdr.list) {
                on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd,
                                 &enable, 1);
                resctrl_arch_reset_rmid_all(r, d);
        }
}

int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
{
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);

        if (r->mon.mbm_cntr_assignable &&
            hw_res->mbm_cntr_assign_enabled != enable) {
                _resctrl_abmc_enable(r, enable);
                hw_res->mbm_cntr_assign_enabled = enable;
        }

        return 0;
}

bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
{
        return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled;
}

static void resctrl_abmc_config_one_amd(void *info)
{
        union l3_qos_abmc_cfg *abmc_cfg = info;

        wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
}

/*
 * Send an IPI to the domain to assign the counter to an (RMID, event) pair.
 */
void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
                              enum resctrl_event_id evtid, u32 rmid, u32 closid,
                              u32 cntr_id, bool assign)
{
        struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
        union l3_qos_abmc_cfg abmc_cfg = { 0 };
        struct arch_mbm_state *am;

        abmc_cfg.split.cfg_en = 1;
        abmc_cfg.split.cntr_en = assign ? 1 : 0;
        abmc_cfg.split.cntr_id = cntr_id;
        abmc_cfg.split.bw_src = rmid;
        if (assign)
                abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid);

        smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1);

        /*
         * The hardware counter is reset (because cfg_en == 1) so there is no
         * need to record initial non-zero counts.
         */
        am = get_arch_mbm_state(hw_dom, rmid, evtid);
        if (am)
                memset(am, 0, sizeof(*am));
}

void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r)
{
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);

        resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled);
}