Path: blob/master/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Resource Director Technology (RDT)
 *
 * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
 *
 * Copyright (C) 2018 Intel Corporation
 *
 * Author: Reinette Chatre <[email protected]>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cacheflush.h>
#include <linux/cpu.h>
#include <linux/perf_event.h>
#include <linux/pm_qos.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/perf_event.h>
#include <asm/msr.h>

#include "../../events/perf_event.h" /* For X86_CONFIG() */
#include "internal.h"

#define CREATE_TRACE_POINTS

#include "pseudo_lock_trace.h"

/*
 * The bits needed to disable hardware prefetching varies based on the
 * platform. During initialization we will discover which bits to use.
 */
static u64 prefetch_disable_bits;

/**
 * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
 *                                          platforms
 * @void: It takes no parameters.
 *
 * Capture the list of platforms that have been validated to support
 * pseudo-locking. This includes testing to ensure pseudo-locked regions
 * with low cache miss rates can be created under variety of load conditions
 * as well as that these pseudo-locked regions can maintain their low cache
 * miss rates under variety of load conditions for significant lengths of time.
 *
 * After a platform has been validated to support pseudo-locking its
 * hardware prefetch disable bits are included here as they are documented
 * in the SDM.
 *
 * When adding a platform here also add support for its cache events to
 * resctrl_arch_measure_l*_residency()
 *
 * Return:
 * If platform is supported, the bits to disable hardware prefetchers, 0
 * if platform is not supported.
 */
u64 resctrl_arch_get_prefetch_disable_bits(void)
{
	prefetch_disable_bits = 0;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
	    boot_cpu_data.x86 != 6)
		return 0;

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL_X:
		/*
		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
		 * as:
		 * 0    L2 Hardware Prefetcher Disable (R/W)
		 * 1    L2 Adjacent Cache Line Prefetcher Disable (R/W)
		 * 2    DCU Hardware Prefetcher Disable (R/W)
		 * 3    DCU IP Prefetcher Disable (R/W)
		 * 63:4 Reserved
		 */
		prefetch_disable_bits = 0xF;
		break;
	case INTEL_ATOM_GOLDMONT:
	case INTEL_ATOM_GOLDMONT_PLUS:
		/*
		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
		 * as:
		 * 0    L2 Hardware Prefetcher Disable (R/W)
		 * 1    Reserved
		 * 2    DCU Hardware Prefetcher Disable (R/W)
		 * 63:3 Reserved
		 */
		prefetch_disable_bits = 0x5;
		break;
	}

	return prefetch_disable_bits;
}

/**
 * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
 * @_plr: the pseudo-lock region descriptor
 *
 * This is the core pseudo-locking flow.
 *
 * First we ensure that the kernel memory cannot be found in the cache.
 * Then, while taking care that there will be as little interference as
 * possible, the memory to be loaded is accessed while core is running
 * with class of service set to the bitmask of the pseudo-locked region.
 * After this is complete no future CAT allocations will be allowed to
 * overlap with this bitmask.
 *
 * Local register variables are utilized to ensure that the memory region
 * to be locked is the only memory access made during the critical locking
 * loop.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_pseudo_lock_fn(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	u32 rmid_p, closid_p;
	unsigned long i;
	u64 saved_msr;
#ifdef CONFIG_KASAN
	/*
	 * The registers used for local register variables are also used
	 * when KASAN is active. When KASAN is active we use a regular
	 * variable to ensure we always use a valid pointer, but the cost
	 * is that this variable will enter the cache through evicting the
	 * memory we are trying to lock into the cache. Thus expect lower
	 * pseudo-locking success rate when KASAN is active.
	 */
	unsigned int line_size;
	unsigned int size;
	void *mem_r;
#else
	register unsigned int line_size asm("esi");
	register unsigned int size asm("edi");
	register void *mem_r asm(_ASM_BX);
#endif /* CONFIG_KASAN */

	/*
	 * Make sure none of the allocated memory is cached. If it is we
	 * will get a cache hit in below loop from outside of pseudo-locked
	 * region.
	 * wbinvd (as opposed to clflush/clflushopt) is required to
	 * increase likelihood that allocated cache portion will be filled
	 * with associated memory.
	 */
	wbinvd();

	/*
	 * Always called with interrupts enabled. By disabling interrupts
	 * ensure that we will not be preempted during this critical section.
	 */
	local_irq_disable();

	/*
	 * Call wrmsr and rdmsr as directly as possible to avoid tracing
	 * clobbering local register variables or affecting cache accesses.
	 *
	 * Disable the hardware prefetcher so that when the end of the memory
	 * being pseudo-locked is reached the hardware will not read beyond
	 * the buffer and evict pseudo-locked memory read earlier from the
	 * cache.
	 */
	saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
	native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
	closid_p = this_cpu_read(pqr_state.cur_closid);
	rmid_p = this_cpu_read(pqr_state.cur_rmid);
	mem_r = plr->kmem;
	size = plr->size;
	line_size = plr->line_size;
	/*
	 * Critical section begin: start by writing the closid associated
	 * with the capacity bitmask of the cache region being
	 * pseudo-locked followed by reading of kernel memory to load it
	 * into the cache.
	 */
	native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);

	/*
	 * Cache was flushed earlier. Now access kernel memory to read it
	 * into cache region associated with just activated plr->closid.
	 * Loop over data twice:
	 * - In first loop the cache region is shared with the page walker
	 *   as it populates the paging structure caches (including TLB).
	 * - In the second loop the paging structure caches are used and
	 *   cache region is populated with the memory being referenced.
	 */
	for (i = 0; i < size; i += PAGE_SIZE) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	for (i = 0; i < size; i += line_size) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	/*
	 * Critical section end: restore closid with capacity bitmask that
	 * does not overlap with pseudo-locked region.
	 */
	native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);

	/* Re-enable the hardware prefetcher(s) */
	wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr);
	local_irq_enable();

	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/**
 * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
 *                                      pseudo-locked memory
 * @_plr: pseudo-lock region to measure
 *
 * There is no deterministic way to test if a memory region is cached. One
 * way is to measure how long it takes to read the memory, the speed of
 * access is a good way to learn how close to the cpu the data was. Even
 * more, if the prefetcher is disabled and the memory is read at a stride
 * of half the cache line, then a cache miss will be easy to spot since the
 * read of the first half would be significantly slower than the read of
 * the second half.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_measure_cycles_lat_fn(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	u32 saved_low, saved_high;
	unsigned long i;
	u64 start, end;
	void *mem_r;

	local_irq_disable();
	/*
	 * Disable hardware prefetchers.
	 */
	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
	mem_r = READ_ONCE(plr->kmem);
	/*
	 * Dummy execute of the time measurement to load the needed
	 * instructions into the L1 instruction cache.
	 */
	start = rdtsc_ordered();
	for (i = 0; i < plr->size; i += 32) {
		start = rdtsc_ordered();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
		end = rdtsc_ordered();
		trace_pseudo_lock_mem_latency((u32)(end - start));
	}
	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	local_irq_enable();
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/*
 * Create a perf_event_attr for the hit and miss perf events that will
 * be used during the performance measurement. A perf_event maintains
 * a pointer to its perf_event_attr so a unique attribute structure is
 * created for each perf_event.
 *
 * The actual configuration of the event is set right before use in order
 * to use the X86_CONFIG macro.
 */
static struct perf_event_attr perf_miss_attr = {
	.type		= PERF_TYPE_RAW,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 0,
	.exclude_user	= 1,
};

static struct perf_event_attr perf_hit_attr = {
	.type		= PERF_TYPE_RAW,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 0,
	.exclude_user	= 1,
};

struct residency_counts {
	u64 miss_before, hits_before;
	u64 miss_after, hits_after;
};

static int measure_residency_fn(struct perf_event_attr *miss_attr,
				struct perf_event_attr *hit_attr,
				struct pseudo_lock_region *plr,
				struct residency_counts *counts)
{
	u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
	struct perf_event *miss_event, *hit_event;
	int hit_pmcnum, miss_pmcnum;
	u32 saved_low, saved_high;
	unsigned int line_size;
	unsigned int size;
	unsigned long i;
	void *mem_r;
	u64 tmp;

	miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
						      NULL, NULL, NULL);
	if (IS_ERR(miss_event))
		goto out;

	hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
						     NULL, NULL, NULL);
	if (IS_ERR(hit_event))
		goto out_miss;

	local_irq_disable();
	/*
	 * Check any possible error state of events used by performing
	 * one local read.
	 */
	if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}
	if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}

	/*
	 * Disable hardware prefetchers.
	 */
	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);

	/* Initialize rest of local variables */
	/*
	 * Performance event has been validated right before this with
	 * interrupts disabled - it is thus safe to read the counter index.
	 */
	miss_pmcnum = x86_perf_rdpmc_index(miss_event);
	hit_pmcnum = x86_perf_rdpmc_index(hit_event);
	line_size = READ_ONCE(plr->line_size);
	mem_r = READ_ONCE(plr->kmem);
	size = READ_ONCE(plr->size);

	/*
	 * Read counter variables twice - first to load the instructions
	 * used in L1 cache, second to capture accurate value that does not
	 * include cache misses incurred because of instruction loads.
	 */
	hits_before = rdpmc(hit_pmcnum);
	miss_before = rdpmc(miss_pmcnum);
	/*
	 * From SDM: Performing back-to-back fast reads are not guaranteed
	 * to be monotonic.
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	hits_before = rdpmc(hit_pmcnum);
	miss_before = rdpmc(miss_pmcnum);
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	for (i = 0; i < size; i += line_size) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	hits_after = rdpmc(hit_pmcnum);
	miss_after = rdpmc(miss_pmcnum);
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	/* Re-enable hardware prefetchers */
	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	local_irq_enable();
out_hit:
	perf_event_release_kernel(hit_event);
out_miss:
	perf_event_release_kernel(miss_event);
out:
	/*
	 * All counts will be zero on failure.
	 */
	counts->miss_before = miss_before;
	counts->hits_before = hits_before;
	counts->miss_after = miss_after;
	counts->hits_after = hits_after;
	return 0;
}

int resctrl_arch_measure_l2_residency(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	 * Non-architectural event for the Goldmont Microarchitecture
	 * from Intel x86 Architecture Software Developer Manual (SDM):
	 * MEM_LOAD_UOPS_RETIRED D1H (event number)
	 * Umask values:
	 *     L2_HIT   02H
	 *     L2_MISS  10H
	 */
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_ATOM_GOLDMONT:
	case INTEL_ATOM_GOLDMONT_PLUS:
		perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
						   .umask = 0x10);
		perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
						  .umask = 0x2);
		break;
	default:
		goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	 * If a failure prevented the measurements from succeeding
	 * tracepoints will still be written and all counts will be zero.
	 */
	trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
			     counts.miss_after - counts.miss_before);
out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

int resctrl_arch_measure_l3_residency(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	 * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
	 * has two "no fix" errata associated with it: BDM35 and BDM100. On
	 * this platform the following events are used instead:
	 * LONGEST_LAT_CACHE 2EH (Documented in SDM)
	 *     REFERENCE 4FH
	 *     MISS      41H
	 */

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL_X:
		/* On BDW the hit event counts references, not hits */
		perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
						  .umask = 0x4f);
		perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
						   .umask = 0x41);
		break;
	default:
		goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	 * If a failure prevented the measurements from succeeding
	 * tracepoints will still be written and all counts will be zero.
	 */

	counts.miss_after -= counts.miss_before;
	if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
		/*
		 * On BDW references and misses are counted, need to adjust.
		 * Sometimes the "hits" counter is a bit more than the
		 * references, for example, x references but x + 1 hits.
		 * To not report invalid hit values in this case we treat
		 * that as misses equal to references.
		 */
		/* First compute the number of cache references measured */
		counts.hits_after -= counts.hits_before;
		/* Next convert references to cache hits */
		counts.hits_after -= min(counts.miss_after, counts.hits_after);
	} else {
		counts.hits_after -= counts.hits_before;
	}

	trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}