GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Resource Director Technology (RDT)
 *
 * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
 *
 * Copyright (C) 2018 Intel Corporation
 *
 * Author: Reinette Chatre <[email protected]>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cacheflush.h>
#include <linux/cpu.h>
#include <linux/perf_event.h>
#include <linux/pm_qos.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/perf_event.h>
#include <asm/msr.h>

#include "../../events/perf_event.h" /* For X86_CONFIG() */
#include "internal.h"

#define CREATE_TRACE_POINTS

#include "pseudo_lock_trace.h"

/*
 * The bits needed to disable hardware prefetching vary based on the
 * platform. During initialization we will discover which bits to use.
 */
static u64 prefetch_disable_bits;

/**
 * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
 *                                          platforms
 * @void: It takes no parameters.
 *
 * Capture the list of platforms that have been validated to support
 * pseudo-locking. This includes testing to ensure pseudo-locked regions
 * with low cache miss rates can be created under a variety of load
 * conditions, and that these pseudo-locked regions can maintain their low
 * cache miss rates under a variety of load conditions for significant
 * lengths of time.
 *
 * After a platform has been validated to support pseudo-locking, its
 * hardware prefetch disable bits are included here as they are documented
 * in the SDM.
 *
 * When adding a platform here, also add support for its cache events to
 * resctrl_arch_measure_l*_residency().
 *
 * Return:
 * If the platform is supported, the bits to disable hardware prefetchers;
 * 0 if the platform is not supported.
 */
u64 resctrl_arch_get_prefetch_disable_bits(void)
{
        prefetch_disable_bits = 0;

        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
            boot_cpu_data.x86 != 6)
                return 0;

        switch (boot_cpu_data.x86_vfm) {
        case INTEL_BROADWELL_X:
                /*
                 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
                 * as:
                 * 0    L2 Hardware Prefetcher Disable (R/W)
                 * 1    L2 Adjacent Cache Line Prefetcher Disable (R/W)
                 * 2    DCU Hardware Prefetcher Disable (R/W)
                 * 3    DCU IP Prefetcher Disable (R/W)
                 * 63:4 Reserved
                 */
                prefetch_disable_bits = 0xF;
                break;
        case INTEL_ATOM_GOLDMONT:
        case INTEL_ATOM_GOLDMONT_PLUS:
                /*
                 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
                 * as:
                 * 0    L2 Hardware Prefetcher Disable (R/W)
                 * 1    Reserved
                 * 2    DCU Hardware Prefetcher Disable (R/W)
                 * 63:3 Reserved
                 */
                prefetch_disable_bits = 0x5;
                break;
        }

        return prefetch_disable_bits;
}
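
/*
 * A minimal sketch of how the return value above is intended to be
 * consumed: zero means the platform has not been validated and
 * pseudo-locking must not be attempted; a non-zero mask is written to
 * MSR_MISC_FEATURE_CONTROL for the duration of a critical section and the
 * saved value is restored afterwards, mirroring what
 * resctrl_arch_pseudo_lock_fn() below does. This is an illustrative,
 * hypothetical helper, not part of resctrl.
 */
#if 0
static int example_run_with_prefetchers_disabled(void (*critical)(void *), void *arg)
{
        u64 disable_bits = resctrl_arch_get_prefetch_disable_bits();
        u64 saved;

        if (!disable_bits)
                return -EOPNOTSUPP;     /* Platform not validated for pseudo-locking. */

        local_irq_disable();
        saved = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
        native_wrmsrq(MSR_MISC_FEATURE_CONTROL, disable_bits);

        critical(arg);

        native_wrmsrq(MSR_MISC_FEATURE_CONTROL, saved);
        local_irq_enable();

        return 0;
}
#endif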

/**
 * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
 * @_plr: the pseudo-lock region descriptor
 *
 * This is the core pseudo-locking flow.
 *
 * First we ensure that the kernel memory cannot be found in the cache.
 * Then, while taking care that there will be as little interference as
 * possible, the memory to be loaded is accessed while the core is running
 * with its class of service set to the bitmask of the pseudo-locked region.
 * After this is complete, no future CAT allocations will be allowed to
 * overlap with this bitmask.
 *
 * Local register variables are utilized to ensure that the memory region
 * to be locked is the only memory access made during the critical locking
 * loop.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_pseudo_lock_fn(void *_plr)
{
        struct pseudo_lock_region *plr = _plr;
        u32 rmid_p, closid_p;
        unsigned long i;
        u64 saved_msr;
#ifdef CONFIG_KASAN
        /*
         * The registers used for local register variables are also used
         * when KASAN is active. When KASAN is active we use a regular
         * variable to ensure we always use a valid pointer, but the cost
         * is that this variable will enter the cache through evicting the
         * memory we are trying to lock into the cache. Thus expect lower
         * pseudo-locking success rate when KASAN is active.
         */
        unsigned int line_size;
        unsigned int size;
        void *mem_r;
#else
        register unsigned int line_size asm("esi");
        register unsigned int size asm("edi");
        register void *mem_r asm(_ASM_BX);
#endif /* CONFIG_KASAN */

        /*
         * Make sure none of the allocated memory is cached. If it is, we
         * will get a cache hit in the loop below from outside of the
         * pseudo-locked region.
         * wbinvd (as opposed to clflush/clflushopt) is required to
         * increase the likelihood that the allocated cache portion will be
         * filled with the associated memory.
         */
        wbinvd();

        /*
         * Always called with interrupts enabled. By disabling interrupts
         * we ensure that we will not be preempted during this critical
         * section.
         */
        local_irq_disable();

        /*
         * Call wrmsr and rdmsr as directly as possible to avoid tracing
         * clobbering local register variables or affecting cache accesses.
         *
         * Disable the hardware prefetcher so that when the end of the memory
         * being pseudo-locked is reached the hardware will not read beyond
         * the buffer and evict pseudo-locked memory read earlier from the
         * cache.
         */
        saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
        native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
        closid_p = this_cpu_read(pqr_state.cur_closid);
        rmid_p = this_cpu_read(pqr_state.cur_rmid);
        mem_r = plr->kmem;
        size = plr->size;
        line_size = plr->line_size;
        /*
         * Critical section begin: start by writing the closid associated
         * with the capacity bitmask of the cache region being
         * pseudo-locked, followed by reading of the kernel memory to load
         * it into the cache.
         */
        native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);

        /*
         * Cache was flushed earlier. Now access kernel memory to read it
         * into the cache region associated with the just-activated
         * plr->closid. Loop over data twice:
         * - In the first loop the cache region is shared with the page walker
         *   as it populates the paging structure caches (including TLB).
         * - In the second loop the paging structure caches are used and the
         *   cache region is populated with the memory being referenced.
         */
        for (i = 0; i < size; i += PAGE_SIZE) {
                /*
                 * Add a barrier to prevent speculative execution of this
                 * loop reading beyond the end of the buffer.
                 */
                rmb();
                asm volatile("mov (%0,%1,1), %%eax\n\t"
                             :
                             : "r" (mem_r), "r" (i)
                             : "%eax", "memory");
        }
        for (i = 0; i < size; i += line_size) {
                /*
                 * Add a barrier to prevent speculative execution of this
                 * loop reading beyond the end of the buffer.
                 */
                rmb();
                asm volatile("mov (%0,%1,1), %%eax\n\t"
                             :
                             : "r" (mem_r), "r" (i)
                             : "%eax", "memory");
        }
        /*
         * Critical section end: restore closid with capacity bitmask that
         * does not overlap with pseudo-locked region.
         */
        native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);

        /* Re-enable the hardware prefetcher(s) */
        wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr);
        local_irq_enable();

        plr->thread_done = 1;
        wake_up_interruptible(&plr->lock_thread_wq);
        return 0;
}
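
/*
 * A minimal sketch of how a caller might drive the thread function above:
 * it must run on the CPU associated with the pseudo-locked region, and
 * completion is signalled through plr->thread_done and plr->lock_thread_wq.
 * This only illustrates the expected calling convention (assuming
 * kthread_run_on_cpu() is available); the actual caller lives elsewhere in
 * resctrl.
 */
#if 0
static int example_run_pseudo_lock(struct pseudo_lock_region *plr)
{
        struct task_struct *thread;

        thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr,
                                    plr->cpu, "pseudo_lock/%u");
        if (IS_ERR(thread))
                return PTR_ERR(thread);

        return wait_event_interruptible(plr->lock_thread_wq,
                                        plr->thread_done == 1);
}
#endif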

/**
 * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
 *                                      pseudo-locked memory
 * @_plr: pseudo-lock region to measure
 *
 * There is no deterministic way to test if a memory region is cached. One
 * way is to measure how long it takes to read the memory; the speed of
 * access is a good way to learn how close to the CPU the data was. Even
 * more, if the prefetcher is disabled and the memory is read at a stride
 * of half the cache line, then a cache miss will be easy to spot since the
 * read of the first half would be significantly slower than the read of
 * the second half.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_measure_cycles_lat_fn(void *_plr)
{
        struct pseudo_lock_region *plr = _plr;
        u32 saved_low, saved_high;
        unsigned long i;
        u64 start, end;
        void *mem_r;

        local_irq_disable();
        /*
         * Disable hardware prefetchers.
         */
        rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
        wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
        mem_r = READ_ONCE(plr->kmem);
        /*
         * Dummy execute of the time measurement to load the needed
         * instructions into the L1 instruction cache.
         */
        start = rdtsc_ordered();
        for (i = 0; i < plr->size; i += 32) {
                start = rdtsc_ordered();
                asm volatile("mov (%0,%1,1), %%eax\n\t"
                             :
                             : "r" (mem_r), "r" (i)
                             : "%eax", "memory");
                end = rdtsc_ordered();
                trace_pseudo_lock_mem_latency((u32)(end - start));
        }
        wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
        local_irq_enable();
        plr->thread_done = 1;
        wake_up_interruptible(&plr->lock_thread_wq);
        return 0;
}
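
/*
 * Worked example of the half-cache-line stride used above: with a 64-byte
 * cache line and a 32-byte stride, offsets 0 and 32 fall in the same line.
 * If the line is not resident, the read at offset 0 pays the miss latency
 * and the read at offset 32 then hits the line it just brought in, so a
 * miss shows up in the latency trace as one large sample followed by one
 * small sample. For a properly pseudo-locked region every sample should be
 * in the cache-hit range.
 */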

/*
 * Create a perf_event_attr for the hit and miss perf events that will
 * be used during the performance measurement. A perf_event maintains
 * a pointer to its perf_event_attr so a unique attribute structure is
 * created for each perf_event.
 *
 * The actual configuration of the event is set right before use in order
 * to use the X86_CONFIG macro.
 */
static struct perf_event_attr perf_miss_attr = {
        .type = PERF_TYPE_RAW,
        .size = sizeof(struct perf_event_attr),
        .pinned = 1,
        .disabled = 0,
        .exclude_user = 1,
};

static struct perf_event_attr perf_hit_attr = {
        .type = PERF_TYPE_RAW,
        .size = sizeof(struct perf_event_attr),
        .pinned = 1,
        .disabled = 0,
        .exclude_user = 1,
};

struct residency_counts {
        u64 miss_before, hits_before;
        u64 miss_after, hits_after;
};

static int measure_residency_fn(struct perf_event_attr *miss_attr,
                                struct perf_event_attr *hit_attr,
                                struct pseudo_lock_region *plr,
                                struct residency_counts *counts)
{
        u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
        struct perf_event *miss_event, *hit_event;
        int hit_pmcnum, miss_pmcnum;
        u32 saved_low, saved_high;
        unsigned int line_size;
        unsigned int size;
        unsigned long i;
        void *mem_r;
        u64 tmp;

        miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
                                                      NULL, NULL, NULL);
        if (IS_ERR(miss_event))
                goto out;

        hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
                                                     NULL, NULL, NULL);
        if (IS_ERR(hit_event))
                goto out_miss;

        local_irq_disable();
        /*
         * Check any possible error state of the events used by performing
         * one local read.
         */
        if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
                local_irq_enable();
                goto out_hit;
        }
        if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
                local_irq_enable();
                goto out_hit;
        }

        /*
         * Disable hardware prefetchers.
         */
        rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
        wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);

        /* Initialize rest of local variables */
        /*
         * The performance events have been validated right before this with
         * interrupts disabled - it is thus safe to read the counter index.
         */
        miss_pmcnum = x86_perf_rdpmc_index(miss_event);
        hit_pmcnum = x86_perf_rdpmc_index(hit_event);
        line_size = READ_ONCE(plr->line_size);
        mem_r = READ_ONCE(plr->kmem);
        size = READ_ONCE(plr->size);

        /*
         * Read the counter variables twice - first to load the instructions
         * used into the L1 cache, second to capture an accurate value that
         * does not include cache misses incurred because of instruction
         * loads.
         */
        hits_before = rdpmc(hit_pmcnum);
        miss_before = rdpmc(miss_pmcnum);
        /*
         * From SDM: Performing back-to-back fast reads is not guaranteed
         * to be monotonic.
         * Use LFENCE to ensure all previous instructions are retired
         * before proceeding.
         */
        rmb();
        hits_before = rdpmc(hit_pmcnum);
        miss_before = rdpmc(miss_pmcnum);
        /*
         * Use LFENCE to ensure all previous instructions are retired
         * before proceeding.
         */
        rmb();
        for (i = 0; i < size; i += line_size) {
                /*
                 * Add a barrier to prevent speculative execution of this
                 * loop reading beyond the end of the buffer.
                 */
                rmb();
                asm volatile("mov (%0,%1,1), %%eax\n\t"
                             :
                             : "r" (mem_r), "r" (i)
                             : "%eax", "memory");
        }
        /*
         * Use LFENCE to ensure all previous instructions are retired
         * before proceeding.
         */
        rmb();
        hits_after = rdpmc(hit_pmcnum);
        miss_after = rdpmc(miss_pmcnum);
        /*
         * Use LFENCE to ensure all previous instructions are retired
         * before proceeding.
         */
        rmb();
        /* Re-enable hardware prefetchers */
        wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
        local_irq_enable();
out_hit:
        perf_event_release_kernel(hit_event);
out_miss:
        perf_event_release_kernel(miss_event);
out:
        /*
         * All counts will be zero on failure.
         */
        counts->miss_before = miss_before;
        counts->hits_before = hits_before;
        counts->miss_after = miss_after;
        counts->hits_after = hits_after;
        return 0;
}

int resctrl_arch_measure_l2_residency(void *_plr)
{
        struct pseudo_lock_region *plr = _plr;
        struct residency_counts counts = {0};

        /*
         * Non-architectural event for the Goldmont Microarchitecture
         * from Intel x86 Architecture Software Developer Manual (SDM):
         * MEM_LOAD_UOPS_RETIRED D1H (event number)
         * Umask values:
         *     L2_HIT   02H
         *     L2_MISS  10H
         */
        switch (boot_cpu_data.x86_vfm) {
        case INTEL_ATOM_GOLDMONT:
        case INTEL_ATOM_GOLDMONT_PLUS:
                perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
                                                   .umask = 0x10);
                perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
                                                  .umask = 0x2);
                break;
        default:
                goto out;
        }

        measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
        /*
         * If a failure prevented the measurements from succeeding, the
         * tracepoints will still be written and all counts will be zero.
         */
        trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
                             counts.miss_after - counts.miss_before);
out:
        plr->thread_done = 1;
        wake_up_interruptible(&plr->lock_thread_wq);
        return 0;
}
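
/*
 * The measurement functions in this file are not invoked directly by user
 * space; they are triggered through the pseudo_lock_measure debugfs file of
 * the pseudo-locked resource group (see Documentation/arch/x86/resctrl.rst)
 * and report their results through the pseudo_lock_mem_latency,
 * pseudo_lock_l2 and pseudo_lock_l3 tracepoints. Below is a hedged
 * user-space sketch of kicking off the L2 measurement; the debugfs path,
 * the example group name "newlock" and the selector values are taken from
 * the documentation, not from this file.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static void example_trigger_l2_measurement(void)
{
        int fd = open("/sys/kernel/debug/resctrl/newlock/pseudo_lock_measure",
                      O_WRONLY);

        if (fd < 0)
                return;
        write(fd, "2", 1);      /* "1": latency, "2": L2, "3": L3 residency */
        close(fd);
}
#endif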

int resctrl_arch_measure_l3_residency(void *_plr)
{
        struct pseudo_lock_region *plr = _plr;
        struct residency_counts counts = {0};

        /*
         * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
         * has two "no fix" errata associated with it: BDM35 and BDM100. On
         * this platform the following events are used instead:
         * LONGEST_LAT_CACHE 2EH (Documented in SDM)
         *     REFERENCE 4FH
         *     MISS      41H
         */

        switch (boot_cpu_data.x86_vfm) {
        case INTEL_BROADWELL_X:
                /* On BDW the hit event counts references, not hits */
                perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
                                                  .umask = 0x4f);
                perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
                                                   .umask = 0x41);
                break;
        default:
                goto out;
        }

        measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
        /*
         * If a failure prevented the measurements from succeeding, the
         * tracepoints will still be written and all counts will be zero.
         */

        counts.miss_after -= counts.miss_before;
        if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
                /*
                 * On BDW references and misses are counted, need to adjust.
                 * Sometimes the "hits" counter is a bit more than the
                 * references, for example, x references but x + 1 hits.
                 * To not report invalid hit values in this case we treat
                 * that as misses equal to references.
                 */
                /* First compute the number of cache references measured */
                counts.hits_after -= counts.hits_before;
                /* Next convert references to cache hits */
                counts.hits_after -= min(counts.miss_after, counts.hits_after);
        } else {
                counts.hits_after -= counts.hits_before;
        }

        trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
out:
        plr->thread_done = 1;
        wake_up_interruptible(&plr->lock_thread_wq);
        return 0;
}
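
/*
 * Once a region has been pseudo-locked by the code above, an application
 * consumes it by mmap()ing the character device that resctrl exposes for
 * the region and keeping its latency-sensitive data inside that mapping.
 * A hedged user-space sketch; the /dev/pseudo_lock/<group> device name,
 * the example group name "newlock" and the mmap flags follow the resctrl
 * documentation rather than anything defined in this file, and the
 * requested size must not exceed the size of the pseudo-locked region.
 */
#if 0
#include <fcntl.h>
#include <stddef.h>
#include <sys/mman.h>

static void *example_map_pseudo_locked_region(size_t size)
{
        int fd = open("/dev/pseudo_lock/newlock", O_RDWR);

        if (fd < 0)
                return NULL;
        /* Data accessed through this mapping stays in the locked cache ways. */
        return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}
#endif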