// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <[email protected]>
 *   Gleb Natapov <[email protected]>
 *   Wei Huang    <[email protected]>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
#include <asm/cpu_device_id.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

/* Unadulterated PMU capabilities of the host, i.e. of hardware. */
static struct x86_pmu_capability __read_mostly kvm_host_pmu;

/* KVM's PMU capabilities, i.e. the intersection of KVM and hardware support. */
struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_cap);

struct kvm_pmu_emulated_event_selectors {
	u64 INSTRUCTIONS_RETIRED;
	u64 BRANCH_INSTRUCTIONS_RETIRED;
};
static struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;

/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
	X86_MATCH_VFM(INTEL_ICELAKE_D, NULL),
	X86_MATCH_VFM(INTEL_ICELAKE_X, NULL),
	/* Instruction-Accurate PDIR (PDIR++) */
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
	{}
};

/* Precise Distribution (PDist) */
static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
	{}
};

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However AMD doesn't support fixed-counters;
 * - There are three types of index to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn and, for families 15H and later,
 *      MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
 *      aliased to MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This normally is used by RDPMC instruction.
 *      For instance AMD RDPMC instruction uses 0000_0003h in ECX to access
 *      C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism, except
 *      that it also supports fixed counters. idx can be used as an index to
 *      gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code. Each pmc, stored in kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed). The mapping relationship
 *      between pmc and perf counters is as the following:
 *      * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters
 *               [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
 *               and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
 */

static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;

#define KVM_X86_PMU_OP(func)					     \
	DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func,		     \
				*(((struct kvm_pmu_ops *)0)->func));
#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>

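/*
 * Install the vendor (Intel/AMD) PMU ops and patch the corresponding static
 * calls.  Mandatory hooks are sanity checked for NULL; optional hooks may
 * legitimately be absent.
 */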
void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
{
	memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));

#define __KVM_X86_PMU_OP(func) \
	static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
#define KVM_X86_PMU_OP(func) \
	WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>
#undef __KVM_X86_PMU_OP
}

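/*
 * Compute KVM's PMU capabilities (kvm_pmu_cap) as the intersection of what
 * the host PMU supports and what KVM is willing to virtualize, and snapshot
 * the perf event encodings used to emulate "instructions retired" and
 * "branch instructions retired".  The vPMU is disabled entirely on hybrid
 * CPUs and on hosts that lack the architecturally required GP counters.
 */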
void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
{
	bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
	int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS;

	/*
	 * Hybrid PMUs don't play nice with virtualization without careful
	 * configuration by userspace, and KVM's APIs for reporting supported
	 * vPMU features do not account for hybrid PMUs. Disable vPMU support
	 * for hybrid PMUs until KVM gains a way to let userspace opt-in.
	 */
	if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
		enable_pmu = false;
		memset(&kvm_host_pmu, 0, sizeof(kvm_host_pmu));
	} else {
		perf_get_x86_pmu_capability(&kvm_host_pmu);
	}

	if (enable_pmu) {
		/*
		 * WARN if perf did NOT disable the hardware PMU when the
		 * architecturally required GP counters aren't present, i.e.
		 * if there is a non-zero number of counters, but fewer than
		 * what is architecturally required.
		 */
		if (!kvm_host_pmu.num_counters_gp ||
		    WARN_ON_ONCE(kvm_host_pmu.num_counters_gp < min_nr_gp_ctrs))
			enable_pmu = false;
		else if (is_intel && !kvm_host_pmu.version)
			enable_pmu = false;
	}

	if (!enable_pmu) {
		memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap));
		return;
	}

	memcpy(&kvm_pmu_cap, &kvm_host_pmu, sizeof(kvm_host_pmu));
	kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2);
	kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp,
					  pmu_ops->MAX_NR_GP_COUNTERS);
	kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
					     KVM_MAX_NR_FIXED_COUNTERS);

	kvm_pmu_eventsel.INSTRUCTIONS_RETIRED =
		perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS);
	kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED =
		perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
}

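/*
 * Record an overflow for @pmc in the guest's global status and, unless the
 * overflow is deliberately suppressed (see the PEBS handling below), request
 * a PMI injection into the guest.
 */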
static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	bool skip_pmi = false;

	if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
		if (!in_pmi) {
			/*
			 * TODO: KVM is currently _choosing_ to not generate records
			 * for emulated instructions, avoiding BUFFER_OVF PMI when
			 * there are no records.  Strictly speaking, records should
			 * also be generated in the right context to improve sampling
			 * accuracy.
			 */
			skip_pmi = true;
		} else {
			/* Indicate PEBS overflow PMI to guest. */
			skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
						      (unsigned long *)&pmu->global_status);
		}
	} else {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
	}

	if (pmc->intr && !skip_pmi)
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;

	/*
	 * Ignore asynchronous overflow events for counters that are scheduled
	 * to be reprogrammed, e.g. if a PMI for the previous event races with
	 * KVM's handling of a related guest WRMSR.
	 */
	if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
		return;

	__kvm_perf_overflow(pmc, true);

	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}

static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
{
	/*
	 * For some model-specific PEBS counters with special capabilities
	 * (PDIR, PDIR++, PDIST), KVM needs to raise the event's precise
	 * level to the maximum value (currently 3, backwards compatible)
	 * so that the perf subsystem assigns a hardware counter with that
	 * capability to the vPMC.
	 */
	if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
	    (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
		return 3;

	/*
	 * A non-zero precision level turns an ordinary guest event into a
	 * guest PEBS event and triggers the host PEBS PMI handler to
	 * determine whether the PEBS overflow PMI comes from the host
	 * counters or the guest.
	 */
	return 1;
}

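/*
 * Translate a vPMC's current value into the perf sample period, i.e. the
 * number of events remaining until the counter wraps.  For example, a 48-bit
 * counter holding 0xfffffffffff0 yields a period of 0x10 (16 events to
 * overflow); a counter at zero yields a full-width period.
 */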
static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
{
	u64 sample_period = (-counter_value) & pmc_bitmask(pmc);

	if (!sample_period)
		sample_period = pmc_bitmask(pmc) + 1;
	return sample_period;
}

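/*
 * Back a vPMC with a host perf_event: build the perf_event_attr from the
 * guest's configuration (CPL filtering, sample period, PEBS precision) and
 * create a pinned, host-excluded kernel counter whose overflow handler
 * forwards overflows back into the vPMU.
 */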
static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
				 bool exclude_user, bool exclude_kernel,
				 bool intr)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};
	bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);

	attr.sample_period = get_sample_period(pmc, pmc->counter);

	if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
	    (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period. Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
	}
	if (pebs) {
		/*
		 * For most PEBS hardware events, the difference in the software
		 * precision levels of guest and host PEBS events will not affect
		 * the accuracy of the PEBS profiling result, because the "event IP"
		 * in the PEBS record is calibrated on the guest side.
		 */
		attr.precise_ip = pmc_get_pebs_precise_level(pmc);
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
				     PTR_ERR(event), pmc->idx);
		return PTR_ERR(event);
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	pmc->is_paused = false;
	pmc->intr = intr || pebs;
	return 0;
}

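/*
 * Pause the backing perf_event (if any), fold its count and any emulated
 * events into pmc->counter, and report whether the accumulated emulated
 * events pushed the counter past its overflow boundary.
 */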
static bool pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;
	u64 prev_counter;

	/* update counter, reset event value to avoid redundant accumulation */
	if (pmc->perf_event && !pmc->is_paused)
		counter += perf_event_pause(pmc->perf_event, true);

	/*
	 * Snapshot the previous counter *after* accumulating state from perf.
	 * If overflow already happened, hardware (via perf) is responsible for
	 * generating a PMI.  KVM just needs to detect overflow on emulated
	 * counter events that haven't yet been processed.
	 */
	prev_counter = counter & pmc_bitmask(pmc);

	counter += pmc->emulated_counter;
	pmc->counter = counter & pmc_bitmask(pmc);

	pmc->emulated_counter = 0;
	pmc->is_paused = true;

	return pmc->counter < prev_counter;
}

static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* recalibrate sample period and check if it's accepted by perf core */
	if (is_sampling_event(pmc->perf_event) &&
	    perf_event_period(pmc->perf_event,
			      get_sample_period(pmc, pmc->counter)))
		return false;

	if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
	    (!!pmc->perf_event->attr.precise_ip))
		return false;

	/* reuse perf_event to serve as pmc_reprogram_counter() does */
	perf_event_enable(pmc->perf_event);
	pmc->is_paused = false;

	return true;
}

static void pmc_release_perf_event(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		perf_event_release_kernel(pmc->perf_event);
		pmc->perf_event = NULL;
		pmc->current_config = 0;
		pmc_to_pmu(pmc)->event_count--;
	}
}

static void pmc_stop_counter(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		pmc->counter = pmc_read_counter(pmc);
		pmc_release_perf_event(pmc);
	}
}

static void pmc_update_sample_period(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event || pmc->is_paused ||
	    !is_sampling_event(pmc->perf_event))
		return;

	perf_event_period(pmc->perf_event,
			  get_sample_period(pmc, pmc->counter));
}

void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
{
	/*
	 * Drop any unconsumed accumulated counts; the WRMSR is a write, not a
	 * read-modify-write.  Adjust the counter value so that its value is
	 * relative to the current count, as reading the current count from
	 * perf is faster than pausing and reprogramming the event in order to
	 * reset it to '0'.  Note, this very sneakily offsets the accumulated
	 * emulated count too, by using pmc_read_counter()!
	 */
	pmc->emulated_counter = 0;
	pmc->counter += val - pmc_read_counter(pmc);
	pmc->counter &= pmc_bitmask(pmc);
	pmc_update_sample_period(pmc);
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(pmc_write_counter);

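/*
 * Comparison helpers for the PMU event filter.  Filter entries are kept in
 * masked-event format and sorted by event select (and, for the full sort, by
 * the EXCLUDE flag), which allows lookups via bsearch() in
 * filter_contains_match().
 */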
static int filter_cmp(const void *pa, const void *pb, u64 mask)
{
	u64 a = *(u64 *)pa & mask;
	u64 b = *(u64 *)pb & mask;

	return (a > b) - (a < b);
}


static int filter_sort_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
				   KVM_PMU_MASKED_ENTRY_EXCLUDE));
}

/*
 * For the event filter, searching is done on the 'includes' list and
 * 'excludes' list separately rather than on the 'events' list (which
 * has both).  As a result the exclude bit can be ignored.
 */
static int filter_event_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
}

static int find_filter_index(u64 *events, u64 nevents, u64 key)
{
	u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
			  filter_event_cmp);

	if (!fe)
		return -1;

	return fe - events;
}

static bool is_filter_entry_match(u64 filter_event, u64 umask)
{
	u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
	u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;

	BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
		      (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
		     ARCH_PERFMON_EVENTSEL_UMASK);

	return (umask & mask) == match;
}

static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
{
	u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
	u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
	int i, index;

	index = find_filter_index(events, nevents, event_select);
	if (index < 0)
		return false;

	/*
	 * Entries are sorted by the event select.  Walk the list in both
	 * directions to process all entries with the targeted event select.
	 */
	for (i = index; i < nevents; i++) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	for (i = index - 1; i >= 0; i--) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	return false;
}

static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
				u64 eventsel)
{
	if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
	    !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
		return f->action == KVM_PMU_EVENT_ALLOW;

	return f->action == KVM_PMU_EVENT_DENY;
}

static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
				   int idx)
{
	int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;

	if (filter->action == KVM_PMU_EVENT_DENY &&
	    test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;
	if (filter->action == KVM_PMU_EVENT_ALLOW &&
	    !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;

	return true;
}

static bool pmc_is_event_allowed(struct kvm_pmc *pmc)
{
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (!filter)
		return true;

	if (pmc_is_gp(pmc))
		return is_gp_event_allowed(filter, pmc->eventsel);

	return is_fixed_event_allowed(filter, pmc->idx);
}

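/*
 * (Re)program the host perf_event backing @pmc after a relevant guest write,
 * e.g. to an event selector or a global control.  Counters that are disabled
 * or disallowed by the PMU event filter are left without a perf_event; an
 * existing event is reused when the effective configuration is unchanged.
 */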
static int reprogram_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 eventsel = pmc->eventsel;
	u64 new_config = eventsel;
	bool emulate_overflow;
	u8 fixed_ctr_ctrl;

	emulate_overflow = pmc_pause_counter(pmc);

	if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) ||
	    !pmc_is_event_allowed(pmc))
		return 0;

	if (emulate_overflow)
		__kvm_perf_overflow(pmc, false);

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	if (pmc_is_fixed(pmc)) {
		fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
						  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL)
			eventsel |= ARCH_PERFMON_EVENTSEL_OS;
		if (fixed_ctr_ctrl & INTEL_FIXED_0_USER)
			eventsel |= ARCH_PERFMON_EVENTSEL_USR;
		if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI)
			eventsel |= ARCH_PERFMON_EVENTSEL_INT;
		new_config = (u64)fixed_ctr_ctrl;
	}

	if (pmc->current_config == new_config && pmc_resume_counter(pmc))
		return 0;

	pmc_release_perf_event(pmc);

	pmc->current_config = new_config;

	return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
				     (eventsel & pmu->raw_event_mask),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
				     eventsel & ARCH_PERFMON_EVENTSEL_INT);
}

static bool pmc_is_event_match(struct kvm_pmc *pmc, u64 eventsel)
{
	/*
	 * Ignore checks for edge detect (all events currently emulated by KVM
	 * are always rising edges), pin control (unsupported by modern CPUs),
	 * and counter mask and its invert flag (KVM doesn't emulate multiple
	 * events in a single clock cycle).
	 *
	 * Note, the uppermost nibble of AMD's mask overlaps Intel's IN_TX (bit
	 * 32) and IN_TXCP (bit 33), as well as two reserved bits (bits 35:34).
	 * Checking the "in HLE/RTM transaction" flags is correct as the vCPU
	 * can't be in a transaction if KVM is emulating an instruction.
	 *
	 * Checking the reserved bits might be wrong if they are defined in the
	 * future, but so could ignoring them, so do the simple thing for now.
	 */
	return !((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB);
}

void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc)
{
	bitmap_clear(pmu->pmc_counting_instructions, pmc->idx, 1);
	bitmap_clear(pmu->pmc_counting_branches, pmc->idx, 1);

	/*
	 * Do NOT consult the PMU event filters, as the filters must be checked
	 * at the time of emulation to ensure KVM uses fresh information, e.g.
	 * omitting a PMC from a bitmap could result in a missed event if the
	 * filter is changed to allow counting the event.
	 */
	if (!pmc_is_locally_enabled(pmc))
		return;

	if (pmc_is_event_match(pmc, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED))
		bitmap_set(pmu->pmc_counting_instructions, pmc->idx, 1);

	if (pmc_is_event_match(pmc, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED))
		bitmap_set(pmu->pmc_counting_branches, pmc->idx, 1);
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_recalc_pmc_emulation);

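/*
 * Process pending KVM_REQ_PMU work: reprogram all counters flagged in
 * reprogram_pmi, opportunistically release perf_events for idle vPMCs, and
 * refresh the bitmaps used to emulate instruction/branch retired events.
 */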
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int bit;

	bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	/*
	 * The reprogramming bitmap can be written asynchronously by something
	 * other than the task that holds vcpu->mutex, take care to clear only
	 * the bits that will actually be processed.
	 */
	BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
	atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);

	kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
		/*
		 * If reprogramming fails, e.g. due to contention, re-set the
		 * reprogram bit, i.e. opportunistically try again on the
		 * next PMU refresh.  Don't make a new request as doing so can
		 * stall the guest if reprogramming repeatedly fails.
		 */
		if (reprogram_counter(pmc))
			set_bit(pmc->idx, pmu->reprogram_pmi);
	}

	/*
	 * Release unused perf_events if the corresponding guest MSRs weren't
	 * accessed during the last vCPU time slice (need_cleanup is set when
	 * the vCPU is scheduled back in).
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);

	kvm_for_each_pmc(pmu, pmc, bit, bitmap)
		kvm_pmu_recalc_pmc_emulation(pmu, pmc);
}

int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
	/*
	 * On Intel, VMX interception has priority over RDPMC exceptions that
	 * aren't already handled by the emulator, i.e. there are no additional
	 * checks needed for Intel PMUs.
	 *
	 * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
	 * i.e. an invalid PMC results in a #GP, not #VMEXIT.
	 */
	if (!kvm_pmu_ops.check_rdpmc_early)
		return 0;

	return kvm_pmu_call(check_rdpmc_early)(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

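/*
 * Emulate RDPMC.  VMware backdoor "counters" are handled separately; for
 * real PMCs, the vendor hook translates ECX to a kvm_pmc and the CR4.PCE/CPL
 * restrictions are enforced before reading the counter.
 */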
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
	    (kvm_x86_call(get_cpl)(vcpu) != 0) &&
	    kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu)) {
		kvm_pmu_call(deliver_pmi)(vcpu);
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
	}
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_CORE_PERF_GLOBAL_CTRL:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
	default:
		break;
	}
	return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) ||
		kvm_pmu_call(is_valid_msr)(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;

	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		msr_info->data = pmu->global_status;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
	case MSR_CORE_PERF_GLOBAL_CTRL:
		msr_info->data = pmu->global_ctrl;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		msr_info->data = 0;
		break;
	default:
		return kvm_pmu_call(get_msr)(vcpu, msr_info);
	}

	return 0;
}

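/*
 * Handle guest (and host-initiated) writes to the PMU MSRs that are common
 * to Intel and AMD, i.e. the global control/status family; everything else
 * is forwarded to the vendor implementation.
 */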
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;
	u64 data = msr_info->data;
	u64 diff;

	/*
	 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
	 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
	 */
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			return 1; /* RO MSR */
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		/* Per PPR, Read-only MSR. Writes are ignored. */
		if (!msr_info->host_initiated)
			break;

		if (data & pmu->global_status_rsvd)
			return 1;

		pmu->global_status = data;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_rsvd;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

		if (pmu->global_ctrl != data) {
			diff = pmu->global_ctrl ^ data;
			pmu->global_ctrl = data;
			reprogram_counters(pmu, diff);
		}
		break;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		/*
		 * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in
		 * GLOBAL_STATUS, and so the set of reserved bits is the same.
		 */
		if (data & pmu->global_status_rsvd)
			return 1;
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
		if (!msr_info->host_initiated)
			pmu->global_status &= ~data;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
		if (!msr_info->host_initiated)
			pmu->global_status |= data & ~pmu->global_status_rsvd;
		break;
	default:
		kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
		return kvm_pmu_call(set_msr)(vcpu, msr_info);
	}

	return 0;
}

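/*
 * Stop and release all counters and zero the vPMU's register state; used
 * both when destroying the vCPU and before realizing a new vPMU model in
 * kvm_pmu_refresh().
 */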
static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	pmu->need_cleanup = false;

	bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
		pmc_stop_counter(pmc);
		pmc->counter = 0;
		pmc->emulated_counter = 0;

		if (pmc_is_gp(pmc))
			pmc->eventsel = 0;
	}

	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;

	kvm_pmu_call(reset)(vcpu);
}


/*
 * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
 * and/or PERF_CAPABILITIES.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
		return;

	/*
	 * Stop/release all existing counters/events before realizing the new
	 * vPMU model.
	 */
	kvm_pmu_reset(vcpu);

	pmu->version = 0;
	pmu->nr_arch_gp_counters = 0;
	pmu->nr_arch_fixed_counters = 0;
	pmu->counter_bitmask[KVM_PMC_GP] = 0;
	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
	pmu->reserved_bits = 0xffffffff00200000ull;
	pmu->raw_event_mask = X86_RAW_EVENT_MASK;
	pmu->global_ctrl_rsvd = ~0ull;
	pmu->global_status_rsvd = ~0ull;
	pmu->fixed_ctr_ctrl_rsvd = ~0ull;
	pmu->pebs_enable_rsvd = ~0ull;
	pmu->pebs_data_cfg_rsvd = ~0ull;
	bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);

	if (!vcpu->kvm->arch.enable_pmu)
		return;

	kvm_pmu_call(refresh)(vcpu);

	/*
	 * At RESET, both Intel and AMD CPUs set all enable bits for general
	 * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
	 * was written for v1 PMUs doesn't unknowingly leave GP counters
	 * disabled in the global controls).  Emulate that behavior when
	 * refreshing the PMU so that userspace doesn't need to manually set
	 * PERF_GLOBAL_CTRL.
	 */
	if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
		pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);

	bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters);
	bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX,
		   pmu->nr_arch_fixed_counters);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	kvm_pmu_call(init)(vcpu);
}

/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, bitmask) {
		if (pmc->perf_event && !pmc_is_locally_enabled(pmc))
			pmc_stop_counter(pmc);
	}

	kvm_pmu_call(cleanup)(vcpu);

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
	pmc->emulated_counter++;
	kvm_pmu_request_counter_reprogram(pmc);
}

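/*
 * Check whether @pmc is configured to count at the guest's current privilege
 * level, i.e. honor the OS/USR (or fixed-counter kernel/user) event
 * qualifiers when emulating an event.
 */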
static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
	bool select_os, select_user;
	u64 config;

	if (pmc_is_gp(pmc)) {
		config = pmc->eventsel;
		select_os = config & ARCH_PERFMON_EVENTSEL_OS;
		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
	} else {
		config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
					  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		select_os = config & INTEL_FIXED_0_KERNEL;
		select_user = config & INTEL_FIXED_0_USER;
	}

	/*
	 * Skip the CPL lookup, which isn't free on Intel, if the result will
	 * be the same regardless of the CPL.
	 */
	if (select_os == select_user)
		return select_os;

	return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os :
							 select_user;
}

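/*
 * Bump all enabled, filter-allowed counters in @event_pmcs by one emulated
 * event, e.g. when KVM emulates an instruction or branch on the guest's
 * behalf, and schedule the affected counters for reprogramming so overflow
 * is detected.
 */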
static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu,
				  const unsigned long *event_pmcs)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i, idx;

	BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);

	if (bitmap_empty(event_pmcs, X86_PMC_IDX_MAX))
		return;

	if (!kvm_pmu_has_perf_global_ctrl(pmu))
		bitmap_copy(bitmap, event_pmcs, X86_PMC_IDX_MAX);
	else if (!bitmap_and(bitmap, event_pmcs,
			     (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
		return;

	idx = srcu_read_lock(&vcpu->kvm->srcu);
	kvm_for_each_pmc(pmu, pmc, i, bitmap) {
		if (!pmc_is_event_allowed(pmc) || !cpl_is_matched(pmc))
			continue;

		kvm_pmu_incr_counter(pmc);
	}
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
}

void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu)
{
	kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_instructions);
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_instruction_retired);

void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu)
{
	kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_branches);
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_branch_retired);

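/*
 * Validation and canonicalization helpers for the PMU event filter ioctl:
 * masked filters must not set bits outside the supported encoding, and
 * legacy (unmasked) filters are converted to masked form so a single lookup
 * path can serve both.
 */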
static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
{
	u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
		   KVM_PMU_MASKED_ENTRY_UMASK_MASK |
		   KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
		   KVM_PMU_MASKED_ENTRY_EXCLUDE;
	int i;

	for (i = 0; i < filter->nevents; i++) {
		if (filter->events[i] & ~mask)
			return false;
	}

	return true;
}

static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
{
	int i, j;

	for (i = 0, j = 0; i < filter->nevents; i++) {
		/*
		 * Skip events that are impossible to match against a guest
		 * event.  When filtering, only the event select + unit mask
		 * of the guest event is used.  To maintain backwards
		 * compatibility, impossible filters can't be rejected :-(
		 */
		if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
					  ARCH_PERFMON_EVENTSEL_UMASK))
			continue;
		/*
		 * Convert userspace events to a common in-kernel event so
		 * only one code path is needed to support both events.  For
		 * the in-kernel events use masked events because they are
		 * flexible enough to handle both cases.  To convert to masked
		 * events all that's needed is to add an "all ones" umask_mask
		 * (unmasked filter events don't support EXCLUDE).
		 */
		filter->events[j++] = filter->events[i] |
				      (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
	}

	filter->nevents = j;
}

static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
{
	int i;

	if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
		convert_to_masked_filter(filter);
	else if (!is_masked_filter_valid(filter))
		return -EINVAL;

	/*
	 * Sort entries by event select and includes vs. excludes so that all
	 * entries for a given event select can be processed efficiently during
	 * filtering.  The EXCLUDE flag uses a more significant bit than the
	 * event select, and so the sorted list is also effectively split into
	 * includes and excludes sub-lists.
	 */
	sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
	     filter_sort_cmp, NULL);

	i = filter->nevents;
	/* Find the first EXCLUDE event (only supported for masked events). */
	if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
		for (i = 0; i < filter->nevents; i++) {
			if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
				break;
		}
	}

	filter->nr_includes = i;
	filter->nr_excludes = filter->nevents - filter->nr_includes;
	filter->includes = filter->events;
	filter->excludes = filter->events + filter->nr_includes;

	return 0;
}

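/*
 * KVM_SET_PMU_EVENT_FILTER: install a new (or replacement) PMU event filter
 * for the VM, then force all vCPUs to reprogram their counters so the new
 * filter takes effect immediately.
 */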
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter __user *user_filter = argp;
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm_pmu_event_filter tmp;
	struct kvm_vcpu *vcpu;
	unsigned long i;
	size_t size;
	int r;

	if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	filter->action = tmp.action;
	filter->nevents = tmp.nevents;
	filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
	filter->flags = tmp.flags;

	r = -EFAULT;
	if (copy_from_user(filter->events, user_filter->events,
			   sizeof(filter->events[0]) * filter->nevents))
		goto cleanup;

	r = prepare_filter_lists(filter);
	if (r)
		goto cleanup;

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	mutex_unlock(&kvm->lock);
	synchronize_srcu_expedited(&kvm->srcu);

	BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
		     sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));

	kvm_for_each_vcpu(i, vcpu, kvm)
		atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);

	kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);

	r = 0;
cleanup:
	kfree(filter);
	return r;
}