GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/events/intel/pt.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Intel(R) Processor Trace PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 *
 * Intel PT is specified in the Intel Architecture Instruction Set Extensions
 * Programming Reference:
 * http://software.intel.com/en-us/intel-isa-extensions
 */

#undef DEBUG

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/bits.h>
#include <linux/limits.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <asm/cpuid/api.h>
#include <asm/perf_event.h>
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/intel_pt.h>
#include <asm/cpu_device_id.h>
#include <asm/msr.h>

#include "../perf_event.h"
#include "pt.h"

static DEFINE_PER_CPU(struct pt, pt_ctx);
33
34
static struct pt_pmu pt_pmu;
35
36
/*
37
* Capabilities of Intel PT hardware, such as number of address bits or
38
* supported output schemes, are cached and exported to userspace as "caps"
39
* attribute group of pt pmu device
40
* (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
41
* relevant bits together with intel_pt traces.
42
*
43
* These are necessary for both trace decoding (payloads_lip, contains address
44
* width encoded in IP-related packets), and event configuration (bitmasks with
45
* permitted values for certain bit fields).
46
*/
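/*
 * Example (illustrative only): userspace can snapshot these values by reading
 * the files under /sys/bus/event_source/devices/intel_pt/caps/ (one file per
 * capability, named after the pt_caps[] entries below) and store them next to
 * the AUX trace data, so that the decoder later knows, for instance, whether
 * payloads_lip was set on the machine that produced the trace.
 */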
47
#define PT_CAP(_n, _l, _r, _m) \
48
[PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \
49
.reg = _r, .mask = _m }
50
51
static struct pt_cap_desc {
52
const char *name;
53
u32 leaf;
54
u8 reg;
55
u32 mask;
56
} pt_caps[] = {
57
PT_CAP(max_subleaf, 0, CPUID_EAX, 0xffffffff),
58
PT_CAP(cr3_filtering, 0, CPUID_EBX, BIT(0)),
59
PT_CAP(psb_cyc, 0, CPUID_EBX, BIT(1)),
60
PT_CAP(ip_filtering, 0, CPUID_EBX, BIT(2)),
61
PT_CAP(mtc, 0, CPUID_EBX, BIT(3)),
62
PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)),
63
PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)),
64
PT_CAP(event_trace, 0, CPUID_EBX, BIT(7)),
65
PT_CAP(tnt_disable, 0, CPUID_EBX, BIT(8)),
66
PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)),
67
PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)),
68
PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)),
69
PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)),
70
PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)),
71
PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x7),
72
PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000),
73
PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff),
74
PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000),
75
};
76
77
u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability)
78
{
79
struct pt_cap_desc *cd = &pt_caps[capability];
80
u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
81
unsigned int shift = __ffs(cd->mask);
82
83
return (c & cd->mask) >> shift;
84
}
85
EXPORT_SYMBOL_GPL(intel_pt_validate_cap);
86
87
u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
88
{
89
return intel_pt_validate_cap(pt_pmu.caps, cap);
90
}
91
EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap);
92
93
static ssize_t pt_cap_show(struct device *cdev,
94
struct device_attribute *attr,
95
char *buf)
96
{
97
struct dev_ext_attribute *ea =
98
container_of(attr, struct dev_ext_attribute, attr);
99
enum pt_capabilities cap = (long)ea->var;
100
101
return snprintf(buf, PAGE_SIZE, "%x\n", intel_pt_validate_hw_cap(cap));
102
}
103
104
static struct attribute_group pt_cap_group __ro_after_init = {
105
.name = "caps",
106
};
107
108
PMU_FORMAT_ATTR(pt, "config:0" );
109
PMU_FORMAT_ATTR(cyc, "config:1" );
110
PMU_FORMAT_ATTR(pwr_evt, "config:4" );
111
PMU_FORMAT_ATTR(fup_on_ptw, "config:5" );
112
PMU_FORMAT_ATTR(mtc, "config:9" );
113
PMU_FORMAT_ATTR(tsc, "config:10" );
114
PMU_FORMAT_ATTR(noretcomp, "config:11" );
115
PMU_FORMAT_ATTR(ptw, "config:12" );
116
PMU_FORMAT_ATTR(branch, "config:13" );
117
PMU_FORMAT_ATTR(event, "config:31" );
118
PMU_FORMAT_ATTR(notnt, "config:55" );
119
PMU_FORMAT_ATTR(mtc_period, "config:14-17" );
120
PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" );
121
PMU_FORMAT_ATTR(psb_period, "config:24-27" );
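/*
 * Illustrative mapping from the format attributes above to a perf invocation
 * (the exact tool syntax is not dictated by this file): something like
 *
 *	perf record -e intel_pt/cyc,cyc_thresh=1,mtc,mtc_period=3/u -- workload
 *
 * ends up setting config bit 1 (cyc), bits 19-22 (cyc_thresh), bit 9 (mtc)
 * and bits 14-17 (mtc_period), which pt_event_valid() then checks against the
 * CPUID-enumerated capabilities.
 */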
122
123
static struct attribute *pt_formats_attr[] = {
124
&format_attr_pt.attr,
125
&format_attr_cyc.attr,
126
&format_attr_pwr_evt.attr,
127
&format_attr_event.attr,
128
&format_attr_notnt.attr,
129
&format_attr_fup_on_ptw.attr,
130
&format_attr_mtc.attr,
131
&format_attr_tsc.attr,
132
&format_attr_noretcomp.attr,
133
&format_attr_ptw.attr,
134
&format_attr_branch.attr,
135
&format_attr_mtc_period.attr,
136
&format_attr_cyc_thresh.attr,
137
&format_attr_psb_period.attr,
138
NULL,
139
};
140
141
static struct attribute_group pt_format_group = {
142
.name = "format",
143
.attrs = pt_formats_attr,
144
};
145
146
static ssize_t
147
pt_timing_attr_show(struct device *dev, struct device_attribute *attr,
148
char *page)
149
{
150
struct perf_pmu_events_attr *pmu_attr =
151
container_of(attr, struct perf_pmu_events_attr, attr);
152
153
switch (pmu_attr->id) {
154
case 0:
155
return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio);
156
case 1:
157
return sprintf(page, "%u:%u\n",
158
pt_pmu.tsc_art_num,
159
pt_pmu.tsc_art_den);
160
default:
161
break;
162
}
163
164
return -EINVAL;
165
}
166
167
PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0,
168
pt_timing_attr_show);
169
PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1,
170
pt_timing_attr_show);
171
172
static struct attribute *pt_timing_attr[] = {
173
&timing_attr_max_nonturbo_ratio.attr.attr,
174
&timing_attr_tsc_art_ratio.attr.attr,
175
NULL,
176
};
177
178
static struct attribute_group pt_timing_group = {
179
.attrs = pt_timing_attr,
180
};
181
182
static const struct attribute_group *pt_attr_groups[] = {
183
&pt_cap_group,
184
&pt_format_group,
185
&pt_timing_group,
186
NULL,
187
};
188
189
static int __init pt_pmu_hw_init(void)
190
{
191
struct dev_ext_attribute *de_attrs;
192
struct attribute **attrs;
193
size_t size;
194
u64 reg;
195
int ret;
196
long i;
197
198
rdmsrq(MSR_PLATFORM_INFO, reg);
199
pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8;
200
201
/*
202
* if available, read in the TSC to core crystal clock ratio;
* otherwise, a zero numerator stands for "not enumerated",
* as per the SDM
205
*/
206
if (boot_cpu_data.cpuid_level >= CPUID_LEAF_TSC) {
207
u32 eax, ebx, ecx, edx;
208
209
cpuid(CPUID_LEAF_TSC, &eax, &ebx, &ecx, &edx);
210
211
pt_pmu.tsc_art_num = ebx;
212
pt_pmu.tsc_art_den = eax;
213
}
214
215
/* model-specific quirks */
216
switch (boot_cpu_data.x86_vfm) {
217
case INTEL_BROADWELL:
218
case INTEL_BROADWELL_D:
219
case INTEL_BROADWELL_G:
220
case INTEL_BROADWELL_X:
221
/* not setting BRANCH_EN will #GP, erratum BDM106 */
222
pt_pmu.branch_en_always_on = true;
223
break;
224
default:
225
break;
226
}
227
228
if (boot_cpu_has(X86_FEATURE_VMX)) {
229
/*
230
* Intel SDM, 36.5 "Tracing post-VMXON" says that
231
* "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
232
* post-VMXON.
233
*/
234
rdmsrq(MSR_IA32_VMX_MISC, reg);
235
if (reg & BIT(14))
236
pt_pmu.vmx = true;
237
}
238
239
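/*
 * Leaf 20 (0x14) is the Intel PT enumeration leaf; its sub-leaves fill
 * pt_pmu.caps, which backs both the "caps" sysfs group and the
 * intel_pt_validate_hw_cap() checks used throughout this driver.
 */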
for (i = 0; i < PT_CPUID_LEAVES; i++) {
240
cpuid_count(20, i,
241
&pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM],
242
&pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM],
243
&pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM],
244
&pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]);
245
}
246
247
ret = -ENOMEM;
248
size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
249
attrs = kzalloc(size, GFP_KERNEL);
250
if (!attrs)
251
goto fail;
252
253
size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
254
de_attrs = kzalloc(size, GFP_KERNEL);
255
if (!de_attrs)
256
goto fail;
257
258
for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
259
struct dev_ext_attribute *de_attr = de_attrs + i;
260
261
de_attr->attr.attr.name = pt_caps[i].name;
262
263
sysfs_attr_init(&de_attr->attr.attr);
264
265
de_attr->attr.attr.mode = S_IRUGO;
266
de_attr->attr.show = pt_cap_show;
267
de_attr->var = (void *)i;
268
269
attrs[i] = &de_attr->attr.attr;
270
}
271
272
pt_cap_group.attrs = attrs;
273
274
return 0;
275
276
fail:
277
kfree(attrs);
278
279
return ret;
280
}
281
282
#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \
283
RTIT_CTL_CYC_THRESH | \
284
RTIT_CTL_PSB_FREQ)
285
286
#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \
287
RTIT_CTL_MTC_RANGE)
288
289
#define RTIT_CTL_PTW (RTIT_CTL_PTW_EN | \
290
RTIT_CTL_FUP_ON_PTW)
291
292
/*
293
* Bit 0 (TraceEn) in the attr.config is meaningless as the
294
* corresponding bit in the RTIT_CTL can only be controlled
295
* by the driver; therefore, repurpose it to mean: pass
296
* through the bit that was previously assumed to be always
297
* on for PT, thereby allowing the user to *not* set it if
298
* they so wish. See also pt_event_valid() and pt_config().
299
*/
300
#define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN
301
302
#define PT_CONFIG_MASK (RTIT_CTL_TRACEEN | \
303
RTIT_CTL_TSC_EN | \
304
RTIT_CTL_DISRETC | \
305
RTIT_CTL_BRANCH_EN | \
306
RTIT_CTL_CYC_PSB | \
307
RTIT_CTL_MTC | \
308
RTIT_CTL_PWR_EVT_EN | \
309
RTIT_CTL_EVENT_EN | \
310
RTIT_CTL_NOTNT | \
311
RTIT_CTL_FUP_ON_PTW | \
312
RTIT_CTL_PTW_EN)
313
314
static bool pt_event_valid(struct perf_event *event)
315
{
316
u64 config = event->attr.config;
317
u64 allowed, requested;
318
319
if ((config & PT_CONFIG_MASK) != config)
320
return false;
321
322
if (config & RTIT_CTL_CYC_PSB) {
323
if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc))
324
return false;
325
326
allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods);
327
requested = (config & RTIT_CTL_PSB_FREQ) >>
328
RTIT_CTL_PSB_FREQ_OFFSET;
329
if (requested && (!(allowed & BIT(requested))))
330
return false;
331
332
allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds);
333
requested = (config & RTIT_CTL_CYC_THRESH) >>
334
RTIT_CTL_CYC_THRESH_OFFSET;
335
if (requested && (!(allowed & BIT(requested))))
336
return false;
337
}
338
339
if (config & RTIT_CTL_MTC) {
340
/*
341
* In the unlikely case that CPUID lists valid mtc periods,
342
* but not the mtc capability, drop out here.
343
*
344
* Spec says that setting mtc period bits while mtc bit in
345
* CPUID is 0 will #GP, so better safe than sorry.
346
*/
347
if (!intel_pt_validate_hw_cap(PT_CAP_mtc))
348
return false;
349
350
allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods);
351
if (!allowed)
352
return false;
353
354
requested = (config & RTIT_CTL_MTC_RANGE) >>
355
RTIT_CTL_MTC_RANGE_OFFSET;
356
357
if (!(allowed & BIT(requested)))
358
return false;
359
}
360
361
if (config & RTIT_CTL_PWR_EVT_EN &&
362
!intel_pt_validate_hw_cap(PT_CAP_power_event_trace))
363
return false;
364
365
if (config & RTIT_CTL_EVENT_EN &&
366
!intel_pt_validate_hw_cap(PT_CAP_event_trace))
367
return false;
368
369
if (config & RTIT_CTL_NOTNT &&
370
!intel_pt_validate_hw_cap(PT_CAP_tnt_disable))
371
return false;
372
373
if (config & RTIT_CTL_PTW) {
374
if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite))
375
return false;
376
377
/* FUPonPTW without PTW doesn't make sense */
378
if ((config & RTIT_CTL_FUP_ON_PTW) &&
379
!(config & RTIT_CTL_PTW_EN))
380
return false;
381
}
382
383
/*
384
* Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
385
* clears the assumption that BranchEn must always be enabled,
386
* as was the case with the first implementation of PT.
387
* If this bit is not set, the legacy behavior is preserved
388
* for compatibility with the older userspace.
389
*
390
* Re-using bit 0 for this purpose is fine because it is never
391
* directly set by the user; previous attempts at setting it in
392
* the attr.config resulted in -EINVAL.
393
*/
394
if (config & RTIT_CTL_PASSTHROUGH) {
395
/*
396
* Disallow not setting BRANCH_EN where BRANCH_EN is
397
* always required.
398
*/
399
if (pt_pmu.branch_en_always_on &&
400
!(config & RTIT_CTL_BRANCH_EN))
401
return false;
402
} else {
403
/*
404
* Disallow BRANCH_EN without the PASSTHROUGH.
405
*/
406
if (config & RTIT_CTL_BRANCH_EN)
407
return false;
408
}
409
410
return true;
411
}
412
413
/*
414
* PT configuration helpers
415
* These all are cpu affine and operate on a local PT
416
*/
417
418
static void pt_config_start(struct perf_event *event)
419
{
420
struct pt *pt = this_cpu_ptr(&pt_ctx);
421
u64 ctl = event->hw.aux_config;
422
423
if (READ_ONCE(event->hw.aux_paused))
424
return;
425
426
ctl |= RTIT_CTL_TRACEEN;
427
if (READ_ONCE(pt->vmx_on))
428
perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
429
else
430
wrmsrq(MSR_IA32_RTIT_CTL, ctl);
431
432
WRITE_ONCE(event->hw.aux_config, ctl);
433
}
434
435
/* Address ranges and their corresponding msr configuration registers */
436
static const struct pt_address_range {
437
unsigned long msr_a;
438
unsigned long msr_b;
439
unsigned int reg_off;
440
} pt_address_ranges[] = {
441
{
442
.msr_a = MSR_IA32_RTIT_ADDR0_A,
443
.msr_b = MSR_IA32_RTIT_ADDR0_B,
444
.reg_off = RTIT_CTL_ADDR0_OFFSET,
445
},
446
{
447
.msr_a = MSR_IA32_RTIT_ADDR1_A,
448
.msr_b = MSR_IA32_RTIT_ADDR1_B,
449
.reg_off = RTIT_CTL_ADDR1_OFFSET,
450
},
451
{
452
.msr_a = MSR_IA32_RTIT_ADDR2_A,
453
.msr_b = MSR_IA32_RTIT_ADDR2_B,
454
.reg_off = RTIT_CTL_ADDR2_OFFSET,
455
},
456
{
457
.msr_a = MSR_IA32_RTIT_ADDR3_A,
458
.msr_b = MSR_IA32_RTIT_ADDR3_B,
459
.reg_off = RTIT_CTL_ADDR3_OFFSET,
460
}
461
};
462
463
static u64 pt_config_filters(struct perf_event *event)
464
{
465
struct pt_filters *filters = event->hw.addr_filters;
466
struct pt *pt = this_cpu_ptr(&pt_ctx);
467
unsigned int range = 0;
468
u64 rtit_ctl = 0;
469
470
if (!filters)
471
return 0;
472
473
perf_event_addr_filters_sync(event);
474
475
for (range = 0; range < filters->nr_filters; range++) {
476
struct pt_filter *filter = &filters->filter[range];
477
478
/*
479
* Note, if the range has zero start/end addresses due
480
* to its dynamic object not being loaded yet, we just
481
* go ahead and program a zeroed range, which will simply
482
* produce no data. Note^2: if executable code at 0x0
483
* is a concern, we can set up an "invalid" configuration
484
* such as msr_b < msr_a.
485
*/
486
487
/* avoid redundant msr writes */
488
if (pt->filters.filter[range].msr_a != filter->msr_a) {
489
wrmsrq(pt_address_ranges[range].msr_a, filter->msr_a);
490
pt->filters.filter[range].msr_a = filter->msr_a;
491
}
492
493
if (pt->filters.filter[range].msr_b != filter->msr_b) {
494
wrmsrq(pt_address_ranges[range].msr_b, filter->msr_b);
495
pt->filters.filter[range].msr_b = filter->msr_b;
496
}
497
498
rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off;
499
}
500
501
return rtit_ctl;
502
}
503
504
static void pt_config(struct perf_event *event)
505
{
506
struct pt *pt = this_cpu_ptr(&pt_ctx);
507
struct pt_buffer *buf = perf_get_aux(&pt->handle);
508
u64 reg;
509
510
/* First round: clear STATUS, in particular the PSB byte counter. */
511
if (!event->hw.aux_config) {
512
perf_event_itrace_started(event);
513
wrmsrq(MSR_IA32_RTIT_STATUS, 0);
514
}
515
516
reg = pt_config_filters(event);
517
reg |= RTIT_CTL_TRACEEN;
518
if (!buf->single)
519
reg |= RTIT_CTL_TOPA;
520
521
/*
522
* Previously, we had BRANCH_EN on by default, but now that PT has
523
* grown features outside of branch tracing, it is useful to allow
524
* the user to disable it. Setting bit 0 in the event's attr.config
525
* allows BRANCH_EN to pass through instead of being always on. See
526
* also the comment in pt_event_valid().
527
*/
528
if (event->attr.config & BIT(0)) {
529
reg |= event->attr.config & RTIT_CTL_BRANCH_EN;
530
} else {
531
reg |= RTIT_CTL_BRANCH_EN;
532
}
533
534
if (!event->attr.exclude_kernel)
535
reg |= RTIT_CTL_OS;
536
if (!event->attr.exclude_user)
537
reg |= RTIT_CTL_USR;
538
539
reg |= (event->attr.config & PT_CONFIG_MASK);
540
541
event->hw.aux_config = reg;
542
543
/*
544
* Allow resume before starting so as not to overwrite a value set by a
545
* PMI.
546
*/
547
barrier();
548
WRITE_ONCE(pt->resume_allowed, 1);
549
/* Configuration is complete, it is now OK to handle an NMI */
550
barrier();
551
WRITE_ONCE(pt->handle_nmi, 1);
552
barrier();
553
pt_config_start(event);
554
barrier();
555
/*
556
* Allow pause after starting so its pt_config_stop() doesn't race with
557
* pt_config_start().
558
*/
559
WRITE_ONCE(pt->pause_allowed, 1);
560
}
561
562
static void pt_config_stop(struct perf_event *event)
563
{
564
struct pt *pt = this_cpu_ptr(&pt_ctx);
565
u64 ctl = READ_ONCE(event->hw.aux_config);
566
567
/* may be already stopped by a PMI */
568
if (!(ctl & RTIT_CTL_TRACEEN))
569
return;
570
571
ctl &= ~RTIT_CTL_TRACEEN;
572
if (!READ_ONCE(pt->vmx_on))
573
wrmsrq(MSR_IA32_RTIT_CTL, ctl);
574
575
WRITE_ONCE(event->hw.aux_config, ctl);
576
577
/*
578
* A wrmsr that disables trace generation serializes other PT
579
* registers and causes all data packets to be written to memory,
580
* but a fence is required for the data to become globally visible.
581
*
582
* The below WMB, separating data store and aux_head store matches
583
* the consumer's RMB that separates aux_head load and data load.
584
*/
585
wmb();
586
}
587
588
/**
589
* struct topa - ToPA metadata
590
* @list: linkage to struct pt_buffer's list of tables
591
* @offset: offset of the first entry in this table in the buffer
592
* @size: total size of all entries in this table
593
* @last: index of the last initialized entry in this table
594
* @z_count: how many times the first entry repeats
595
*/
596
struct topa {
597
struct list_head list;
598
u64 offset;
599
size_t size;
600
int last;
601
unsigned int z_count;
602
};
603
604
/*
605
* Keep ToPA table-related metadata on the same page as the actual table,
606
* taking up a few words from the top
607
*/
608
609
#define TENTS_PER_PAGE \
610
((PAGE_SIZE - sizeof(struct topa)) / sizeof(struct topa_entry))
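/*
 * With 4KiB pages and 8-byte ToPA entries this works out to roughly 500
 * entries per table; the exact count depends on sizeof(struct topa).
 */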
611
612
/**
613
* struct topa_page - page-sized ToPA table with metadata at the top
614
* @table: actual ToPA table entries, as understood by PT hardware
615
* @topa: metadata
616
*/
617
struct topa_page {
618
struct topa_entry table[TENTS_PER_PAGE];
619
struct topa topa;
620
};
621
622
static inline struct topa_page *topa_to_page(struct topa *topa)
623
{
624
return container_of(topa, struct topa_page, topa);
625
}
626
627
static inline struct topa_page *topa_entry_to_page(struct topa_entry *te)
628
{
629
return (struct topa_page *)((unsigned long)te & PAGE_MASK);
630
}
631
632
static inline phys_addr_t topa_pfn(struct topa *topa)
633
{
634
return PFN_DOWN(virt_to_phys(topa_to_page(topa)));
635
}
636
637
/* make -1 stand for the last table entry */
638
#define TOPA_ENTRY(t, i) \
639
((i) == -1 \
640
? &topa_to_page(t)->table[(t)->last] \
641
: &topa_to_page(t)->table[(i)])
642
#define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size))
643
#define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size)
644
645
static void pt_config_buffer(struct pt_buffer *buf)
646
{
647
struct pt *pt = this_cpu_ptr(&pt_ctx);
648
u64 reg, mask;
649
void *base;
650
651
if (buf->single) {
652
base = buf->data_pages[0];
653
mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7;
654
} else {
655
base = topa_to_page(buf->cur)->table;
656
mask = (u64)buf->cur_idx;
657
}
658
659
reg = virt_to_phys(base);
660
if (pt->output_base != reg) {
661
pt->output_base = reg;
662
wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, reg);
663
}
664
665
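/*
 * RTIT_OUTPUT_MASK_PTRS layout, as used here: bits 6:0 must read as ones,
 * bits 31:7 hold the ToPA table offset (or, for single range output, the
 * size mask), and bits 63:32 hold the offset within the current region.
 */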
reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32);
666
if (pt->output_mask != reg) {
667
pt->output_mask = reg;
668
wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, reg);
669
}
670
}
671
672
/**
673
* topa_alloc() - allocate page-sized ToPA table
674
* @cpu: CPU on which to allocate.
675
* @gfp: Allocation flags.
676
*
677
* Return: On success, return the pointer to ToPA table page.
678
*/
679
static struct topa *topa_alloc(int cpu, gfp_t gfp)
680
{
681
int node = cpu_to_node(cpu);
682
struct topa_page *tp;
683
struct page *p;
684
685
p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
686
if (!p)
687
return NULL;
688
689
tp = page_address(p);
690
tp->topa.last = 0;
691
692
/*
693
* In case of single-entry ToPA, always put the self-referencing END
694
* link as the 2nd entry in the table
695
*/
696
if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
697
TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p) >> TOPA_SHIFT;
698
TOPA_ENTRY(&tp->topa, 1)->end = 1;
699
}
700
701
return &tp->topa;
702
}
703
704
/**
705
* topa_free() - free a page-sized ToPA table
706
* @topa: Table to deallocate.
707
*/
708
static void topa_free(struct topa *topa)
709
{
710
free_page((unsigned long)topa);
711
}
712
713
/**
714
* topa_insert_table() - insert a ToPA table into a buffer
715
* @buf: PT buffer that's being extended.
716
* @topa: New topa table to be inserted.
717
*
718
* If it's the first table in this buffer, set up buffer's pointers
719
* accordingly; otherwise, add an END=1 link entry pointing to @topa in the current
720
* "last" table and adjust the last table pointer to @topa.
721
*/
722
static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
723
{
724
struct topa *last = buf->last;
725
726
list_add_tail(&topa->list, &buf->tables);
727
728
if (!buf->first) {
729
buf->first = buf->last = buf->cur = topa;
730
return;
731
}
732
733
topa->offset = last->offset + last->size;
734
buf->last = topa;
735
736
if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
737
return;
738
739
BUG_ON(last->last != TENTS_PER_PAGE - 1);
740
741
TOPA_ENTRY(last, -1)->base = topa_pfn(topa);
742
TOPA_ENTRY(last, -1)->end = 1;
743
}
744
745
/**
746
* topa_table_full() - check if a ToPA table is filled up
747
* @topa: ToPA table.
748
*/
749
static bool topa_table_full(struct topa *topa)
750
{
751
/* single-entry ToPA is a special case */
752
if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
753
return !!topa->last;
754
755
return topa->last == TENTS_PER_PAGE - 1;
756
}
757
758
/**
759
* topa_insert_pages() - create a list of ToPA tables
760
* @buf: PT buffer being initialized.
761
* @cpu: CPU on which to allocate.
762
* @gfp: Allocation flags.
763
*
764
* This initializes a list of ToPA tables with entries from
765
* the data_pages provided by rb_alloc_aux().
766
*
767
* Return: 0 on success or error code.
768
*/
769
static int topa_insert_pages(struct pt_buffer *buf, int cpu, gfp_t gfp)
770
{
771
struct topa *topa = buf->last;
772
int order = 0;
773
struct page *p;
774
775
p = virt_to_page(buf->data_pages[buf->nr_pages]);
776
if (PagePrivate(p))
777
order = page_private(p);
778
779
if (topa_table_full(topa)) {
780
topa = topa_alloc(cpu, gfp);
781
if (!topa)
782
return -ENOMEM;
783
784
topa_insert_table(buf, topa);
785
}
786
787
if (topa->z_count == topa->last - 1) {
788
if (order == TOPA_ENTRY(topa, topa->last - 1)->size)
789
topa->z_count++;
790
}
791
792
TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
793
TOPA_ENTRY(topa, -1)->size = order;
794
if (!buf->snapshot &&
795
!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
796
TOPA_ENTRY(topa, -1)->intr = 1;
797
TOPA_ENTRY(topa, -1)->stop = 1;
798
}
799
800
topa->last++;
801
topa->size += sizes(order);
802
803
buf->nr_pages += 1ul << order;
804
805
return 0;
806
}
807
808
/**
809
* pt_topa_dump() - print ToPA tables and their entries
810
* @buf: PT buffer.
811
*/
812
static void pt_topa_dump(struct pt_buffer *buf)
813
{
814
struct topa *topa;
815
816
list_for_each_entry(topa, &buf->tables, list) {
817
struct topa_page *tp = topa_to_page(topa);
818
int i;
819
820
pr_debug("# table @%p, off %llx size %zx\n", tp->table,
821
topa->offset, topa->size);
822
for (i = 0; i < TENTS_PER_PAGE; i++) {
823
pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
824
&tp->table[i],
825
(unsigned long)tp->table[i].base << TOPA_SHIFT,
826
sizes(tp->table[i].size),
827
tp->table[i].end ? 'E' : ' ',
828
tp->table[i].intr ? 'I' : ' ',
829
tp->table[i].stop ? 'S' : ' ',
830
*(u64 *)&tp->table[i]);
831
if ((intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
832
tp->table[i].stop) ||
833
tp->table[i].end)
834
break;
835
if (!i && topa->z_count)
836
i += topa->z_count;
837
}
838
}
839
}
840
841
/**
842
* pt_buffer_advance() - advance to the next output region
843
* @buf: PT buffer.
844
*
845
* Advance the current pointers in the buffer to the next ToPA entry.
846
*/
847
static void pt_buffer_advance(struct pt_buffer *buf)
848
{
849
buf->output_off = 0;
850
buf->cur_idx++;
851
852
if (buf->cur_idx == buf->cur->last) {
853
if (buf->cur == buf->last) {
854
buf->cur = buf->first;
855
buf->wrapped = true;
856
} else {
857
buf->cur = list_entry(buf->cur->list.next, struct topa,
858
list);
859
}
860
buf->cur_idx = 0;
861
}
862
}
863
864
/**
865
* pt_update_head() - calculate current offsets and sizes
866
* @pt: Per-cpu pt context.
867
*
868
* Update buffer's current write pointer position and data size.
869
*/
870
static void pt_update_head(struct pt *pt)
871
{
872
struct pt_buffer *buf = perf_get_aux(&pt->handle);
873
bool wrapped = buf->wrapped;
874
u64 topa_idx, base, old;
875
876
buf->wrapped = false;
877
878
if (buf->single) {
879
local_set(&buf->data_size, buf->output_off);
880
return;
881
}
882
883
/* offset of the first region in this table from the beginning of buf */
884
base = buf->cur->offset + buf->output_off;
885
886
/* offset of the current output region within this table */
887
for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
888
base += TOPA_ENTRY_SIZE(buf->cur, topa_idx);
889
890
if (buf->snapshot) {
891
local_set(&buf->data_size, base);
892
} else {
893
old = (local64_xchg(&buf->head, base) &
894
((buf->nr_pages << PAGE_SHIFT) - 1));
895
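/*
 * If the new head is behind the old one, or equal to it after the buffer
 * wrapped, the hardware has gone around the buffer; add a full buffer's
 * worth so that base - old reflects the amount of newly written data.
 */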
if (base < old || (base == old && wrapped))
896
base += buf->nr_pages << PAGE_SHIFT;
897
898
local_add(base - old, &buf->data_size);
899
}
900
}
901
902
/**
903
* pt_buffer_region() - obtain current output region's address
904
* @buf: PT buffer.
905
*/
906
static void *pt_buffer_region(struct pt_buffer *buf)
907
{
908
return phys_to_virt((phys_addr_t)TOPA_ENTRY(buf->cur, buf->cur_idx)->base << TOPA_SHIFT);
909
}
910
911
/**
912
* pt_buffer_region_size() - obtain current output region's size
913
* @buf: PT buffer.
914
*/
915
static size_t pt_buffer_region_size(struct pt_buffer *buf)
916
{
917
return TOPA_ENTRY_SIZE(buf->cur, buf->cur_idx);
918
}
919
920
/**
921
* pt_handle_status() - take care of possible status conditions
922
* @pt: Per-cpu pt context.
923
*/
924
static void pt_handle_status(struct pt *pt)
925
{
926
struct pt_buffer *buf = perf_get_aux(&pt->handle);
927
int advance = 0;
928
u64 status;
929
930
rdmsrq(MSR_IA32_RTIT_STATUS, status);
931
932
if (status & RTIT_STATUS_ERROR) {
933
pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
934
pt_topa_dump(buf);
935
status &= ~RTIT_STATUS_ERROR;
936
}
937
938
if (status & RTIT_STATUS_STOPPED) {
939
status &= ~RTIT_STATUS_STOPPED;
940
941
/*
942
* On systems that only do single-entry ToPA, hitting STOP
943
* means we are already losing data; need to let the decoder
944
* know.
945
*/
946
if (!buf->single &&
947
(!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) ||
948
buf->output_off == pt_buffer_region_size(buf))) {
949
perf_aux_output_flag(&pt->handle,
950
PERF_AUX_FLAG_TRUNCATED);
951
advance++;
952
}
953
}
954
955
/*
956
* Also, on single-entry ToPA implementations, the interrupt will come
957
* before the output reaches its output region's boundary.
958
*/
959
if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
960
!buf->snapshot &&
961
pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
962
void *head = pt_buffer_region(buf);
963
964
/* everything within this margin needs to be zeroed out */
965
memset(head + buf->output_off, 0,
966
pt_buffer_region_size(buf) -
967
buf->output_off);
968
advance++;
969
}
970
971
if (advance)
972
pt_buffer_advance(buf);
973
974
wrmsrq(MSR_IA32_RTIT_STATUS, status);
975
}
976
977
/**
978
* pt_read_offset() - translate registers into buffer pointers
979
* @buf: PT buffer.
980
*
981
* Set buffer's output pointers from MSR values.
982
*/
983
static void pt_read_offset(struct pt_buffer *buf)
984
{
985
struct pt *pt = this_cpu_ptr(&pt_ctx);
986
struct topa_page *tp;
987
988
if (!buf->single) {
989
rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base);
990
tp = phys_to_virt(pt->output_base);
991
buf->cur = &tp->topa;
992
}
993
994
rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask);
995
/* offset within current output region */
996
buf->output_off = pt->output_mask >> 32;
997
/* index of current output region within this table */
998
if (!buf->single)
999
buf->cur_idx = (pt->output_mask & 0xffffff80) >> 7;
1000
}
1001
1002
static struct topa_entry *
1003
pt_topa_entry_for_page(struct pt_buffer *buf, unsigned int pg)
1004
{
1005
struct topa_page *tp;
1006
struct topa *topa;
1007
unsigned int idx, cur_pg = 0, z_pg = 0, start_idx = 0;
1008
1009
/*
1010
* Indicates a bug in the caller.
1011
*/
1012
if (WARN_ON_ONCE(pg >= buf->nr_pages))
1013
return NULL;
1014
1015
/*
1016
* First, find the ToPA table where @pg fits. With high
1017
* order allocations, there shouldn't be many of these.
1018
*/
1019
list_for_each_entry(topa, &buf->tables, list) {
1020
if (topa->offset + topa->size > (unsigned long)pg << PAGE_SHIFT)
1021
goto found;
1022
}
1023
1024
/*
1025
* Hitting this means we have a problem in the ToPA
1026
* allocation code.
1027
*/
1028
WARN_ON_ONCE(1);
1029
1030
return NULL;
1031
1032
found:
1033
/*
1034
* Indicates a problem in the ToPA allocation code.
1035
*/
1036
if (WARN_ON_ONCE(topa->last == -1))
1037
return NULL;
1038
1039
tp = topa_to_page(topa);
1040
cur_pg = PFN_DOWN(topa->offset);
1041
if (topa->z_count) {
1042
z_pg = TOPA_ENTRY_PAGES(topa, 0) * (topa->z_count + 1);
1043
start_idx = topa->z_count + 1;
1044
}
1045
1046
/*
1047
* Multiple entries at the beginning of the table have the same size,
1048
* ideally all of them; if @pg falls there, the search is done.
1049
*/
1050
if (pg >= cur_pg && pg < cur_pg + z_pg) {
1051
idx = (pg - cur_pg) / TOPA_ENTRY_PAGES(topa, 0);
1052
return &tp->table[idx];
1053
}
1054
1055
/*
1056
* Otherwise, slow path: iterate through the remaining entries.
1057
*/
1058
for (idx = start_idx, cur_pg += z_pg; idx < topa->last; idx++) {
1059
if (cur_pg + TOPA_ENTRY_PAGES(topa, idx) > pg)
1060
return &tp->table[idx];
1061
1062
cur_pg += TOPA_ENTRY_PAGES(topa, idx);
1063
}
1064
1065
/*
1066
* Means we couldn't find a ToPA entry in the table that does match.
1067
*/
1068
WARN_ON_ONCE(1);
1069
1070
return NULL;
1071
}
1072
1073
static struct topa_entry *
1074
pt_topa_prev_entry(struct pt_buffer *buf, struct topa_entry *te)
1075
{
1076
unsigned long table = (unsigned long)te & ~(PAGE_SIZE - 1);
1077
struct topa_page *tp;
1078
struct topa *topa;
1079
1080
tp = (struct topa_page *)table;
1081
if (tp->table != te)
1082
return --te;
1083
1084
topa = &tp->topa;
1085
if (topa == buf->first)
1086
topa = buf->last;
1087
else
1088
topa = list_prev_entry(topa, list);
1089
1090
tp = topa_to_page(topa);
1091
1092
return &tp->table[topa->last - 1];
1093
}
1094
1095
/**
1096
* pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
1097
* @buf: PT buffer.
1098
* @handle: Current output handle.
1099
*
1100
* Place INT and STOP marks to prevent overwriting old data that the consumer
1101
* hasn't yet collected, and to wake up the consumer after a certain fraction of
1102
* the buffer has filled up. Only needed and sensible for non-snapshot counters.
1103
*
1104
* This obviously relies on buf::head to figure out buffer markers, so it has
1105
* to be called after pt_buffer_reset_offsets() and before the hardware tracing
1106
* is enabled.
1107
*/
1108
static int pt_buffer_reset_markers(struct pt_buffer *buf,
1109
struct perf_output_handle *handle)
1110
1111
{
1112
unsigned long head = local64_read(&buf->head);
1113
unsigned long idx, npages, wakeup;
1114
1115
if (buf->single)
1116
return 0;
1117
1118
/* can't stop in the middle of an output region */
1119
if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) {
1120
perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
1121
return -EINVAL;
1122
}
1123
1124
1125
/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
1126
if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
1127
return 0;
1128
1129
/* clear STOP and INT from current entry */
1130
if (buf->stop_te) {
1131
buf->stop_te->stop = 0;
1132
buf->stop_te->intr = 0;
1133
}
1134
1135
if (buf->intr_te)
1136
buf->intr_te->intr = 0;
1137
1138
/* how many pages till the STOP marker */
1139
npages = handle->size >> PAGE_SHIFT;
1140
1141
/* if it's on a page boundary, fill up one more page */
1142
if (!offset_in_page(head + handle->size + 1))
1143
npages++;
1144
1145
idx = (head >> PAGE_SHIFT) + npages;
1146
idx &= buf->nr_pages - 1;
1147
1148
if (idx != buf->stop_pos) {
1149
buf->stop_pos = idx;
1150
buf->stop_te = pt_topa_entry_for_page(buf, idx);
1151
buf->stop_te = pt_topa_prev_entry(buf, buf->stop_te);
1152
}
1153
1154
wakeup = handle->wakeup >> PAGE_SHIFT;
1155
1156
/* in the worst case, wake up the consumer one page before hard stop */
1157
idx = (head >> PAGE_SHIFT) + npages - 1;
1158
if (idx > wakeup)
1159
idx = wakeup;
1160
1161
idx &= buf->nr_pages - 1;
1162
if (idx != buf->intr_pos) {
1163
buf->intr_pos = idx;
1164
buf->intr_te = pt_topa_entry_for_page(buf, idx);
1165
buf->intr_te = pt_topa_prev_entry(buf, buf->intr_te);
1166
}
1167
1168
buf->stop_te->stop = 1;
1169
buf->stop_te->intr = 1;
1170
buf->intr_te->intr = 1;
1171
1172
return 0;
1173
}
1174
1175
/**
1176
* pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
1177
* @buf: PT buffer.
1178
* @head: Write pointer (aux_head) from AUX buffer.
1179
*
1180
* Find the ToPA table and entry corresponding to given @head and set buffer's
1181
* "current" pointers accordingly. This is done after we have obtained the
1182
* current aux_head position from a successful call to perf_aux_output_begin()
1183
* to make sure the hardware is writing to the right place.
1184
*
1185
* This function modifies buf::{cur,cur_idx,output_off} that will be programmed
1186
* into PT msrs when the tracing is enabled and buf::head and buf::data_size,
1187
* which are used to determine INT and STOP markers' locations by a subsequent
1188
* call to pt_buffer_reset_markers().
1189
*/
1190
static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
1191
{
1192
struct topa_page *cur_tp;
1193
struct topa_entry *te;
1194
int pg;
1195
1196
if (buf->snapshot)
1197
head &= (buf->nr_pages << PAGE_SHIFT) - 1;
1198
1199
if (!buf->single) {
1200
pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
1201
te = pt_topa_entry_for_page(buf, pg);
1202
1203
cur_tp = topa_entry_to_page(te);
1204
buf->cur = &cur_tp->topa;
1205
buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
1206
buf->output_off = head & (pt_buffer_region_size(buf) - 1);
1207
} else {
1208
buf->output_off = head;
1209
}
1210
1211
local64_set(&buf->head, head);
1212
local_set(&buf->data_size, 0);
1213
}
1214
1215
/**
1216
* pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
1217
* @buf: PT buffer.
1218
*/
1219
static void pt_buffer_fini_topa(struct pt_buffer *buf)
1220
{
1221
struct topa *topa, *iter;
1222
1223
if (buf->single)
1224
return;
1225
1226
list_for_each_entry_safe(topa, iter, &buf->tables, list) {
1227
/*
1228
* right now, this is in free_aux() path only, so
1229
* no need to unlink this table from the list
1230
*/
1231
topa_free(topa);
1232
}
1233
}
1234
1235
/**
1236
* pt_buffer_init_topa() - initialize ToPA table for pt buffer
1237
* @buf: PT buffer.
1238
* @cpu: CPU on which to allocate.
1239
* @nr_pages: No. of pages to allocate.
1240
* @gfp: Allocation flags.
1241
*
1242
* Return: 0 on success or error code.
1243
*/
1244
static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu,
1245
unsigned long nr_pages, gfp_t gfp)
1246
{
1247
struct topa *topa;
1248
int err;
1249
1250
topa = topa_alloc(cpu, gfp);
1251
if (!topa)
1252
return -ENOMEM;
1253
1254
topa_insert_table(buf, topa);
1255
1256
while (buf->nr_pages < nr_pages) {
1257
err = topa_insert_pages(buf, cpu, gfp);
1258
if (err) {
1259
pt_buffer_fini_topa(buf);
1260
return -ENOMEM;
1261
}
1262
}
1263
1264
/* link last table to the first one, unless we're double buffering */
1265
if (intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
1266
TOPA_ENTRY(buf->last, -1)->base = topa_pfn(buf->first);
1267
TOPA_ENTRY(buf->last, -1)->end = 1;
1268
}
1269
1270
pt_topa_dump(buf);
1271
return 0;
1272
}
1273
1274
static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages)
1275
{
1276
struct page *p = virt_to_page(buf->data_pages[0]);
1277
int ret = -ENOTSUPP, order = 0;
1278
1279
/*
1280
* We can use single range output mode
1281
* + in snapshot mode, where we don't need interrupts;
1282
* + if the hardware supports it;
1283
* + if the entire buffer is one contiguous allocation.
1284
*/
1285
if (!buf->snapshot)
1286
goto out;
1287
1288
if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output))
1289
goto out;
1290
1291
if (PagePrivate(p))
1292
order = page_private(p);
1293
1294
if (1 << order != nr_pages)
1295
goto out;
1296
1297
/*
1298
* Some processors cannot always support single range for more than
1299
* 4KB - refer errata TGL052, ADL037 and RPL017. Future processors might
1300
* also be affected, so for now rather than trying to keep track of
1301
* which ones, just disable it for all.
1302
*/
1303
if (nr_pages > 1)
1304
goto out;
1305
1306
buf->single = true;
1307
buf->nr_pages = nr_pages;
1308
ret = 0;
1309
out:
1310
return ret;
1311
}
1312
1313
/**
1314
* pt_buffer_setup_aux() - set up topa tables for a PT buffer
1315
* @event: Performance event
1316
* @pages: Array of pointers to buffer pages passed from perf core.
1317
* @nr_pages: Number of pages in the buffer.
1318
* @snapshot: If this is a snapshot/overwrite counter.
1319
*
1320
* This is a pmu::setup_aux callback that sets up ToPA tables and all the
1321
* bookkeeping for an AUX buffer.
1322
*
1323
* Return: Our private PT buffer structure.
1324
*/
1325
static void *
1326
pt_buffer_setup_aux(struct perf_event *event, void **pages,
1327
int nr_pages, bool snapshot)
1328
{
1329
struct pt_buffer *buf;
1330
int node, ret, cpu = event->cpu;
1331
1332
if (!nr_pages)
1333
return NULL;
1334
1335
/*
1336
* Only support AUX sampling in snapshot mode, where we don't
1337
* generate NMIs.
1338
*/
1339
if (event->attr.aux_sample_size && !snapshot)
1340
return NULL;
1341
1342
if (cpu == -1)
1343
cpu = raw_smp_processor_id();
1344
node = cpu_to_node(cpu);
1345
1346
buf = kzalloc_node(sizeof(struct pt_buffer), GFP_KERNEL, node);
1347
if (!buf)
1348
return NULL;
1349
1350
buf->snapshot = snapshot;
1351
buf->data_pages = pages;
1352
buf->stop_pos = -1;
1353
buf->intr_pos = -1;
1354
1355
INIT_LIST_HEAD(&buf->tables);
1356
1357
ret = pt_buffer_try_single(buf, nr_pages);
1358
if (!ret)
1359
return buf;
1360
1361
ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL);
1362
if (ret) {
1363
kfree(buf);
1364
return NULL;
1365
}
1366
1367
return buf;
1368
}
1369
1370
/**
1371
* pt_buffer_free_aux() - perf AUX deallocation path callback
1372
* @data: PT buffer.
1373
*/
1374
static void pt_buffer_free_aux(void *data)
1375
{
1376
struct pt_buffer *buf = data;
1377
1378
pt_buffer_fini_topa(buf);
1379
kfree(buf);
1380
}
1381
1382
static int pt_addr_filters_init(struct perf_event *event)
1383
{
1384
struct pt_filters *filters;
1385
int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);
1386
1387
if (!intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
1388
return 0;
1389
1390
filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
1391
if (!filters)
1392
return -ENOMEM;
1393
1394
if (event->parent)
1395
memcpy(filters, event->parent->hw.addr_filters,
1396
sizeof(*filters));
1397
1398
event->hw.addr_filters = filters;
1399
1400
return 0;
1401
}
1402
1403
static void pt_addr_filters_fini(struct perf_event *event)
1404
{
1405
kfree(event->hw.addr_filters);
1406
event->hw.addr_filters = NULL;
1407
}
1408
1409
#ifdef CONFIG_X86_64
1410
/* Clamp to a canonical address greater-than-or-equal-to the address given */
1411
static u64 clamp_to_ge_canonical_addr(u64 vaddr, u8 vaddr_bits)
1412
{
1413
return __is_canonical_address(vaddr, vaddr_bits) ?
1414
vaddr :
1415
-BIT_ULL(vaddr_bits - 1);
1416
}
1417
1418
/* Clamp to a canonical address less-than-or-equal-to the address given */
1419
static u64 clamp_to_le_canonical_addr(u64 vaddr, u8 vaddr_bits)
1420
{
1421
return __is_canonical_address(vaddr, vaddr_bits) ?
1422
vaddr :
1423
BIT_ULL(vaddr_bits - 1) - 1;
1424
}
1425
#else
1426
#define clamp_to_ge_canonical_addr(x, y) (x)
1427
#define clamp_to_le_canonical_addr(x, y) (x)
1428
#endif
1429
1430
static int pt_event_addr_filters_validate(struct list_head *filters)
1431
{
1432
struct perf_addr_filter *filter;
1433
int range = 0;
1434
1435
list_for_each_entry(filter, filters, entry) {
1436
/*
1437
* PT doesn't support single address triggers and
1438
* 'start' filters.
1439
*/
1440
if (!filter->size ||
1441
filter->action == PERF_ADDR_FILTER_ACTION_START)
1442
return -EOPNOTSUPP;
1443
1444
if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
1445
return -EOPNOTSUPP;
1446
}
1447
1448
return 0;
1449
}
1450
1451
static void pt_event_addr_filters_sync(struct perf_event *event)
1452
{
1453
struct perf_addr_filters_head *head = perf_event_addr_filters(event);
1454
unsigned long msr_a, msr_b;
1455
struct perf_addr_filter_range *fr = event->addr_filter_ranges;
1456
struct pt_filters *filters = event->hw.addr_filters;
1457
struct perf_addr_filter *filter;
1458
int range = 0;
1459
1460
if (!filters)
1461
return;
1462
1463
list_for_each_entry(filter, &head->list, entry) {
1464
if (filter->path.dentry && !fr[range].start) {
1465
msr_a = msr_b = 0;
1466
} else {
1467
unsigned long n = fr[range].size - 1;
1468
unsigned long a = fr[range].start;
1469
unsigned long b;
1470
1471
if (a > ULONG_MAX - n)
1472
b = ULONG_MAX;
1473
else
1474
b = a + n;
1475
/*
1476
* Apply the offset. 64-bit addresses written to the
1477
* MSRs must be canonical, but the range can encompass
1478
* non-canonical addresses. Since software cannot
1479
* execute at non-canonical addresses, adjusting to
1480
* canonical addresses does not affect the result of the
1481
* address filter.
1482
*/
1483
msr_a = clamp_to_ge_canonical_addr(a, boot_cpu_data.x86_virt_bits);
1484
msr_b = clamp_to_le_canonical_addr(b, boot_cpu_data.x86_virt_bits);
1485
if (msr_b < msr_a)
1486
msr_a = msr_b = 0;
1487
}
1488
1489
filters->filter[range].msr_a = msr_a;
1490
filters->filter[range].msr_b = msr_b;
1491
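/*
 * ADDRn_CFG encoding written into RTIT_CTL by pt_config_filters(): 1 means
 * the range is a FilterEn range (trace only inside it), 2 means TraceStop
 * (stop tracing once it is hit); 0 would leave the range unused.
 */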
if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER)
1492
filters->filter[range].config = 1;
1493
else
1494
filters->filter[range].config = 2;
1495
range++;
1496
}
1497
1498
filters->nr_filters = range;
1499
}
1500
1501
/**
1502
* intel_pt_interrupt() - PT PMI handler
1503
*/
1504
void intel_pt_interrupt(void)
1505
{
1506
struct pt *pt = this_cpu_ptr(&pt_ctx);
1507
struct pt_buffer *buf;
1508
struct perf_event *event = pt->handle.event;
1509
1510
/*
1511
* There may be a dangling PT bit in the interrupt status register
1512
* after PT has been disabled by pt_event_stop(). Make sure we don't
1513
* do anything (particularly, re-enable) for this event here.
1514
*/
1515
if (!READ_ONCE(pt->handle_nmi))
1516
return;
1517
1518
if (!event)
1519
return;
1520
1521
pt_config_stop(event);
1522
1523
buf = perf_get_aux(&pt->handle);
1524
if (!buf)
1525
return;
1526
1527
pt_read_offset(buf);
1528
1529
pt_handle_status(pt);
1530
1531
pt_update_head(pt);
1532
1533
perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
1534
1535
if (!event->hw.state) {
1536
int ret;
1537
1538
buf = perf_aux_output_begin(&pt->handle, event);
1539
if (!buf) {
1540
event->hw.state = PERF_HES_STOPPED;
1541
WRITE_ONCE(pt->resume_allowed, 0);
1542
return;
1543
}
1544
1545
pt_buffer_reset_offsets(buf, pt->handle.head);
1546
/* snapshot counters don't use PMI, so it's safe */
1547
ret = pt_buffer_reset_markers(buf, &pt->handle);
1548
if (ret) {
1549
perf_aux_output_end(&pt->handle, 0);
1550
WRITE_ONCE(pt->resume_allowed, 0);
1551
return;
1552
}
1553
1554
pt_config_buffer(buf);
1555
pt_config_start(event);
1556
}
1557
}
1558
1559
void intel_pt_handle_vmx(int on)
1560
{
1561
struct pt *pt = this_cpu_ptr(&pt_ctx);
1562
struct perf_event *event;
1563
unsigned long flags;
1564
1565
/* PT plays nice with VMX, do nothing */
1566
if (pt_pmu.vmx)
1567
return;
1568
1569
/*
1570
* VMXON will clear RTIT_CTL.TraceEn; we need to make
1571
* sure to not try to set it while VMX is on. Disable
1572
* interrupts to avoid racing with pmu callbacks;
1573
* concurrent PMI should be handled fine.
1574
*/
1575
local_irq_save(flags);
1576
WRITE_ONCE(pt->vmx_on, on);
1577
1578
/*
1579
* If an AUX transaction is in progress, it will contain
1580
* gap(s), so flag it PARTIAL to inform the user.
1581
*/
1582
event = pt->handle.event;
1583
if (event)
1584
perf_aux_output_flag(&pt->handle,
1585
PERF_AUX_FLAG_PARTIAL);
1586
1587
/* Turn PTs back on */
1588
if (!on && event)
1589
wrmsrq(MSR_IA32_RTIT_CTL, event->hw.aux_config);
1590
1591
local_irq_restore(flags);
1592
}
1593
EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
1594
1595
/*
1596
* PMU callbacks
1597
*/
1598
1599
static void pt_event_start(struct perf_event *event, int mode)
1600
{
1601
struct hw_perf_event *hwc = &event->hw;
1602
struct pt *pt = this_cpu_ptr(&pt_ctx);
1603
struct pt_buffer *buf;
1604
1605
if (mode & PERF_EF_RESUME) {
1606
if (READ_ONCE(pt->resume_allowed)) {
1607
u64 status;
1608
1609
/*
1610
* Only if the trace is not active and the error and
1611
* stopped bits are clear, is it safe to start, but a
1612
* PMI might have just cleared these, so resume_allowed
1613
* must be checked again also.
1614
*/
1615
rdmsrq(MSR_IA32_RTIT_STATUS, status);
1616
if (!(status & (RTIT_STATUS_TRIGGEREN |
1617
RTIT_STATUS_ERROR |
1618
RTIT_STATUS_STOPPED)) &&
1619
READ_ONCE(pt->resume_allowed))
1620
pt_config_start(event);
1621
}
1622
return;
1623
}
1624
1625
buf = perf_aux_output_begin(&pt->handle, event);
1626
if (!buf)
1627
goto fail_stop;
1628
1629
pt_buffer_reset_offsets(buf, pt->handle.head);
1630
if (!buf->snapshot) {
1631
if (pt_buffer_reset_markers(buf, &pt->handle))
1632
goto fail_end_stop;
1633
}
1634
1635
hwc->state = 0;
1636
1637
pt_config_buffer(buf);
1638
pt_config(event);
1639
1640
return;
1641
1642
fail_end_stop:
1643
perf_aux_output_end(&pt->handle, 0);
1644
fail_stop:
1645
hwc->state = PERF_HES_STOPPED;
1646
}
1647
1648
static void pt_event_stop(struct perf_event *event, int mode)
1649
{
1650
struct pt *pt = this_cpu_ptr(&pt_ctx);
1651
1652
if (mode & PERF_EF_PAUSE) {
1653
if (READ_ONCE(pt->pause_allowed))
1654
pt_config_stop(event);
1655
return;
1656
}
1657
1658
/*
1659
* Protect against the PMI racing with disabling wrmsr,
1660
* see comment in intel_pt_interrupt().
1661
*/
1662
WRITE_ONCE(pt->handle_nmi, 0);
1663
barrier();
1664
1665
/*
1666
* Prevent a resume from attempting to restart tracing, or a pause
1667
* during a subsequent start. Do this after clearing handle_nmi so that
1668
* pt_event_snapshot_aux() will not re-allow them.
1669
*/
1670
WRITE_ONCE(pt->pause_allowed, 0);
1671
WRITE_ONCE(pt->resume_allowed, 0);
1672
barrier();
1673
1674
pt_config_stop(event);
1675
1676
if (event->hw.state == PERF_HES_STOPPED)
1677
return;
1678
1679
event->hw.state = PERF_HES_STOPPED;
1680
1681
if (mode & PERF_EF_UPDATE) {
1682
struct pt_buffer *buf = perf_get_aux(&pt->handle);
1683
1684
if (!buf)
1685
return;
1686
1687
if (WARN_ON_ONCE(pt->handle.event != event))
1688
return;
1689
1690
pt_read_offset(buf);
1691
1692
pt_handle_status(pt);
1693
1694
pt_update_head(pt);
1695
1696
if (buf->snapshot)
1697
pt->handle.head =
1698
local_xchg(&buf->data_size,
1699
buf->nr_pages << PAGE_SHIFT);
1700
perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
1701
}
1702
}
1703
1704
static long pt_event_snapshot_aux(struct perf_event *event,
1705
struct perf_output_handle *handle,
1706
unsigned long size)
1707
{
1708
struct pt *pt = this_cpu_ptr(&pt_ctx);
1709
struct pt_buffer *buf = perf_get_aux(&pt->handle);
1710
unsigned long from = 0, to;
1711
long ret;
1712
1713
if (WARN_ON_ONCE(!buf))
1714
return 0;
1715
1716
/*
1717
* Sampling is only allowed on snapshot events;
1718
* see pt_buffer_setup_aux().
1719
*/
1720
if (WARN_ON_ONCE(!buf->snapshot))
1721
return 0;
1722
1723
/* Prevent pause/resume from attempting to start/stop tracing */
1724
WRITE_ONCE(pt->pause_allowed, 0);
1725
WRITE_ONCE(pt->resume_allowed, 0);
1726
barrier();
1727
/*
1728
* There is no PT interrupt in this mode, so stop the trace and it will
1729
* remain stopped while the buffer is copied.
1730
*/
1731
pt_config_stop(event);
1732
pt_read_offset(buf);
1733
pt_update_head(pt);
1734
1735
to = local_read(&buf->data_size);
1736
if (to < size)
1737
from = buf->nr_pages << PAGE_SHIFT;
1738
from += to - size;
1739
1740
ret = perf_output_copy_aux(&pt->handle, handle, from, to);
1741
1742
/*
1743
* Here, handle_nmi tells us if the tracing was on.
1744
* If the tracing was on, restart it.
1745
*/
1746
if (READ_ONCE(pt->handle_nmi)) {
1747
WRITE_ONCE(pt->resume_allowed, 1);
1748
barrier();
1749
pt_config_start(event);
1750
barrier();
1751
WRITE_ONCE(pt->pause_allowed, 1);
1752
}
1753
1754
return ret;
1755
}
1756
1757
static void pt_event_del(struct perf_event *event, int mode)
1758
{
1759
pt_event_stop(event, PERF_EF_UPDATE);
1760
}
1761
1762
static int pt_event_add(struct perf_event *event, int mode)
1763
{
1764
struct pt *pt = this_cpu_ptr(&pt_ctx);
1765
struct hw_perf_event *hwc = &event->hw;
1766
int ret = -EBUSY;
1767
1768
if (pt->handle.event)
1769
goto fail;
1770
1771
if (mode & PERF_EF_START) {
1772
pt_event_start(event, 0);
1773
ret = -EINVAL;
1774
if (hwc->state == PERF_HES_STOPPED)
1775
goto fail;
1776
} else {
1777
hwc->state = PERF_HES_STOPPED;
1778
}
1779
1780
ret = 0;
1781
fail:
1782
1783
return ret;
1784
}
1785
1786
static void pt_event_read(struct perf_event *event)
1787
{
1788
}
1789
1790
static void pt_event_destroy(struct perf_event *event)
1791
{
1792
pt_addr_filters_fini(event);
1793
x86_del_exclusive(x86_lbr_exclusive_pt);
1794
}
1795
1796
static int pt_event_init(struct perf_event *event)
1797
{
1798
if (event->attr.type != pt_pmu.pmu.type)
1799
return -ENOENT;
1800
1801
if (!pt_event_valid(event))
1802
return -EINVAL;
1803
1804
if (x86_add_exclusive(x86_lbr_exclusive_pt))
1805
return -EBUSY;
1806
1807
if (pt_addr_filters_init(event)) {
1808
x86_del_exclusive(x86_lbr_exclusive_pt);
1809
return -ENOMEM;
1810
}
1811
1812
event->destroy = pt_event_destroy;
1813
1814
return 0;
1815
}
1816
1817
void cpu_emergency_stop_pt(void)
1818
{
1819
struct pt *pt = this_cpu_ptr(&pt_ctx);
1820
1821
if (pt->handle.event)
1822
pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
1823
}
1824
1825
int is_intel_pt_event(struct perf_event *event)
1826
{
1827
return event->pmu == &pt_pmu.pmu;
1828
}
1829
1830
static __init int pt_init(void)
1831
{
1832
int ret, cpu, prior_warn = 0;
1833
1834
BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
1835
1836
if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
1837
return -ENODEV;
1838
1839
cpus_read_lock();
1840
for_each_online_cpu(cpu) {
1841
u64 ctl;
1842
1843
ret = rdmsrq_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
1844
if (!ret && (ctl & RTIT_CTL_TRACEEN))
1845
prior_warn++;
1846
}
1847
cpus_read_unlock();
1848
1849
if (prior_warn) {
1850
x86_add_exclusive(x86_lbr_exclusive_pt);
1851
pr_warn("PT is enabled at boot time, doing nothing\n");
1852
1853
return -EBUSY;
1854
}
1855
1856
ret = pt_pmu_hw_init();
1857
if (ret)
1858
return ret;
1859
1860
if (!intel_pt_validate_hw_cap(PT_CAP_topa_output)) {
1861
pr_warn("ToPA output is not supported on this CPU\n");
1862
return -ENODEV;
1863
}
1864
1865
if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
1866
pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG;
1867
else
1868
pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_PREFER_LARGE;
1869
1870
pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE |
1871
PERF_PMU_CAP_ITRACE |
1872
PERF_PMU_CAP_AUX_PAUSE;
1873
pt_pmu.pmu.attr_groups = pt_attr_groups;
1874
pt_pmu.pmu.task_ctx_nr = perf_sw_context;
1875
pt_pmu.pmu.event_init = pt_event_init;
1876
pt_pmu.pmu.add = pt_event_add;
1877
pt_pmu.pmu.del = pt_event_del;
1878
pt_pmu.pmu.start = pt_event_start;
1879
pt_pmu.pmu.stop = pt_event_stop;
1880
pt_pmu.pmu.snapshot_aux = pt_event_snapshot_aux;
1881
pt_pmu.pmu.read = pt_event_read;
1882
pt_pmu.pmu.setup_aux = pt_buffer_setup_aux;
1883
pt_pmu.pmu.free_aux = pt_buffer_free_aux;
1884
pt_pmu.pmu.addr_filters_sync = pt_event_addr_filters_sync;
1885
pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
1886
pt_pmu.pmu.nr_addr_filters =
1887
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges);
1888
1889
ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
1890
1891
return ret;
1892
}
1893
arch_initcall(pt_init);
1894