GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/amd64/pt/pt.c
/*
 * Copyright (c) 2025 Bojan Novković <[email protected]>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

/*
 * hwt(4) Intel Processor Trace (PT) backend
 *
 * Driver Design Overview
 *
 * - Since PT is configured on a per-core basis, the driver uses
 *   'smp_rendezvous' to start and disable tracing on each target core.
 * - PT-specific resources are stored in a 'struct pt_ctx' context structure
 *   for each traced CPU core or thread. Upon initialization, a ToPA
 *   configuration is generated for each 'pt_ctx' structure using the HWT
 *   tracing buffers. The HWT tracing buffer is split into 4K ToPA entries.
 *   Currently, each 4K ToPA entry is configured to trigger an interrupt after
 *   it is filled.
 * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
 *   relevant PT registers. Every time a traced thread is switched out or in,
 *   its state is saved to or loaded from its corresponding 'pt_ctx' context.
 * - When tracing starts, the PT hardware writes data into the tracing buffer.
 *   When a TOPA_INT entry is filled, it triggers an interrupt before
 *   continuing. The interrupt handler then fetches the last valid tracing
 *   buffer offset and enqueues a HWT_RECORD_BUFFER record. The driver is
 *   currently configured to use the NMI interrupt line.
 * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
 *   and uses the offsets to decode data from the tracing buffer.
 *
 * Future improvements and limitations
 *
 * - We currently configure the PT hardware to trigger an interrupt whenever
 *   a 4K ToPA entry is filled. While this is fine when tracing smaller
 *   functions or infrequent code paths, it generates too much interrupt
 *   traffic when tracing hotter functions. A proper solution for this issue
 *   should estimate the amount of data generated by the current configuration
 *   and use it to determine interrupt frequency.
 *
 * - Support for more tracing options and PT features.
 */
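
/*
 * Worked example (illustrative numbers): a 2 MB HWT trace buffer is split by
 * pt_topa_prepare() into 512 4K ToPA entries plus one terminating TOPA_END
 * entry that points back at the table. If the ToPA PMI fires when the trace
 * buffer offset is 0x5a30, pt_fill_buffer_record() reports curpage = 5 and
 * offset = 0xa30 to userspace.
 */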

#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/hwt.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_page.h>

#include <machine/atomic.h>
#include <machine/cpufunc.h>
#include <machine/fpu.h>
#include <machine/smp.h>
#include <machine/specialreg.h>

#include <x86/apicvar.h>
#include <x86/x86_var.h>

#include <dev/hwt/hwt_context.h>
#include <dev/hwt/hwt_vm.h>
#include <dev/hwt/hwt_backend.h>
#include <dev/hwt/hwt_config.h>
#include <dev/hwt/hwt_cpu.h>
#include <dev/hwt/hwt_record.h>
#include <dev/hwt/hwt_thread.h>

#include <amd64/pt/pt.h>

#ifdef PT_DEBUG
#define dprintf(fmt, ...)	printf(fmt, ##__VA_ARGS__)
#else
#define dprintf(fmt, ...)
#endif

#define PT_SUPPORTED_FLAGS						\
	(RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT |	\
	    RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
#define PT_XSAVE_MASK		(XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
#define PT_XSTATE_BV		(PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
#define PT_MAX_IP_RANGES	2
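
/*
 * Masks used to decode IA32_RTIT_OUTPUT_MASK_PTRS in pt_update_buffer():
 * bits 6:0 are the lower mask, bits 31:7 hold the index of the current ToPA
 * entry (each entry maps one 4K page in this driver), and bits 63:32 hold
 * the offset into the current output region.
 */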
#define PT_TOPA_MASK_PTRS	0x7f
#define PT_TOPA_PAGE_MASK	0xffffff80
#define PT_TOPA_PAGE_SHIFT	7

#define CPUID_PT_LEAF		0x14

MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");

static void pt_send_buffer_record(void *arg);
static int pt_topa_intr(struct trapframe *tf);

/*
 * Intel Processor Trace XSAVE-managed state.
 */
struct pt_ext_area {
	uint64_t rtit_ctl;
	uint64_t rtit_output_base;
	uint64_t rtit_output_mask_ptrs;
	uint64_t rtit_status;
	uint64_t rtit_cr3_match;
	uint64_t rtit_addr0_a;
	uint64_t rtit_addr0_b;
	uint64_t rtit_addr1_a;
	uint64_t rtit_addr1_b;
};

struct pt_buffer {
	uint64_t *topa_hw;	/* ToPA table entries. */
	size_t size;
	struct mtx lock;	/* Lock for fields below. */
	vm_offset_t offset;
};

struct pt_ctx {
	int id;
	struct pt_buffer buf;	/* ToPA buffer metadata */
	struct hwt_context *hwt_ctx;
	uint8_t *save_area;	/* PT XSAVE area */
};

/* PT tracing contexts used for CPU mode. */
static struct pt_ctx *pt_pcpu_ctx;

enum pt_cpu_state { PT_INACTIVE = 0, PT_ACTIVE };

static struct pt_cpu {
	struct pt_ctx *ctx;	/* active PT tracing context */
	enum pt_cpu_state state; /* used as part of trace stop protocol */
	void *swi_cookie;	/* Software interrupt handler context */
	int in_pcint_handler;
} *pt_pcpu;

/*
 * PT-related CPUID bits.
 */
static struct pt_cpu_info {
	uint32_t l0_eax;
	uint32_t l0_ebx;
	uint32_t l0_ecx;
	uint32_t l1_eax;
	uint32_t l1_ebx;
	size_t xsave_area_size;
	size_t xstate_hdr_offset;
	size_t pt_xsave_offset;
} pt_info __read_mostly;

static bool initialized = false;
static int cpu_mode_ctr = 0;

static __inline enum pt_cpu_state
pt_cpu_get_state(int cpu_id)
{
	return (atomic_load_int(&pt_pcpu[cpu_id].state));
}

static __inline void
pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
{
	atomic_store_int(&pt_pcpu[cpu_id].state, state);
}

static __inline struct xstate_hdr *
pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
{
	return ((struct xstate_hdr *)(ctx->save_area +
	    pt_info.xstate_hdr_offset));
}

static __inline struct pt_ext_area *
pt_ctx_get_ext_area(struct pt_ctx *ctx)
{
	return ((struct pt_ext_area *)(ctx->save_area +
	    pt_info.pt_xsave_offset));
}

/*
 * Updates the current trace buffer offset from the ToPA MSRs.
 */
static __inline void
pt_update_buffer(struct pt_buffer *buf)
{
	uint64_t reg;
	uint64_t offset;

	/* Update buffer offset. */
	reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
	offset = ((reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT) * PAGE_SIZE;
	offset += (reg >> 32);

	atomic_store_rel_64(&buf->offset, offset);
}

static __inline void
pt_fill_buffer_record(int id, struct pt_buffer *buf,
    struct hwt_record_entry *rec)
{
	vm_offset_t offset;

	offset = atomic_load_acq_64(&buf->offset);

	rec->record_type = HWT_RECORD_BUFFER;
	rec->buf_id = id;
	rec->curpage = offset / PAGE_SIZE;
	rec->offset = offset & PAGE_MASK;
}

/*
 * Enables or disables tracing on curcpu
 * using the XSAVE/XRSTOR PT extensions.
 */
static void
pt_cpu_toggle_local(uint8_t *save_area, bool enable)
{
	u_long xcr0, cr0;
	u_long xss;

	cr0 = rcr0();
	if (cr0 & CR0_TS)
		clts();
	xcr0 = rxcr(XCR0);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
	xss = rdmsr(MSR_IA32_XSS);
	wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);
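
	/*
	 * PT state is a supervisor state component: it can only be saved and
	 * restored via XSAVES/XRSTORS with the PT bit set in IA32_XSS. The
	 * previous XSS, XCR0, and CR0.TS values are restored afterwards.
	 */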
	if (!enable) {
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
		    ("%s: PT is disabled", __func__));
		xsaves(save_area, XFEATURE_ENABLED_PT);
	} else {
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
		    ("%s: PT is enabled", __func__));
		xrstors(save_area, XFEATURE_ENABLED_PT);
	}
	wrmsr(MSR_IA32_XSS, xss);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0);
	if (cr0 & CR0_TS)
		load_cr0(cr0);
}

/*
 * Starts PT tracing on 'curcpu'.
 */
static void
pt_cpu_start(void *dummy)
{
	struct pt_cpu *cpu;

	cpu = &pt_pcpu[curcpu];
	MPASS(cpu->ctx != NULL);

	dprintf("%s: curcpu %d\n", __func__, curcpu);
	pt_cpu_set_state(curcpu, PT_ACTIVE);
	load_cr4(rcr4() | CR4_XSAVE);
	wrmsr(MSR_IA32_RTIT_STATUS, 0);
	pt_cpu_toggle_local(cpu->ctx->save_area, true);
}

/*
 * Stops PT tracing on 'curcpu'.
 * Updates trace buffer offset to ensure any data generated between the
 * last interrupt and the trace stop gets picked up by userspace.
 */
static void
pt_cpu_stop(void *dummy)
{
	struct pt_cpu *cpu;
	struct pt_ctx *ctx;

	cpu = &pt_pcpu[curcpu];
	ctx = cpu->ctx;

	dprintf("%s: curcpu %d\n", __func__, curcpu);
	/* Shutdown may occur before PT gets properly configured. */
	if (ctx == NULL) {
		dprintf("%s: missing context on cpu %d; bailing\n", __func__,
		    curcpu);
		return;
	}
	pt_cpu_toggle_local(cpu->ctx->save_area, false);
	pt_update_buffer(&ctx->buf);
}

/*
 * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
 * The HWT trace buffer is split into 4K ToPA table entries and used
 * as a circular buffer, meaning that the last ToPA entry points to
 * the first ToPA entry. Each entry is configured to raise an
 * interrupt after being filled.
 */
static int
pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
{
	struct pt_buffer *buf;
	size_t topa_size;
	int i;

	topa_size = TOPA_SIZE_4K;
	buf = &ctx->buf;

	KASSERT(buf->topa_hw == NULL,
	    ("%s: ToPA info already exists", __func__));
	buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
	    M_ZERO | M_WAITOK);
	dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
	buf->size = vm->npages * PAGE_SIZE;
	for (i = 0; i < vm->npages; i++) {
		buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
		/*
		 * XXX: TOPA_INT should ideally be set according to
		 * expected amount of incoming trace data. Too few TOPA_INT
		 * entries will not trigger interrupts often enough when
		 * tracing smaller functions.
		 */
		buf->topa_hw[i] |= TOPA_INT;
	}
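	/*
	 * The last entry points back at the table itself, making the trace
	 * buffer circular.
	 */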
	buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;

	return (0);
}

/*
 * Configures IP filtering for trace generation.
 * A maximum of 2 ranges can be specified due to
 * limitations imposed by the XSAVE/XRSTOR PT extensions.
 */
static int
pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
{
	struct pt_ext_area *pt_ext;
	int nranges_supp, n, error = 0;

	pt_ext = pt_ctx_get_ext_area(ctx);
	if (pt_info.l0_ebx & CPUPT_IPF) {
		nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
		    CPUPT_NADDR_S;

		if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
			nranges_supp = PT_IP_FILTER_MAX_RANGES;
		n = cfg->nranges;
		if (n > nranges_supp) {
			printf("%s: %d IP filtering ranges requested, CPU "
			    "supports %d, truncating\n",
			    __func__, n, nranges_supp);
			n = nranges_supp;
		}

		switch (n) {
		case 2:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
			pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
			pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
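			/* FALLTHROUGH */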
		case 1:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
			pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
			pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
			break;
		default:
			error = (EINVAL);
			break;
		}
	} else
		error = (ENXIO);

	return (error);
}

static int
pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
{

	dprintf("%s: ctx id %d\n", __func__, ctx_id);

	KASSERT(pt_ctx->buf.topa_hw == NULL,
	    ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));

	memset(pt_ctx, 0, sizeof(struct pt_ctx));
	mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
	pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
	    M_PT, M_NOWAIT | M_ZERO);
	if (pt_ctx->save_area == NULL)
		return (ENOMEM);
	dprintf("%s: preparing ToPA buffer\n", __func__);
	if (pt_topa_prepare(pt_ctx, vm) != 0) {
		free(pt_ctx->save_area, M_PT);
		return (ENOMEM);
	}

	pt_ctx->id = ctx_id;

	return (0);
}

static void
pt_deinit_ctx(struct pt_ctx *pt_ctx)
{

	if (pt_ctx->buf.topa_hw != NULL)
		free(pt_ctx->buf.topa_hw, M_PT);
	if (pt_ctx->save_area != NULL)
		free(pt_ctx->save_area, M_PT);
	memset(pt_ctx, 0, sizeof(*pt_ctx));
}

/*
 * HWT backend configuration method.
 *
 * Checks and translates the user-defined configuration to a
 * set of PT tracing features. Uses the feature set to initialize
 * the tracing context for the target CPU or thread.
 */
static int
pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
{
	struct hwt_cpu *hwt_cpu;
	struct hwt_thread *thr;
	struct pt_ctx *pt_ctx;
	struct pt_cpu_config *cfg;
	struct pt_ext_area *pt_ext;
	struct xstate_hdr *hdr;
	int error;

	dprintf("%s\n", __func__);

	cfg = (struct pt_cpu_config *)ctx->config;
	pt_ctx = NULL;

	/* Clear any flags we don't support yet. */
	cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
	if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
		if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
			printf("%s: CPU does not support generating MTC "
			    "packets\n", __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
		if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
			printf("%s: CPU does not support CR3 filtering\n",
			    __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
		if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
			printf("%s: CPU does not support TNT\n", __func__);
			return (ENXIO);
		}
	}
	/* TODO: support for more config bits. */

	if (ctx->mode == HWT_MODE_CPU) {
		TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
			if (hwt_cpu->cpu_id != cpu_id)
				continue;
			pt_ctx = &pt_pcpu_ctx[cpu_id];
			break;
		}
	} else {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			if (thr->thread_id != thread_id)
				continue;
			KASSERT(thr->private != NULL,
			    ("%s: hwt thread private"
			    " not set, thr %p",
			    __func__, thr));
			pt_ctx = (struct pt_ctx *)thr->private;
			break;
		}
	}
	if (pt_ctx == NULL)
		return (ENOENT);

	dprintf("%s: preparing MSRs\n", __func__);
	pt_ext = pt_ctx_get_ext_area(pt_ctx);
	hdr = pt_ctx_get_xstate_hdr(pt_ctx);

	pt_ext->rtit_ctl |= cfg->rtit_ctl;
	if (cfg->nranges != 0) {
		dprintf("%s: preparing IPF ranges\n", __func__);
		if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
			return (error);
	}
	pt_ctx->hwt_ctx = ctx;
	pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
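	/*
	 * Point the ToPA output MSRs at the prepared table. PT_TOPA_MASK_PTRS
	 * sets only the lower mask bits, so tracing starts at ToPA entry 0
	 * with an output offset of 0.
	 */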
	pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
	pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
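	/*
	 * The save area uses the compacted XSAVE format expected by
	 * XSAVES/XRSTORS; mark PT as the only enabled state component.
	 */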
	hdr->xstate_bv = XFEATURE_ENABLED_PT;
	hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
	    XSTATE_XCOMP_BV_COMPACT;
	pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
	pt_pcpu[cpu_id].ctx = pt_ctx;

	return (0);
}

/*
 * hwt backend trace start operation. CPU affine.
 */
static void
pt_backend_enable(struct hwt_context *ctx, int cpu_id)
{
	if (ctx->mode == HWT_MODE_CPU)
		return;

	KASSERT(curcpu == cpu_id,
	    ("%s: attempting to start PT on another cpu", __func__));
	pt_cpu_start(NULL);
	CPU_SET(cpu_id, &ctx->cpu_map);
}

/*
 * hwt backend trace stop operation. CPU affine.
 */
static void
pt_backend_disable(struct hwt_context *ctx, int cpu_id)
{
	struct pt_cpu *cpu;

	if (ctx->mode == HWT_MODE_CPU)
		return;
	KASSERT(curcpu == cpu_id,
	    ("%s: attempting to disable PT on another cpu", __func__));

	cpu = &pt_pcpu[cpu_id];

	dprintf("%s: waiting for cpu %d to exit interrupt handler\n", __func__,
	    cpu_id);
	pt_cpu_set_state(cpu_id, PT_INACTIVE);
	while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0))
		;

	pt_cpu_stop(NULL);
	CPU_CLR(cpu_id, &ctx->cpu_map);
	cpu->ctx = NULL;
}

/*
 * hwt backend trace start operation for remote CPUs.
 */
static int
pt_backend_enable_smp(struct hwt_context *ctx)
{
	dprintf("%s\n", __func__);

	KASSERT(ctx->mode == HWT_MODE_CPU,
	    ("%s: should only be used for CPU mode", __func__));
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 1) != 0)
		return (-1);

	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);

	return (0);
}

/*
 * hwt backend trace stop operation for remote CPUs.
 */
static int
pt_backend_disable_smp(struct hwt_context *ctx)
{
	struct pt_cpu *cpu;
	int cpu_id;

	dprintf("%s\n", __func__);
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 0) == 0)
		return (-1);

	if (CPU_EMPTY(&ctx->cpu_map)) {
		dprintf("%s: empty cpu map\n", __func__);
		return (-1);
	}
	CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
		cpu = &pt_pcpu[cpu_id];
		dprintf("%s: waiting for cpu %d to exit interrupt handler\n",
		    __func__, cpu_id);
		pt_cpu_set_state(cpu_id, PT_INACTIVE);
		while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0))
			;
	}
	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);

	return (0);
}

/*
 * HWT backend initialization method.
 *
 * Initializes the tracing contexts used for HWT_MODE_CPU.
 */
static int
pt_backend_init(struct hwt_context *ctx)
{
	struct hwt_cpu *hwt_cpu;
	int error;

	dprintf("%s\n", __func__);
	if (ctx->mode != HWT_MODE_CPU)
		return (0);
	TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
		error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], hwt_cpu->vm,
		    hwt_cpu->cpu_id);
		if (error)
			return (error);
	}

	return (0);
}

/*
 * HWT backend teardown method.
 *
 * Stops tracing on all active CPUs and releases all previously
 * allocated ToPA metadata.
 */
static int
pt_backend_deinit(struct hwt_context *ctx)
{
	struct pt_ctx *pt_ctx;
	struct hwt_thread *thr;
	int cpu_id;

	dprintf("%s\n", __func__);

	pt_backend_disable_smp(ctx);
	if (ctx->mode == HWT_MODE_THREAD) {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			KASSERT(thr->private != NULL,
			    ("%s: thr->private not set", __func__));
			pt_ctx = (struct pt_ctx *)thr->private;
			pt_deinit_ctx(pt_ctx);
		}
	} else {
		CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
			if (pt_pcpu[cpu_id].ctx == NULL)
				continue;
			KASSERT(pt_pcpu[cpu_id].ctx == &pt_pcpu_ctx[cpu_id],
			    ("%s: CPU mode tracing with non-cpu mode PT "
			    "context active",
			    __func__));
			pt_deinit_ctx(pt_pcpu[cpu_id].ctx);
			pt_pcpu[cpu_id].ctx = NULL;
			atomic_set_int(&pt_pcpu[cpu_id].in_pcint_handler, 0);
		}
	}

	return (0);
}

/*
 * Fetches current offset into the tracing buffer.
 */
static int
pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
    uint64_t *data)
{
	struct pt_buffer *buf;
	uint64_t offset;

	if (vm->ctx->mode == HWT_MODE_THREAD)
		buf = &((struct pt_ctx *)vm->thr->private)->buf;
	else
		buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
	offset = atomic_load_acq_64(&buf->offset);
	*curpage = offset / PAGE_SIZE;
	*curpage_offset = offset & PAGE_MASK;

	return (0);
}

/*
 * HWT thread creation hook.
 * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
 */
static int
pt_backend_alloc_thread(struct hwt_thread *thr)
{
	struct pt_ctx *pt_ctx;
	int error;

	/*
	 * Omit M_WAITOK since this might get invoked in a non-sleepable
	 * context.
	 */
	pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
	if (pt_ctx == NULL)
		return (ENOMEM);

	error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
	if (error) {
		free(pt_ctx, M_PT);
		return (error);
	}

	thr->private = pt_ctx;
	return (0);
}

/*
 * HWT thread teardown hook.
 */
static void
pt_backend_free_thread(struct hwt_thread *thr)
{
	struct pt_ctx *ctx;

	ctx = (struct pt_ctx *)thr->private;

	pt_deinit_ctx(ctx);
	free(ctx, M_PT);
}

static void
pt_backend_dump(int cpu_id)
{
}

static struct hwt_backend_ops pt_ops = {
	.hwt_backend_init = pt_backend_init,
	.hwt_backend_deinit = pt_backend_deinit,

	.hwt_backend_configure = pt_backend_configure,

	.hwt_backend_enable = pt_backend_enable,
	.hwt_backend_disable = pt_backend_disable,

#ifdef SMP
	.hwt_backend_enable_smp = pt_backend_enable_smp,
	.hwt_backend_disable_smp = pt_backend_disable_smp,
#endif

	.hwt_backend_read = pt_backend_read,
	.hwt_backend_dump = pt_backend_dump,

	.hwt_backend_thread_alloc = pt_backend_alloc_thread,
	.hwt_backend_thread_free = pt_backend_free_thread,
};

static struct hwt_backend backend = {
	.ops = &pt_ops,
	.name = "pt",
	.kva_req = 1,
};

/*
 * Reads the latest valid trace buffer offset and enqueues
 * a HWT_RECORD_BUFFER record.
 * Runs as a software interrupt handler scheduled from the ToPA
 * interrupt handler.
 */
static void
pt_send_buffer_record(void *arg)
{
	struct pt_cpu *cpu = (struct pt_cpu *)arg;
	struct hwt_record_entry record;
	struct pt_ctx *ctx = cpu->ctx;

	pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
	hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
}
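
/*
 * Clears the ToPA PMI status bit by writing it to the global status
 * reset MSR.
 */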
static void
pt_topa_status_clear(void)
{
	uint64_t reg;

	reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
	reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI;
	reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
	wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
}

/*
 * ToPA PMI handler.
 *
 * Invoked every time a ToPA entry marked with TOPA_INT is filled.
 * Schedules a software interrupt to enqueue a buffer record for userspace.
 * Re-enables the PC interrupt line as long as tracing is active.
 */
static int
pt_topa_intr(struct trapframe *tf)
{
	struct pt_buffer *buf;
	struct pt_cpu *cpu;
	struct pt_ctx *ctx;
	uint64_t reg;

	cpu = &pt_pcpu[curcpu];
	reg = rdmsr(MSR_IA_GLOBAL_STATUS);
	if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
		pt_topa_status_clear();
		return (0);
	}
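
	/*
	 * The trace stop protocol has already marked this CPU inactive;
	 * leave the buffer alone and do not re-enable the interrupt.
	 */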
	if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
		return (1);
	}
	atomic_set_int(&cpu->in_pcint_handler, 1);

	ctx = cpu->ctx;
	KASSERT(ctx != NULL,
	    ("%s: cpu %d: ToPA PMI interrupt without an active context",
	    __func__, curcpu));
	buf = &ctx->buf;
	KASSERT(buf->topa_hw != NULL,
	    ("%s: cpu %d: ToPA PMI interrupt with invalid buffer", __func__,
	    curcpu));
	pt_cpu_toggle_local(ctx->save_area, false);
	pt_update_buffer(buf);
	pt_topa_status_clear();

	if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
		swi_sched(cpu->swi_cookie, SWI_FROMNMI);
		pt_cpu_toggle_local(ctx->save_area, true);
		lapic_reenable_pcint();
	}
	atomic_set_int(&cpu->in_pcint_handler, 0);
	return (1);
}

/*
 * Module initialization.
 *
 * Saves all PT-related cpuid info, registers itself as a HWT backend,
 * and allocates metadata required to keep track of tracing operations
 * on each CPU.
 */
static int
pt_init(void)
{
	u_int cp[4];
	int error, i;

	dprintf("pt: Enumerating part 1\n");
	cpuid_count(CPUID_PT_LEAF, 0, cp);
	dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
	dprintf("pt: ebx %x\n", cp[1]);
	dprintf("pt: ecx %x\n", cp[2]);

	pt_info.l0_eax = cp[0];
	pt_info.l0_ebx = cp[1];
	pt_info.l0_ecx = cp[2];

	dprintf("pt: Enumerating part 2\n");
	cpuid_count(CPUID_PT_LEAF, 1, cp);
	dprintf("pt: eax %x\n", cp[0]);
	dprintf("pt: ebx %x\n", cp[1]);

	pt_info.l1_eax = cp[0];
	pt_info.l1_ebx = cp[1];

	error = hwt_backend_register(&backend);
	if (error != 0) {
		printf("pt: unable to register hwt backend, error %d\n", error);
		return (error);
	}
	pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
	    M_ZERO | M_WAITOK);
	pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
	    M_ZERO | M_WAITOK);

	for (i = 0; i < mp_ncpus; i++) {
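		/*
		 * One SWI handler per CPU; pt_topa_intr() schedules it so
		 * that buffer records are enqueued outside of NMI context.
		 */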
		error = swi_add(&clk_intr_event, "pt", pt_send_buffer_record,
		    &pt_pcpu[i], SWI_CLOCK, INTR_MPSAFE,
		    &pt_pcpu[i].swi_cookie);
		if (error != 0) {
			dprintf(
			    "%s: failed to add interrupt handler, error %d\n",
			    __func__, error);
			goto err;
		}
	}

	nmi_register_handler(pt_topa_intr);
	if (lapic_enable_pcint()) {
		initialized = true;
		return (0);
	} else {
		printf("pt: failed to set up interrupt line\n");
		error = ENXIO;
	}
err:
	nmi_remove_handler(pt_topa_intr);
	hwt_backend_unregister(&backend);

	for (i = 0; i < mp_ncpus; i++) {
		if (pt_pcpu[i].swi_cookie != 0)
			swi_remove(pt_pcpu[i].swi_cookie);
	}
	free(pt_pcpu, M_PT);
	free(pt_pcpu_ctx, M_PT);
	pt_pcpu = NULL;
	pt_pcpu_ctx = NULL;

	return (error);
}

/*
 * Checks whether the CPU supports Intel PT and
 * initializes XSAVE area info.
 *
 * The driver relies on XSAVE/XRSTOR PT extensions,
 * Table of Physical Addresses (ToPA) support, and
 * support for multiple ToPA entries.
 */
static bool
pt_supported(void)
{
	u_int cp[4];

	if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
		printf("pt: CPU does not support Intel Processor Trace\n");
		return (false);
	}
	if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
		printf("pt: XSAVE is not supported\n");
		return (false);
	}
	if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
		printf("pt: CPU does not support managing PT state using XSAVE\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
		printf("pt: XSAVE compaction is not supported\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
		printf("pt: CPU does not support XSAVES/XRSTORS\n");
		return (false);
	}

	/* Require ToPA support. */
	cpuid_count(CPUID_PT_LEAF, 0, cp);
	if ((cp[2] & CPUPT_TOPA) == 0) {
		printf("pt: ToPA is not supported\n");
		return (false);
	}
	if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
		printf("pt: multiple ToPA outputs are not supported\n");
		return (false);
	}

	pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
	pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
	pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
	    XFEATURE_ENABLED_PT, true, true);

	return (true);
}

static void
pt_deinit(void)
{
	int i;
	struct pt_cpu *cpu;

	if (!initialized)
		return;
	nmi_remove_handler(pt_topa_intr);
	lapic_disable_pcint();
	hwt_backend_unregister(&backend);

	for (i = 0; i < mp_ncpus; i++) {
		cpu = &pt_pcpu[i];
		swi_remove(cpu->swi_cookie);
	}

	free(pt_pcpu, M_PT);
	free(pt_pcpu_ctx, M_PT);
	pt_pcpu = NULL;
	pt_pcpu_ctx = NULL;
	initialized = false;
}

static int
pt_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		if (!pt_supported() || pt_init() != 0) {
			return (ENXIO);
		}
		break;
	case MOD_UNLOAD:
		pt_deinit();
		break;
	default:
		break;
	}

	return (0);
}

static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };

DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
MODULE_VERSION(intel_pt, 1);