GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/amd64/pt/pt.c
/*
 * Copyright (c) 2025 Bojan Novković <[email protected]>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

/*
 * hwt(4) Intel Processor Trace (PT) backend
 *
 * Driver Design Overview
 *
 * - Since PT is configured on a per-core basis, the driver uses
 *   'smp_rendezvous' to start and disable tracing on each target core.
 * - PT-specific resources are stored in a 'struct pt_ctx' context structure
 *   for each traced CPU core or thread. Upon initialization, a ToPA
 *   configuration is generated for each 'pt_ctx' structure using the HWT
 *   tracing buffers. The HWT tracing buffer is split into 4K ToPA entries.
 *   Currently, each 4K ToPA entry is configured to trigger an interrupt
 *   after it is filled.
 * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
 *   relevant PT registers. Every time a traced thread is switched
 *   out or in, its state will be saved to or loaded from its corresponding
 *   'pt_ctx' context.
 * - When tracing starts, the PT hardware will start writing data into the
 *   tracing buffer. When a TOPA_INT entry is filled, it will trigger an
 *   interrupt before continuing. The interrupt handler will then fetch the
 *   last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
 *   The driver is currently configured to use the NMI interrupt line.
 * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
 *   and uses the offsets to decode data from the tracing buffer.
 *
 * Future improvements and limitations
 *
 * - We currently configure the PT hardware to trigger an interrupt whenever
 *   a 4K ToPA entry is filled. While this is fine when tracing smaller
 *   functions or infrequent code paths, this will generate too much interrupt
 *   traffic when tracing hotter functions. A proper solution for this issue
 *   should estimate the amount of data generated by the current configuration
 *   and use it to determine interrupt frequency.
 *
 * - Support for more tracing options and PT features.
 */

#include <sys/systm.h>
#include <sys/hwt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/taskqueue.h>

#include <vm/vm.h>
#include <vm/vm_page.h>

#include <machine/atomic.h>
#include <machine/cpufunc.h>
#include <machine/fpu.h>
#include <machine/smp.h>
#include <machine/specialreg.h>

#include <x86/apicvar.h>
#include <x86/x86_var.h>

#include <dev/hwt/hwt_context.h>
#include <dev/hwt/hwt_vm.h>
#include <dev/hwt/hwt_backend.h>
#include <dev/hwt/hwt_config.h>
#include <dev/hwt/hwt_cpu.h>
#include <dev/hwt/hwt_record.h>
#include <dev/hwt/hwt_thread.h>

#include <amd64/pt/pt.h>

#ifdef PT_DEBUG
#define	dprintf(fmt, ...)	printf(fmt, ##__VA_ARGS__)
#else
#define	dprintf(fmt, ...)
#endif
#define	PT_SUPPORTED_FLAGS						\
	(RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT |	\
	    RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
#define	PT_XSAVE_MASK		(XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
#define	PT_XSTATE_BV		(PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
#define	PT_MAX_IP_RANGES	2

#define	PT_TOPA_MASK_PTRS	0x7f
#define	PT_TOPA_PAGE_MASK	0xffffff80
#define	PT_TOPA_PAGE_SHIFT	7

#define	CPUID_PT_LEAF		0x14

MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");

SDT_PROVIDER_DEFINE(pt);
SDT_PROBE_DEFINE(pt, , , topa__intr);

TASKQUEUE_FAST_DEFINE_THREAD(pt);

static void pt_send_buffer_record(void *arg, int pending __unused);
static int pt_topa_intr(struct trapframe *tf);

/*
 * Intel Processor Trace XSAVE-managed state.
 */
struct pt_ext_area {
	uint64_t rtit_ctl;
	uint64_t rtit_output_base;
	uint64_t rtit_output_mask_ptrs;
	uint64_t rtit_status;
	uint64_t rtit_cr3_match;
	uint64_t rtit_addr0_a;
	uint64_t rtit_addr0_b;
	uint64_t rtit_addr1_a;
	uint64_t rtit_addr1_b;
};

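/* State of a single ToPA trace buffer. */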
struct pt_buffer {
	uint64_t *topa_hw;	/* ToPA table entries. */
	size_t size;		/* Total trace buffer size in bytes. */
	struct mtx lock;	/* Lock for fields below. */
	vm_offset_t offset;	/* Offset into the current ToPA page. */
	uint64_t wrap_count;	/* Number of times the buffer has wrapped. */
	int curpage;		/* Index of the current ToPA entry. */
};

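/* PT tracing context for a single CPU core or thread. */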
struct pt_ctx {
	int id;
	struct pt_buffer buf;	/* ToPA buffer metadata */
	struct task task;	/* ToPA buffer notification task */
	struct hwt_context *hwt_ctx;
	uint8_t *save_area;	/* PT XSAVE area */
};

/* PT tracing contexts used for CPU mode. */
static struct pt_ctx *pt_pcpu_ctx;

enum pt_cpu_state {
	PT_DISABLED = 0,
	PT_STOPPED,
	PT_ACTIVE
};

static struct pt_cpu {
	struct pt_ctx *ctx;	/* active PT tracing context */
	enum pt_cpu_state state; /* used as part of trace stop protocol */
} *pt_pcpu;

/*
 * PT-related CPUID bits.
 */
static struct pt_cpu_info {
	uint32_t l0_eax;
	uint32_t l0_ebx;
	uint32_t l0_ecx;
	uint32_t l1_eax;
	uint32_t l1_ebx;
	size_t xsave_area_size;
	size_t xstate_hdr_offset;
	size_t pt_xsave_offset;
} pt_info __read_mostly;

static bool initialized = false;
static int cpu_mode_ctr = 0;

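/*
 * Accessors for the per-CPU tracing state.
 * Atomics are used since the state is also read from the ToPA PMI handler.
 */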
static __inline enum pt_cpu_state
pt_cpu_get_state(int cpu_id)
{
	return (atomic_load_int(&pt_pcpu[cpu_id].state));
}

static __inline void
pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
{
	atomic_store_int(&pt_pcpu[cpu_id].state, state);
}

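/* Returns the XSAVE header within a context's save area. */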
static __inline struct xstate_hdr *
pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
{
	return ((struct xstate_hdr *)(ctx->save_area +
	    pt_info.xstate_hdr_offset));
}

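/* Returns the PT XSAVE-managed state within a context's save area. */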
static __inline struct pt_ext_area *
pt_ctx_get_ext_area(struct pt_ctx *ctx)
{
	return ((struct pt_ext_area *)(ctx->save_area +
	    pt_info.pt_xsave_offset));
}

/*
 * Updates current trace buffer offset from the
 * ToPA MSRs. Records if the trace buffer wrapped.
 */
static __inline void
pt_update_buffer(struct pt_buffer *buf)
{
	uint64_t reg;
	int curpage;

	/* Update buffer offset. */
	reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
	curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
	mtx_lock_spin(&buf->lock);
	/* Check if the output wrapped. */
	if (buf->curpage > curpage)
		buf->wrap_count++;
	buf->curpage = curpage;
	buf->offset = reg >> 32;
	mtx_unlock_spin(&buf->lock);

	dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
	    buf->wrap_count, buf->curpage, buf->offset);
}

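/*
 * Translates the current buffer state into a HWT_RECORD_BUFFER record.
 * The reported offset is linearized across buffer wraps.
 * The buffer lock must be held by the caller.
 */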
static __inline void
pt_fill_buffer_record(int id, struct pt_buffer *buf,
    struct hwt_record_entry *rec)
{
	rec->record_type = HWT_RECORD_BUFFER;
	rec->buf_id = id;
	rec->curpage = buf->curpage;
	rec->offset = buf->offset + (buf->wrap_count * buf->size);
}

/*
 * Enables or disables tracing on curcpu
 * using the XSAVE/XRSTOR PT extensions.
 */
static void
pt_cpu_toggle_local(uint8_t *save_area, bool enable)
{
	u_long xcr0, cr0;
	u_long xss;

	cr0 = rcr0();
	if (cr0 & CR0_TS)
		clts();
	xcr0 = rxcr(XCR0);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
	xss = rdmsr(MSR_IA32_XSS);
	wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);

	if (!enable) {
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
		    ("%s: PT is disabled", __func__));
		xsaves(save_area, XFEATURE_ENABLED_PT);
	} else {
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
		    ("%s: PT is enabled", __func__));
		xrstors(save_area, XFEATURE_ENABLED_PT);
	}
	wrmsr(MSR_IA32_XSS, xss);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0);
	if (cr0 & CR0_TS)
		load_cr0(cr0);
}

/*
 * Starts PT tracing on 'curcpu'.
 */
static void
pt_cpu_start(void *dummy)
{
	struct pt_cpu *cpu;

	cpu = &pt_pcpu[curcpu];
	MPASS(cpu->ctx != NULL);

	dprintf("%s: curcpu %d\n", __func__, curcpu);
	load_cr4(rcr4() | CR4_XSAVE);
	wrmsr(MSR_IA32_RTIT_STATUS, 0);
	pt_cpu_set_state(curcpu, PT_ACTIVE);
	pt_cpu_toggle_local(cpu->ctx->save_area, true);
}

/*
 * Stops PT tracing on 'curcpu'.
 * Updates trace buffer offset to ensure
 * any data generated between the last interrupt
 * and the trace stop gets picked up by userspace.
 */
static void
pt_cpu_stop(void *dummy)
{
	struct pt_cpu *cpu;
	struct pt_ctx *ctx;

	/* Shutdown may occur before PT gets properly configured. */
	if (pt_cpu_get_state(curcpu) == PT_DISABLED)
		return;

	cpu = &pt_pcpu[curcpu];
	ctx = cpu->ctx;
	MPASS(ctx != NULL);
	dprintf("%s: curcpu %d\n", __func__, curcpu);

	pt_cpu_set_state(curcpu, PT_STOPPED);
	pt_cpu_toggle_local(cpu->ctx->save_area, false);
	pt_update_buffer(&ctx->buf);
}

/*
 * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
 * The HWT trace buffer is split into 4K ToPA table entries and used
 * as a circular buffer, meaning that the last ToPA entry points to
 * the first ToPA entry. Each entry is configured to raise an
 * interrupt after being filled.
 */
static int
pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
{
	struct pt_buffer *buf;
	size_t topa_size;
	int i;

	topa_size = TOPA_SIZE_4K;
	buf = &ctx->buf;

	KASSERT(buf->topa_hw == NULL,
	    ("%s: ToPA info already exists", __func__));
	buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
	    M_ZERO | M_WAITOK);
	dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
	buf->size = vm->npages * PAGE_SIZE;
	for (i = 0; i < vm->npages; i++) {
		buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
		/*
		 * XXX: TOPA_INT should ideally be set according to
		 * expected amount of incoming trace data. Too few TOPA_INT
		 * entries will not trigger interrupts often enough when
		 * tracing smaller functions.
		 */
		buf->topa_hw[i] |= TOPA_INT;
	}
	buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;

	return (0);
}

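/*
 * Illustrative sketch (comment only, not compiled): for an HWT buffer of
 * N pages, pt_topa_prepare() above produces a ToPA table of the following
 * shape, where the final entry loops back to the table itself to form a
 * circular buffer:
 *
 *	topa_hw[0]     = physaddr(page 0)     | TOPA_SIZE_4K | TOPA_INT
 *	...
 *	topa_hw[N - 1] = physaddr(page N - 1) | TOPA_SIZE_4K | TOPA_INT
 *	topa_hw[N]     = physaddr(topa_hw)    | TOPA_END
 */
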
/*
 * Configures IP filtering for trace generation.
 * A maximum of 2 ranges can be specified due to
 * limitations imposed by the XSAVE/XRSTOR PT extensions.
 */
static int
pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
{
	struct pt_ext_area *pt_ext;
	int nranges_supp, n, error = 0;

	pt_ext = pt_ctx_get_ext_area(ctx);
	if (pt_info.l0_ebx & CPUPT_IPF) {
		nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
		    CPUPT_NADDR_S;

		if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
			nranges_supp = PT_IP_FILTER_MAX_RANGES;
		n = cfg->nranges;
		if (n > nranges_supp) {
			printf("%s: %d IP filtering ranges requested, CPU "
			    "supports %d, truncating\n",
			    __func__, n, nranges_supp);
			n = nranges_supp;
		}

		switch (n) {
		case 2:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
			pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
			pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
			/* FALLTHROUGH */
		case 1:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
			pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
			pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
			break;
		default:
			error = EINVAL;
			break;
		}
	} else
		error = ENXIO;

	return (error);
}

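/*
 * Initializes a PT tracing context: allocates the XSAVE save area and
 * builds a ToPA table on top of the HWT buffer 'vm'.
 */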
static int
pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
{

	dprintf("%s: ctx id %d\n", __func__, ctx_id);

	KASSERT(pt_ctx->buf.topa_hw == NULL,
	    ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));

	memset(pt_ctx, 0, sizeof(struct pt_ctx));
	mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
	pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
	    M_PT, M_NOWAIT | M_ZERO);
	if (pt_ctx->save_area == NULL)
		return (ENOMEM);
	dprintf("%s: preparing ToPA buffer\n", __func__);
	if (pt_topa_prepare(pt_ctx, vm) != 0) {
		dprintf("%s: failed to prepare ToPA buffer\n", __func__);
		free(pt_ctx->save_area, M_PT);
		return (ENOMEM);
	}

	pt_ctx->id = ctx_id;
	TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);

	return (0);
}

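/* Releases the ToPA table and XSAVE area held by a tracing context. */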
static void
pt_deinit_ctx(struct pt_ctx *pt_ctx)
{

	if (pt_ctx->buf.topa_hw != NULL)
		free(pt_ctx->buf.topa_hw, M_PT);
	if (pt_ctx->save_area != NULL)
		free(pt_ctx->save_area, M_PT);
	memset(pt_ctx, 0, sizeof(*pt_ctx));
	pt_ctx->buf.topa_hw = NULL;
}

/*
 * HWT backend configuration method.
 *
 * Checks and translates the user-defined configuration to a
 * set of PT tracing features. Uses the feature set to initialize
 * the tracing context for the target CPU or thread.
 */
static int
pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
{
	struct hwt_cpu *hwt_cpu;
	struct hwt_thread *thr;
	struct pt_ctx *pt_ctx;
	struct pt_cpu_config *cfg;
	struct pt_ext_area *pt_ext;
	struct xstate_hdr *hdr;
	int error;

	dprintf("%s\n", __func__);

	cfg = (struct pt_cpu_config *)ctx->config;
	pt_ctx = NULL;

	/* Clear any flags we don't support yet. */
	cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
	if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
		if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
			printf("%s: CPU does not support generating MTC "
			    "packets\n", __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
		if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
			printf("%s: CPU does not support CR3 filtering\n",
			    __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
		if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
			printf("%s: CPU does not support TNT\n", __func__);
			return (ENXIO);
		}
	}
	/* TODO: support for more config bits. */

	if (ctx->mode == HWT_MODE_CPU) {
		TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
			if (hwt_cpu->cpu_id != cpu_id)
				continue;
			pt_ctx = &pt_pcpu_ctx[cpu_id];
			break;
		}
	} else {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			if (thr->thread_id != thread_id)
				continue;
			KASSERT(thr->private != NULL,
			    ("%s: hwt thread private not set, thr %p",
			    __func__, thr));
			pt_ctx = (struct pt_ctx *)thr->private;
			break;
		}
	}
	if (pt_ctx == NULL)
		return (ENOENT);

	dprintf("%s: preparing MSRs\n", __func__);
	pt_ext = pt_ctx_get_ext_area(pt_ctx);
	hdr = pt_ctx_get_xstate_hdr(pt_ctx);

	pt_ext->rtit_ctl |= cfg->rtit_ctl;
	if (cfg->nranges != 0) {
		dprintf("%s: preparing IPF ranges\n", __func__);
		if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
			return (error);
	}
	pt_ctx->hwt_ctx = ctx;
	pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
	pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
	pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
	hdr->xstate_bv = XFEATURE_ENABLED_PT;
	hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
	    XSTATE_XCOMP_BV_COMPACT;
	pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
	pt_pcpu[cpu_id].ctx = pt_ctx;
	pt_cpu_set_state(cpu_id, PT_STOPPED);

	return (0);
}

/*
 * hwt backend trace start operation. CPU affine.
 */
static void
pt_backend_enable(struct hwt_context *ctx, int cpu_id)
{
	if (ctx->mode == HWT_MODE_CPU)
		return;

	KASSERT(curcpu == cpu_id,
	    ("%s: attempting to start PT on another cpu", __func__));
	pt_cpu_start(NULL);
	CPU_SET(cpu_id, &ctx->cpu_map);
}

/*
 * hwt backend trace stop operation. CPU affine.
 */
static void
pt_backend_disable(struct hwt_context *ctx, int cpu_id)
{
	struct pt_cpu *cpu;

	if (ctx->mode == HWT_MODE_CPU)
		return;

	KASSERT(curcpu == cpu_id,
	    ("%s: attempting to disable PT on another cpu", __func__));
	pt_cpu_stop(NULL);
	CPU_CLR(cpu_id, &ctx->cpu_map);
	cpu = &pt_pcpu[cpu_id];
	cpu->ctx = NULL;
}

/*
 * hwt backend trace start operation for remote CPUs.
 */
static int
pt_backend_enable_smp(struct hwt_context *ctx)
{

	dprintf("%s\n", __func__);
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 1) != 0)
		return (-1);

	KASSERT(ctx->mode == HWT_MODE_CPU,
	    ("%s: should only be used for CPU mode", __func__));
	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);

	return (0);
}

/*
 * hwt backend trace stop operation for remote CPUs.
 */
static int
pt_backend_disable_smp(struct hwt_context *ctx)
{

	dprintf("%s\n", __func__);
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 0) == 0)
		return (-1);

	if (CPU_EMPTY(&ctx->cpu_map)) {
		dprintf("%s: empty cpu map\n", __func__);
		return (-1);
	}
	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);

	return (0);
}

/*
 * HWT backend initialization method.
 *
 * Initializes the tracing contexts used for HWT_MODE_CPU.
 */
static int
pt_backend_init(struct hwt_context *ctx)
{
	struct hwt_cpu *hwt_cpu;
	int error;

	dprintf("%s\n", __func__);
	if (ctx->mode == HWT_MODE_CPU) {
		TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
			error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
			    hwt_cpu->vm, hwt_cpu->cpu_id);
			if (error)
				return (error);
		}
	}

	return (0);
}

/*
 * HWT backend teardown method.
 *
 * Stops tracing on all active CPUs and releases all previously
 * allocated ToPA metadata.
 */
static int
pt_backend_deinit(struct hwt_context *ctx)
{
	struct pt_ctx *pt_ctx;
	struct hwt_thread *thr;
	int cpu_id;

	dprintf("%s\n", __func__);

	pt_backend_disable_smp(ctx);
	if (ctx->mode == HWT_MODE_THREAD) {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			KASSERT(thr->private != NULL,
			    ("%s: thr->private not set", __func__));
			pt_ctx = (struct pt_ctx *)thr->private;
			pt_deinit_ctx(pt_ctx);
		}
	} else {
		CPU_FOREACH(cpu_id) {
			if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
				continue;
			if (pt_pcpu[cpu_id].ctx != NULL) {
				KASSERT(pt_pcpu[cpu_id].ctx ==
				    &pt_pcpu_ctx[cpu_id],
				    ("%s: CPU mode tracing with non-CPU mode "
				    "PT context active", __func__));
				pt_pcpu[cpu_id].ctx = NULL;
			}
			pt_ctx = &pt_pcpu_ctx[cpu_id];
			pt_deinit_ctx(pt_ctx);
			memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
		}
	}

	return (0);
}

/*
 * Fetches current offset into the tracing buffer.
 */
static int
pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
    uint64_t *data)
{
	struct pt_buffer *buf;

	if (vm->ctx->mode == HWT_MODE_THREAD)
		buf = &((struct pt_ctx *)vm->thr->private)->buf;
	else
		buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
	mtx_lock_spin(&buf->lock);
	*curpage = buf->curpage;
	*curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
	mtx_unlock_spin(&buf->lock);

	return (0);
}

/*
 * HWT thread creation hook.
 * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
 */
static int
pt_backend_alloc_thread(struct hwt_thread *thr)
{
	struct pt_ctx *pt_ctx;
	int error;

	/*
	 * Omit M_WAITOK since this might get invoked in a
	 * non-sleepable context.
	 */
	pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
	if (pt_ctx == NULL)
		return (ENOMEM);

	error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
	if (error) {
		free(pt_ctx, M_PT);
		return (error);
	}

	thr->private = pt_ctx;
	return (0);
}

/*
 * HWT thread teardown hook.
 */
static void
pt_backend_free_thread(struct hwt_thread *thr)
{
	struct pt_ctx *ctx;

	ctx = (struct pt_ctx *)thr->private;

	pt_deinit_ctx(ctx);
	free(ctx, M_PT);
}

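/*
 * HWT backend dump method. Currently a no-op.
 */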
static void
pt_backend_dump(int cpu_id)
{
}

static struct hwt_backend_ops pt_ops = {
	.hwt_backend_init = pt_backend_init,
	.hwt_backend_deinit = pt_backend_deinit,

	.hwt_backend_configure = pt_backend_configure,

	.hwt_backend_enable = pt_backend_enable,
	.hwt_backend_disable = pt_backend_disable,

#ifdef SMP
	.hwt_backend_enable_smp = pt_backend_enable_smp,
	.hwt_backend_disable_smp = pt_backend_disable_smp,
#endif

	.hwt_backend_read = pt_backend_read,
	.hwt_backend_dump = pt_backend_dump,

	.hwt_backend_thread_alloc = pt_backend_alloc_thread,
	.hwt_backend_thread_free = pt_backend_free_thread,
};

static struct hwt_backend backend = {
	.ops = &pt_ops,
	.name = "pt",
	.kva_req = 1,
};

/*
 * Reads the latest valid trace buffer offset and enqueues
 * a HWT_RECORD_BUFFER record.
 * Used as a taskqueue routine from the ToPA interrupt handler.
 */
static void
pt_send_buffer_record(void *arg, int pending __unused)
{
	struct hwt_record_entry record;
	struct pt_ctx *ctx = (struct pt_ctx *)arg;

	/* Prepare buffer record. */
	mtx_lock_spin(&ctx->buf.lock);
	pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
	mtx_unlock_spin(&ctx->buf.lock);
	hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
}

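/*
 * Clears the ToPA PMI status bit by writing it to the
 * global status reset MSR.
 */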
static void
pt_topa_status_clear(void)
{
	uint64_t reg;

	reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
	reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI;
	reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
	wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
}

/*
 * ToPA PMI handler.
 *
 * Invoked every time a ToPA entry marked with TOPA_INT is filled.
 * Uses a taskqueue to enqueue a buffer record for userspace.
 * Re-enables the performance counter interrupt line as long as
 * tracing is active.
 */
static int
pt_topa_intr(struct trapframe *tf)
{
	struct pt_buffer *buf;
	struct pt_ctx *ctx;
	uint64_t reg;

	SDT_PROBE0(pt, , , topa__intr);

	if (pt_cpu_get_state(curcpu) != PT_ACTIVE)
		return (0);

	reg = rdmsr(MSR_IA_GLOBAL_STATUS);
	if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
		/* ACK spurious or leftover interrupt. */
		pt_topa_status_clear();
		return (1);
	}

	ctx = pt_pcpu[curcpu].ctx;
	buf = &ctx->buf;
	KASSERT(buf->topa_hw != NULL,
	    ("%s: ToPA PMI interrupt with invalid buffer", __func__));

	pt_cpu_toggle_local(ctx->save_area, false);
	pt_update_buffer(buf);
	pt_topa_status_clear();
	taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
	    TASKQUEUE_FAIL_IF_PENDING);

	if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
		pt_cpu_toggle_local(ctx->save_area, true);
		lapic_reenable_pcint();
	}
	return (1);
}

/*
 * Module initialization.
 *
 * Saves all PT-related cpuid info, registers itself as a HWT backend,
 * and allocates metadata required to keep track of tracing operations
 * on each CPU.
 */
static int
pt_init(void)
{
	u_int cp[4];
	int error;

	dprintf("pt: Enumerating part 1\n");
	cpuid_count(CPUID_PT_LEAF, 0, cp);
	dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
	dprintf("pt: ebx %x\n", cp[1]);
	dprintf("pt: ecx %x\n", cp[2]);

	pt_info.l0_eax = cp[0];
	pt_info.l0_ebx = cp[1];
	pt_info.l0_ecx = cp[2];

	dprintf("pt: Enumerating part 2\n");
	cpuid_count(CPUID_PT_LEAF, 1, cp);
	dprintf("pt: eax %x\n", cp[0]);
	dprintf("pt: ebx %x\n", cp[1]);

	pt_info.l1_eax = cp[0];
	pt_info.l1_ebx = cp[1];

	error = hwt_backend_register(&backend);
	if (error != 0) {
		printf("pt: unable to register hwt backend, error %d\n", error);
		return (error);
	}
	pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
	    M_ZERO | M_WAITOK);
	pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
	    M_ZERO | M_WAITOK);

	nmi_register_handler(pt_topa_intr);
	if (!lapic_enable_pcint()) {
		nmi_remove_handler(pt_topa_intr);
		hwt_backend_unregister(&backend);
		free(pt_pcpu, M_PT);
		free(pt_pcpu_ctx, M_PT);
		pt_pcpu = NULL;
		pt_pcpu_ctx = NULL;
		printf("pt: failed to set up interrupt line\n");
		return (ENXIO);
	}
	initialized = true;

	return (0);
}

/*
 * Checks whether the CPU supports Intel PT and
 * initializes XSAVE area info.
 *
 * The driver relies on XSAVE/XRSTOR PT extensions,
 * Table of Physical Addresses (ToPA) support, and
 * support for multiple ToPA entries.
 */
static bool
pt_supported(void)
{
	u_int cp[4];

	if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
		printf("pt: CPU does not support Intel Processor Trace\n");
		return (false);
	}
	if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
		printf("pt: XSAVE is not supported\n");
		return (false);
	}
	if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
		printf("pt: CPU does not support managing PT state using XSAVE\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
		printf("pt: XSAVE compaction is not supported\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
		printf("pt: CPU does not support XSAVES/XRSTORS\n");
		return (false);
	}

	/* Require ToPA support. */
	cpuid_count(CPUID_PT_LEAF, 0, cp);
	if ((cp[2] & CPUPT_TOPA) == 0) {
		printf("pt: ToPA is not supported\n");
		return (false);
	}
	if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
		printf("pt: multiple ToPA outputs are not supported\n");
		return (false);
	}

	pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
	pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
	pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
	    XFEATURE_ENABLED_PT, true, true);

	return (true);
}

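/*
 * Module teardown.
 *
 * Removes the ToPA NMI handler, disables the local APIC performance
 * counter interrupt, unregisters the backend, and frees per-CPU metadata.
 */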
static void
pt_deinit(void)
{
	if (!initialized)
		return;
	nmi_remove_handler(pt_topa_intr);
	lapic_disable_pcint();
	hwt_backend_unregister(&backend);
	free(pt_pcpu, M_PT);
	free(pt_pcpu_ctx, M_PT);
	pt_pcpu = NULL;
	pt_pcpu_ctx = NULL;
	initialized = false;
}

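/*
 * Module event handler: probes for PT support on load and
 * tears the driver down on unload.
 */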
static int
pt_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		if (!pt_supported() || pt_init() != 0) {
			return (ENXIO);
		}
		break;
	case MOD_UNLOAD:
		pt_deinit();
		break;
	default:
		break;
	}

	return (0);
}

static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };

DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
MODULE_VERSION(intel_pt, 1);