GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/amd64/pt/pt.c
/*
 * Copyright (c) 2025 Bojan Novković <[email protected]>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

/*
 * hwt(4) Intel Processor Trace (PT) backend
 *
 * Driver Design Overview
 *
 * - Since PT is configured on a per-core basis, the driver uses
 *   'smp_rendezvous' to start and disable tracing on each target core.
 * - PT-specific resources are stored in a 'struct pt_ctx' context structure
 *   for each traced CPU core or thread. Upon initialization, a ToPA
 *   configuration is generated for each 'pt_ctx' structure using the HWT
 *   tracing buffers. The HWT tracing buffer is split into 4K ToPA entries.
 *   Currently, each 4K ToPA entry is configured to trigger an interrupt
 *   after it is filled.
 * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
 *   relevant PT registers. Every time a traced thread is switched
 *   out or in, its state will be saved to or loaded from its corresponding
 *   'pt_ctx' context.
 * - When tracing starts, the PT hardware will start writing data into the
 *   tracing buffer. When a TOPA_INT entry is filled, it will trigger an
 *   interrupt before continuing. The interrupt handler will then fetch the
 *   last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
 *   The driver is currently configured to use the NMI interrupt line.
 * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
 *   and uses the offsets to decode data from the tracing buffer.
 *
 * Future improvements and limitations
 *
 * - We currently configure the PT hardware to trigger an interrupt whenever
 *   a 4K ToPA entry is filled. While this is fine when tracing smaller
 *   functions or infrequent code paths, this will generate too much interrupt
 *   traffic when tracing hotter functions. A proper solution for this issue
 *   should estimate the amount of data generated by the current configuration
 *   and use it to determine interrupt frequency.
 *
 * - Support for more tracing options and PT features.
 */

#include <sys/systm.h>
#include <sys/hwt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/taskqueue.h>

#include <vm/vm.h>
#include <vm/vm_page.h>

#include <machine/atomic.h>
#include <machine/cpufunc.h>
#include <machine/fpu.h>
#include <machine/smp.h>
#include <machine/specialreg.h>

#include <x86/apicvar.h>
#include <x86/x86_var.h>

#include <dev/hwt/hwt_context.h>
#include <dev/hwt/hwt_vm.h>
#include <dev/hwt/hwt_backend.h>
#include <dev/hwt/hwt_config.h>
#include <dev/hwt/hwt_cpu.h>
#include <dev/hwt/hwt_record.h>
#include <dev/hwt/hwt_thread.h>

#include <amd64/pt/pt.h>

#ifdef PT_DEBUG
#define	dprintf(fmt, ...)	printf(fmt, ##__VA_ARGS__)
#else
#define	dprintf(fmt, ...)
#endif
#define	PT_SUPPORTED_FLAGS						\
	(RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT |	\
	    RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
#define	PT_XSAVE_MASK		(XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
#define	PT_XSTATE_BV		(PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
#define	PT_MAX_IP_RANGES	2

#define	PT_TOPA_MASK_PTRS	0x7f
#define	PT_TOPA_PAGE_MASK	0xffffff80
#define	PT_TOPA_PAGE_SHIFT	7

#define	CPUID_PT_LEAF		0x14

MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");

SDT_PROVIDER_DEFINE(pt);
SDT_PROBE_DEFINE(pt, , , topa__intr);

TASKQUEUE_FAST_DEFINE_THREAD(pt);

static void pt_send_buffer_record(void *arg, int pending __unused);
static int pt_topa_intr(struct trapframe *tf);

/*
 * Intel Processor Trace XSAVE-managed state.
 */
struct pt_ext_area {
	uint64_t rtit_ctl;
	uint64_t rtit_output_base;
	uint64_t rtit_output_mask_ptrs;
	uint64_t rtit_status;
	uint64_t rtit_cr3_match;
	uint64_t rtit_addr0_a;
	uint64_t rtit_addr0_b;
	uint64_t rtit_addr1_a;
	uint64_t rtit_addr1_b;
};

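/* State of a single ToPA trace buffer. */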
struct pt_buffer {
	uint64_t *topa_hw;	/* ToPA table entries. */
	size_t size;		/* Total trace buffer size in bytes. */
	struct mtx lock;	/* Lock for fields below. */
	vm_offset_t offset;	/* Offset into the current ToPA page. */
	uint64_t wrap_count;	/* Number of times the buffer has wrapped. */
	int curpage;		/* Index of the current ToPA entry. */
};

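/* PT tracing context for a single CPU core or thread. */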
struct pt_ctx {
	int id;
	struct pt_buffer buf;	/* ToPA buffer metadata */
	struct task task;	/* ToPA buffer notification task */
	struct hwt_context *hwt_ctx;
	uint8_t *save_area;	/* PT XSAVE area */
};

/* PT tracing contexts used for CPU mode. */
static struct pt_ctx *pt_pcpu_ctx;

enum pt_cpu_state {
	PT_DISABLED = 0,
	PT_STOPPED,
	PT_ACTIVE
};

static struct pt_cpu {
	struct pt_ctx *ctx;	/* active PT tracing context */
	enum pt_cpu_state state; /* used as part of trace stop protocol */
} *pt_pcpu;

/*
 * PT-related CPUID bits.
 */
static struct pt_cpu_info {
	uint32_t l0_eax;
	uint32_t l0_ebx;
	uint32_t l0_ecx;
	uint32_t l1_eax;
	uint32_t l1_ebx;
	size_t xsave_area_size;
	size_t xstate_hdr_offset;
	size_t pt_xsave_offset;
} pt_info __read_mostly;

static bool initialized = false;
static int cpu_mode_ctr = 0;

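/*
 * Accessors for the per-CPU tracing state.
 * Atomics are used since the state is also read from the ToPA PMI handler.
 */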
static __inline enum pt_cpu_state
pt_cpu_get_state(int cpu_id)
{
	return (atomic_load_int(&pt_pcpu[cpu_id].state));
}

static __inline void
pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
{
	atomic_store_int(&pt_pcpu[cpu_id].state, state);
}

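/* Returns the XSAVE header within a context's save area. */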
static __inline struct xstate_hdr *
pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
{
	return ((struct xstate_hdr *)(ctx->save_area +
	    pt_info.xstate_hdr_offset));
}

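/* Returns the PT XSAVE-managed state within a context's save area. */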
static __inline struct pt_ext_area *
pt_ctx_get_ext_area(struct pt_ctx *ctx)
{
	return ((struct pt_ext_area *)(ctx->save_area +
	    pt_info.pt_xsave_offset));
}

/*
 * Updates current trace buffer offset from the
 * ToPA MSRs. Records if the trace buffer wrapped.
 */
static __inline void
pt_update_buffer(struct pt_buffer *buf)
{
	uint64_t reg;
	int curpage;

	/* Update buffer offset. */
	reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
	curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
	mtx_lock_spin(&buf->lock);
	/* Check if the output wrapped. */
	if (buf->curpage > curpage)
		buf->wrap_count++;
	buf->curpage = curpage;
	buf->offset = reg >> 32;
	mtx_unlock_spin(&buf->lock);

	dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
	    buf->wrap_count, buf->curpage, buf->offset);
}

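/*
 * Translates the current buffer state into a HWT_RECORD_BUFFER record.
 * The reported offset is linearized across buffer wraps.
 * The buffer lock must be held by the caller.
 */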
static __inline void
pt_fill_buffer_record(int id, struct pt_buffer *buf,
    struct hwt_record_entry *rec)
{
	rec->record_type = HWT_RECORD_BUFFER;
	rec->buf_id = id;
	rec->curpage = buf->curpage;
	rec->offset = buf->offset + (buf->wrap_count * buf->size);
}

/*
 * Enables or disables tracing on curcpu
 * using the XSAVE/XRSTOR PT extensions.
 */
static void
pt_cpu_toggle_local(uint8_t *save_area, bool enable)
{
	u_long xcr0, cr0;
	u_long xss;

	cr0 = rcr0();
	if (cr0 & CR0_TS)
		clts();
	xcr0 = rxcr(XCR0);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
	xss = rdmsr(MSR_IA32_XSS);
	wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);

	if (!enable) {
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
		    ("%s: PT is disabled", __func__));
		xsaves(save_area, XFEATURE_ENABLED_PT);
	} else {
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
		    ("%s: PT is enabled", __func__));
		xrstors(save_area, XFEATURE_ENABLED_PT);
	}
	wrmsr(MSR_IA32_XSS, xss);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0);
	if (cr0 & CR0_TS)
		load_cr0(cr0);
}

/*
 * Starts PT tracing on 'curcpu'.
 */
static void
pt_cpu_start(void *dummy)
{
	struct pt_cpu *cpu;

	cpu = &pt_pcpu[curcpu];
	MPASS(cpu->ctx != NULL);

	dprintf("%s: curcpu %d\n", __func__, curcpu);
	load_cr4(rcr4() | CR4_XSAVE);
	wrmsr(MSR_IA32_RTIT_STATUS, 0);
	pt_cpu_set_state(curcpu, PT_ACTIVE);
	pt_cpu_toggle_local(cpu->ctx->save_area, true);
}

/*
 * Stops PT tracing on 'curcpu'.
 * Updates trace buffer offset to ensure
 * any data generated between the last interrupt
 * and the trace stop gets picked up by userspace.
 */
static void
pt_cpu_stop(void *dummy)
{
	struct pt_cpu *cpu;
	struct pt_ctx *ctx;

	/* Shutdown may occur before PT gets properly configured. */
	if (pt_cpu_get_state(curcpu) == PT_DISABLED)
		return;

	cpu = &pt_pcpu[curcpu];
	ctx = cpu->ctx;
	MPASS(ctx != NULL);
	dprintf("%s: curcpu %d\n", __func__, curcpu);

	pt_cpu_set_state(curcpu, PT_STOPPED);
	pt_cpu_toggle_local(cpu->ctx->save_area, false);
	pt_update_buffer(&ctx->buf);
}

/*
 * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
 * The HWT trace buffer is split into 4K ToPA table entries and used
 * as a circular buffer, meaning that the last ToPA entry points to
 * the first ToPA entry. Each entry is configured to raise an
 * interrupt after being filled.
 */
static int
pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
{
	struct pt_buffer *buf;
	size_t topa_size;
	int i;

	topa_size = TOPA_SIZE_4K;
	buf = &ctx->buf;

	KASSERT(buf->topa_hw == NULL,
	    ("%s: ToPA info already exists", __func__));
	buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
	    M_ZERO | M_WAITOK);
	dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
	buf->size = vm->npages * PAGE_SIZE;
	for (i = 0; i < vm->npages; i++) {
		buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
		/*
		 * XXX: TOPA_INT should ideally be set according to
		 * expected amount of incoming trace data. Too few TOPA_INT
		 * entries will not trigger interrupts often enough when
		 * tracing smaller functions.
		 */
		buf->topa_hw[i] |= TOPA_INT;
	}
	buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;

	return (0);
}

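/*
 * Illustrative sketch (comment only, not compiled): for an HWT buffer of
 * N pages, pt_topa_prepare() above produces a ToPA table of the following
 * shape, where the final entry loops back to the table itself to form a
 * circular buffer:
 *
 *	topa_hw[0]     = physaddr(page 0)     | TOPA_SIZE_4K | TOPA_INT
 *	...
 *	topa_hw[N - 1] = physaddr(page N - 1) | TOPA_SIZE_4K | TOPA_INT
 *	topa_hw[N]     = physaddr(topa_hw)    | TOPA_END
 */
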
/*
 * Configures IP filtering for trace generation.
 * A maximum of 2 ranges can be specified due to
 * limitations imposed by the XSAVE/XRSTOR PT extensions.
 */
static int
pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
{
	struct pt_ext_area *pt_ext;
	int nranges_supp, n, error = 0;

	pt_ext = pt_ctx_get_ext_area(ctx);
	if (pt_info.l0_ebx & CPUPT_IPF) {
		nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
		    CPUPT_NADDR_S;

		if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
			nranges_supp = PT_IP_FILTER_MAX_RANGES;
		n = cfg->nranges;
		if (n > nranges_supp) {
			printf("%s: %d IP filtering ranges requested, CPU "
			    "supports %d, truncating\n",
			    __func__, n, nranges_supp);
			n = nranges_supp;
		}

		switch (n) {
		case 2:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
			pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
			pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
			/* FALLTHROUGH */
		case 1:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
			pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
			pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
			break;
		default:
			error = EINVAL;
			break;
		}
	} else
		error = ENXIO;

	return (error);
}

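/*
 * Initializes a PT tracing context: allocates the XSAVE save area and
 * builds a ToPA table on top of the HWT buffer 'vm'.
 */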
static int
pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
{

	dprintf("%s: ctx id %d\n", __func__, ctx_id);

	KASSERT(pt_ctx->buf.topa_hw == NULL,
	    ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));

	memset(pt_ctx, 0, sizeof(struct pt_ctx));
	mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
	pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
	    M_PT, M_NOWAIT | M_ZERO);
	if (pt_ctx->save_area == NULL)
		return (ENOMEM);
	dprintf("%s: preparing ToPA buffer\n", __func__);
	if (pt_topa_prepare(pt_ctx, vm) != 0) {
		dprintf("%s: failed to prepare ToPA buffer\n", __func__);
		free(pt_ctx->save_area, M_PT);
		return (ENOMEM);
	}

	pt_ctx->id = ctx_id;
	TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);

	return (0);
}

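/* Releases the ToPA table and XSAVE area held by a tracing context. */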
static void
pt_deinit_ctx(struct pt_ctx *pt_ctx)
{

	if (pt_ctx->buf.topa_hw != NULL)
		free(pt_ctx->buf.topa_hw, M_PT);
	if (pt_ctx->save_area != NULL)
		free(pt_ctx->save_area, M_PT);
	memset(pt_ctx, 0, sizeof(*pt_ctx));
	pt_ctx->buf.topa_hw = NULL;
}

/*
 * HWT backend configuration method.
 *
 * Checks and translates the user-defined configuration to a
 * set of PT tracing features. Uses the feature set to initialize
 * the tracing context for the target CPU or thread.
 */
static int
pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
{
	struct hwt_cpu *hwt_cpu;
	struct hwt_thread *thr;
	struct pt_ctx *pt_ctx;
	struct pt_cpu_config *cfg;
	struct pt_ext_area *pt_ext;
	struct xstate_hdr *hdr;
	int error;

	dprintf("%s\n", __func__);

	cfg = (struct pt_cpu_config *)ctx->config;
	pt_ctx = NULL;

	/* Clear any flags we don't support yet. */
	cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
	if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
		if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
			printf("%s: CPU does not support generating MTC "
			    "packets\n", __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
		if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
			printf("%s: CPU does not support CR3 filtering\n",
			    __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
		if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
			printf("%s: CPU does not support TNT\n", __func__);
			return (ENXIO);
		}
	}
	/* TODO: support for more config bits. */

	if (ctx->mode == HWT_MODE_CPU) {
		TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
			if (hwt_cpu->cpu_id != cpu_id)
				continue;
			pt_ctx = &pt_pcpu_ctx[cpu_id];
			break;
		}
	} else {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			if (thr->thread_id != thread_id)
				continue;
			KASSERT(thr->private != NULL,
			    ("%s: hwt thread private not set, thr %p",
			    __func__, thr));
			pt_ctx = (struct pt_ctx *)thr->private;
			break;
		}
	}
	if (pt_ctx == NULL)
		return (ENOENT);

	dprintf("%s: preparing MSRs\n", __func__);
	pt_ext = pt_ctx_get_ext_area(pt_ctx);
	hdr = pt_ctx_get_xstate_hdr(pt_ctx);

	pt_ext->rtit_ctl |= cfg->rtit_ctl;
	if (cfg->nranges != 0) {
		dprintf("%s: preparing IPF ranges\n", __func__);
		if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
			return (error);
	}
	pt_ctx->hwt_ctx = ctx;
	pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
	pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
	pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
	hdr->xstate_bv = XFEATURE_ENABLED_PT;
	hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
	    XSTATE_XCOMP_BV_COMPACT;
	pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
	pt_pcpu[cpu_id].ctx = pt_ctx;
	pt_cpu_set_state(cpu_id, PT_STOPPED);

	return (0);
}

/*
 * hwt backend trace start operation. CPU affine.
 */
static void
pt_backend_enable(struct hwt_context *ctx, int cpu_id)
{
	if (ctx->mode == HWT_MODE_CPU)
		return;

	KASSERT(curcpu == cpu_id,
	    ("%s: attempting to start PT on another cpu", __func__));
	pt_cpu_start(NULL);
	CPU_SET(cpu_id, &ctx->cpu_map);
}

/*
 * hwt backend trace stop operation. CPU affine.
 */
static void
pt_backend_disable(struct hwt_context *ctx, int cpu_id)
{
	struct pt_cpu *cpu;

	if (ctx->mode == HWT_MODE_CPU)
		return;

	KASSERT(curcpu == cpu_id,
	    ("%s: attempting to disable PT on another cpu", __func__));
	pt_cpu_stop(NULL);
	CPU_CLR(cpu_id, &ctx->cpu_map);
	cpu = &pt_pcpu[cpu_id];
	cpu->ctx = NULL;
}

/*
 * hwt backend trace start operation for remote CPUs.
 */
static int
pt_backend_enable_smp(struct hwt_context *ctx)
{

	dprintf("%s\n", __func__);
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 1) != 0)
		return (-1);

	KASSERT(ctx->mode == HWT_MODE_CPU,
	    ("%s: should only be used for CPU mode", __func__));
	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);

	return (0);
}

/*
 * hwt backend trace stop operation for remote CPUs.
 */
static int
pt_backend_disable_smp(struct hwt_context *ctx)
{

	dprintf("%s\n", __func__);
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 0) == 0)
		return (-1);

	if (CPU_EMPTY(&ctx->cpu_map)) {
		dprintf("%s: empty cpu map\n", __func__);
		return (-1);
	}
	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);

	return (0);
}

/*
 * HWT backend initialization method.
 *
 * Initializes the tracing contexts used for HWT_MODE_CPU.
 */
static int
pt_backend_init(struct hwt_context *ctx)
{
	struct hwt_cpu *hwt_cpu;
	int error;

	dprintf("%s\n", __func__);
	if (ctx->mode == HWT_MODE_CPU) {
		TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
			error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
			    hwt_cpu->vm, hwt_cpu->cpu_id);
			if (error)
				return (error);
		}
	}

	return (0);
}

/*
 * HWT backend teardown method.
 *
 * Stops tracing on all active CPUs and releases all previously
 * allocated ToPA metadata.
 */
static int
pt_backend_deinit(struct hwt_context *ctx)
{
	struct pt_ctx *pt_ctx;
	struct hwt_thread *thr;
	int cpu_id;

	dprintf("%s\n", __func__);

	pt_backend_disable_smp(ctx);
	if (ctx->mode == HWT_MODE_THREAD) {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			KASSERT(thr->private != NULL,
			    ("%s: thr->private not set", __func__));
			pt_ctx = (struct pt_ctx *)thr->private;
			pt_deinit_ctx(pt_ctx);
		}
	} else {
		CPU_FOREACH(cpu_id) {
			if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
				continue;
			if (pt_pcpu[cpu_id].ctx != NULL) {
				KASSERT(pt_pcpu[cpu_id].ctx ==
				    &pt_pcpu_ctx[cpu_id],
				    ("%s: CPU mode tracing with non-CPU mode "
				    "PT context active", __func__));
				pt_pcpu[cpu_id].ctx = NULL;
			}
			pt_ctx = &pt_pcpu_ctx[cpu_id];
			pt_deinit_ctx(pt_ctx);
			memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
		}
	}

	return (0);
}

/*
 * Fetches current offset into the tracing buffer.
 */
static int
pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
    uint64_t *data)
{
	struct pt_buffer *buf;

	if (vm->ctx->mode == HWT_MODE_THREAD)
		buf = &((struct pt_ctx *)vm->thr->private)->buf;
	else
		buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
	mtx_lock_spin(&buf->lock);
	*curpage = buf->curpage;
	*curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
	mtx_unlock_spin(&buf->lock);

	return (0);
}

/*
 * HWT thread creation hook.
 * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
 */
static int
pt_backend_alloc_thread(struct hwt_thread *thr)
{
	struct pt_ctx *pt_ctx;
	int error;

	/*
	 * Omit M_WAITOK since this might get invoked in a
	 * non-sleepable context.
	 */
	pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
	if (pt_ctx == NULL)
		return (ENOMEM);

	error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
	if (error) {
		free(pt_ctx, M_PT);
		return (error);
	}

	thr->private = pt_ctx;
	return (0);
}

/*
 * HWT thread teardown hook.
 */
static void
pt_backend_free_thread(struct hwt_thread *thr)
{
	struct pt_ctx *ctx;

	ctx = (struct pt_ctx *)thr->private;

	pt_deinit_ctx(ctx);
	free(ctx, M_PT);
}

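/*
 * HWT backend dump method. Currently a no-op.
 */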
static void
pt_backend_dump(int cpu_id)
{
}

static struct hwt_backend_ops pt_ops = {
	.hwt_backend_init = pt_backend_init,
	.hwt_backend_deinit = pt_backend_deinit,

	.hwt_backend_configure = pt_backend_configure,

	.hwt_backend_enable = pt_backend_enable,
	.hwt_backend_disable = pt_backend_disable,

#ifdef SMP
	.hwt_backend_enable_smp = pt_backend_enable_smp,
	.hwt_backend_disable_smp = pt_backend_disable_smp,
#endif

	.hwt_backend_read = pt_backend_read,
	.hwt_backend_dump = pt_backend_dump,

	.hwt_backend_thread_alloc = pt_backend_alloc_thread,
	.hwt_backend_thread_free = pt_backend_free_thread,
};

static struct hwt_backend backend = {
	.ops = &pt_ops,
	.name = "pt",
	.kva_req = 1,
};

/*
 * Reads the latest valid trace buffer offset and enqueues
 * a HWT_RECORD_BUFFER record.
 * Used as a taskqueue routine from the ToPA interrupt handler.
 */
static void
pt_send_buffer_record(void *arg, int pending __unused)
{
	struct hwt_record_entry record;
	struct pt_ctx *ctx = (struct pt_ctx *)arg;

	/* Prepare buffer record. */
	mtx_lock_spin(&ctx->buf.lock);
	pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
	mtx_unlock_spin(&ctx->buf.lock);
	hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
}

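/*
 * Clears the ToPA PMI status bit by writing it to the
 * global status reset MSR.
 */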
static void
pt_topa_status_clear(void)
{
	uint64_t reg;

	reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
	reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI;
	reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
	wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
}

/*
 * ToPA PMI handler.
 *
 * Invoked every time a ToPA entry marked with TOPA_INT is filled.
 * Uses a taskqueue to enqueue a buffer record for userspace.
 * Re-enables the performance counter interrupt line as long as
 * tracing is active.
 */
static int
pt_topa_intr(struct trapframe *tf)
{
	struct pt_buffer *buf;
	struct pt_ctx *ctx;
	uint64_t reg;

	SDT_PROBE0(pt, , , topa__intr);

	if (pt_cpu_get_state(curcpu) != PT_ACTIVE)
		return (0);

	reg = rdmsr(MSR_IA_GLOBAL_STATUS);
	if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
		/* ACK spurious or leftover interrupt. */
		pt_topa_status_clear();
		return (1);
	}

	ctx = pt_pcpu[curcpu].ctx;
	buf = &ctx->buf;
	KASSERT(buf->topa_hw != NULL,
	    ("%s: ToPA PMI interrupt with invalid buffer", __func__));

	pt_cpu_toggle_local(ctx->save_area, false);
	pt_update_buffer(buf);
	pt_topa_status_clear();
	taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
	    TASKQUEUE_FAIL_IF_PENDING);

	if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
		pt_cpu_toggle_local(ctx->save_area, true);
		lapic_reenable_pcint();
	}
	return (1);
}

/*
 * Module initialization.
 *
 * Saves all PT-related cpuid info, registers itself as a HWT backend,
 * and allocates metadata required to keep track of tracing operations
 * on each CPU.
 */
static int
pt_init(void)
{
	u_int cp[4];
	int error;

	dprintf("pt: Enumerating part 1\n");
	cpuid_count(CPUID_PT_LEAF, 0, cp);
	dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
	dprintf("pt: ebx %x\n", cp[1]);
	dprintf("pt: ecx %x\n", cp[2]);

	pt_info.l0_eax = cp[0];
	pt_info.l0_ebx = cp[1];
	pt_info.l0_ecx = cp[2];

	dprintf("pt: Enumerating part 2\n");
	cpuid_count(CPUID_PT_LEAF, 1, cp);
	dprintf("pt: eax %x\n", cp[0]);
	dprintf("pt: ebx %x\n", cp[1]);

	pt_info.l1_eax = cp[0];
	pt_info.l1_ebx = cp[1];

	error = hwt_backend_register(&backend);
	if (error != 0) {
		printf("pt: unable to register hwt backend, error %d\n", error);
		return (error);
	}
	pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
	    M_ZERO | M_WAITOK);
	pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
	    M_ZERO | M_WAITOK);

	nmi_register_handler(pt_topa_intr);
	if (!lapic_enable_pcint()) {
		nmi_remove_handler(pt_topa_intr);
		hwt_backend_unregister(&backend);
		free(pt_pcpu, M_PT);
		free(pt_pcpu_ctx, M_PT);
		pt_pcpu = NULL;
		pt_pcpu_ctx = NULL;
		printf("pt: failed to set up interrupt line\n");
		return (ENXIO);
	}
	initialized = true;

	return (0);
}

/*
 * Checks whether the CPU supports Intel PT and
 * initializes XSAVE area info.
 *
 * The driver relies on XSAVE/XRSTOR PT extensions,
 * Table of Physical Addresses (ToPA) support, and
 * support for multiple ToPA entries.
 */
static bool
pt_supported(void)
{
	u_int cp[4];

	if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
		printf("pt: CPU does not support Intel Processor Trace\n");
		return (false);
	}
	if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
		printf("pt: XSAVE is not supported\n");
		return (false);
	}
	if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
		printf("pt: CPU does not support managing PT state using XSAVE\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
		printf("pt: XSAVE compaction is not supported\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
		printf("pt: CPU does not support XSAVES/XRSTORS\n");
		return (false);
	}

	/* Require ToPA support. */
	cpuid_count(CPUID_PT_LEAF, 0, cp);
	if ((cp[2] & CPUPT_TOPA) == 0) {
		printf("pt: ToPA is not supported\n");
		return (false);
	}
	if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
		printf("pt: multiple ToPA outputs are not supported\n");
		return (false);
	}

	pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
	pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
	pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
	    XFEATURE_ENABLED_PT, true, true);

	return (true);
}

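/*
 * Module teardown.
 *
 * Removes the ToPA NMI handler, disables the local APIC performance
 * counter interrupt, unregisters the backend, and frees per-CPU metadata.
 */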
static void
pt_deinit(void)
{
	if (!initialized)
		return;
	nmi_remove_handler(pt_topa_intr);
	lapic_disable_pcint();
	hwt_backend_unregister(&backend);
	free(pt_pcpu, M_PT);
	free(pt_pcpu_ctx, M_PT);
	pt_pcpu = NULL;
	pt_pcpu_ctx = NULL;
	initialized = false;
}

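/*
 * Module event handler: probes for PT support on load and
 * tears the driver down on unload.
 */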
static int
pt_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		if (!pt_supported() || pt_init() != 0) {
			return (ENXIO);
		}
		break;
	case MOD_UNLOAD:
		pt_deinit();
		break;
	default:
		break;
	}

	return (0);
}

static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };

DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
MODULE_VERSION(intel_pt, 1);