GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/si_sqtt.c

/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_build_pm4.h"
#include "si_compute.h"

#include "ac_rgp.h"
#include "ac_sqtt.h"
#include "util/u_memory.h"
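
/* SQ Thread Trace (SQTT) support for radeonsi.
 *
 * SQTT makes each shader engine stream fine-grained execution data into a
 * VRAM buffer, which is then parsed and written out as a capture for the
 * Radeon GPU Profiler (RGP) via ac_rgp. Rough flow, as wired up below:
 *  - si_init_thread_trace() allocates the trace BO and records reusable
 *    start/stop command buffers;
 *  - si_handle_thread_trace() is called once per frame and starts a capture
 *    on a frame-number or trigger-file event, then stops and dumps it;
 *  - si_get_thread_trace() maps the BO and collects the per-SE trace data.
 * The si_sqtt_*_marker helpers at the bottom embed RGP markers (draws,
 * barriers, user events, pipeline binds) into the trace stream.
 */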

static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable);

static bool
si_thread_trace_init_bo(struct si_context *sctx)
{
   unsigned max_se = sctx->screen->info.max_se;
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   sctx->thread_trace->buffer_size = align64(sctx->thread_trace->buffer_size,
                                             1u << SQTT_BUFFER_ALIGN_SHIFT);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_thread_trace_info) * max_se,
                  1 << SQTT_BUFFER_ALIGN_SHIFT);
   size += sctx->thread_trace->buffer_size * (uint64_t)max_se;

   sctx->thread_trace->bo =
      ws->buffer_create(ws, size, 4096,
                        RADEON_DOMAIN_VRAM,
                        RADEON_FLAG_NO_INTERPROCESS_SHARING |
                        RADEON_FLAG_GTT_WC |
                        RADEON_FLAG_NO_SUBALLOC);
   if (!sctx->thread_trace->bo)
      return false;

   return true;
}
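
/* Judging by the size computation above and the ac_thread_trace_get_info_va/
 * ac_thread_trace_get_data_va helpers, the BO holds one ac_thread_trace_info
 * struct per SE (as one aligned block), followed by one buffer_size-sized
 * data slice per SE.
 */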

static void
si_emit_thread_trace_start(struct si_context *sctx,
                           struct radeon_cmdbuf *cs,
                           uint32_t queue_family_index)
{
   struct si_screen *sscreen = sctx->screen;
   uint32_t shifted_size = sctx->thread_trace->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   unsigned max_se = sscreen->info.max_se;

   radeon_begin(cs);

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
      uint64_t data_va = ac_thread_trace_get_data_va(&sctx->screen->info, sctx->thread_trace, va, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;

      /* Target SEx and SH0. */
      radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) |
                             S_030800_SH_INDEX(0) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

      /* Select the first active CU. */
      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);

      if (sctx->chip_class >= GFX10) {
         /* Order seems important for the following 2 registers. */
         radeon_set_privileged_config_reg(cs, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
                                          S_008D04_SIZE(shifted_size) |
                                          S_008D04_BASE_HI(shifted_va >> 32));

         radeon_set_privileged_config_reg(cs, R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         int wgp = first_active_cu / 2;
         radeon_set_privileged_config_reg(cs, R_008D14_SQ_THREAD_TRACE_MASK,
                                          S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */
                                          S_008D14_SA_SEL(0) |
                                          S_008D14_WGP_SEL(wgp) |
                                          S_008D14_SIMD_SEL(0));

         radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
                                          S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC |
                                                               V_008D18_REG_INCLUDE_SHDEC |
                                                               V_008D18_REG_INCLUDE_GFXUDEC |
                                                               V_008D18_REG_INCLUDE_CONTEXT |
                                                               V_008D18_REG_INCLUDE_COMP |
                                                               V_008D18_REG_INCLUDE_CONFIG) |
                                          S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));

         /* Should be emitted last (it enables thread traces). */
         radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          S_008D1C_MODE(1) |
                                          S_008D1C_HIWATER(5) |
                                          S_008D1C_UTIL_TIMER(1) |
                                          S_008D1C_RT_FREQ(2) | /* 4096 clk */
                                          S_008D1C_DRAW_EVENT_EN(1) |
                                          S_008D1C_REG_STALL_EN(1) |
                                          S_008D1C_SPI_STALL_EN(1) |
                                          S_008D1C_SQ_STALL_EN(1) |
                                          S_008D1C_REG_DROP_ON_STALL(0) |
                                          S_008D1C_LOWATER_OFFSET(
                                             sctx->chip_class >= GFX10_3 ? 4 : 0));
      } else {
         /* Order seems important for the following 4 registers. */
         radeon_set_uconfig_reg(cs, R_030CDC_SQ_THREAD_TRACE_BASE2,
                                S_030CDC_ADDR_HI(shifted_va >> 32));

         radeon_set_uconfig_reg(cs, R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         radeon_set_uconfig_reg(cs, R_030CC4_SQ_THREAD_TRACE_SIZE,
                                S_030CC4_SIZE(shifted_size));

         radeon_set_uconfig_reg(cs, R_030CD4_SQ_THREAD_TRACE_CTRL,
                                S_030CD4_RESET_BUFFER(1));

         uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) |
                                      S_030CC8_SH_SEL(0) |
                                      S_030CC8_SIMD_EN(0xf) |
                                      S_030CC8_VM_ID_MASK(0) |
                                      S_030CC8_REG_STALL_EN(1) |
                                      S_030CC8_SPI_STALL_EN(1) |
                                      S_030CC8_SQ_STALL_EN(1);

         radeon_set_uconfig_reg(cs, R_030CC8_SQ_THREAD_TRACE_MASK,
                                thread_trace_mask);

         /* Trace all tokens and registers. */
         radeon_set_uconfig_reg(cs, R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
                                S_030CCC_TOKEN_MASK(0xbfff) |
                                S_030CCC_REG_MASK(0xff) |
                                S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         radeon_set_uconfig_reg(cs, R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                                S_030CD0_SH0_MASK(0xffff) |
                                S_030CD0_SH1_MASK(0xffff));

         radeon_set_uconfig_reg(cs, R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         radeon_set_uconfig_reg(cs, R_030CEC_SQ_THREAD_TRACE_HIWATER,
                                S_030CEC_HIWATER(4));

         if (sctx->chip_class == GFX9) {
            /* Reset thread trace status errors. */
            radeon_set_uconfig_reg(cs, R_030CE8_SQ_THREAD_TRACE_STATUS,
                                   S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t thread_trace_mode =
            S_030CD8_MASK_PS(1) |
            S_030CD8_MASK_VS(1) |
            S_030CD8_MASK_GS(1) |
            S_030CD8_MASK_ES(1) |
            S_030CD8_MASK_HS(1) |
            S_030CD8_MASK_LS(1) |
            S_030CD8_MASK_CS(1) |
            S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
            S_030CD8_MODE(1);

         if (sctx->chip_class == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            thread_trace_mode |= S_030CD8_TC_PERF_EN(1);
         }

         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE,
                                thread_trace_mode);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (queue_family_index == RING_COMPUTE) {
      radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
   radeon_end();
}

static const uint32_t gfx9_thread_trace_info_regs[] =
{
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_thread_trace_info_regs[] =
{
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};
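
/* These are the registers copied back into memory after a trace stops: the
 * write pointer, the status, and a (dropped-)counter register. Their order
 * here presumably mirrors the first dwords of struct ac_thread_trace_info,
 * since si_copy_thread_trace_info_regs() below writes them to the info VA
 * one DWORD at a time.
 */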

static void
si_copy_thread_trace_info_regs(struct si_context *sctx,
                               struct radeon_cmdbuf *cs,
                               unsigned se_index)
{
   const uint32_t *thread_trace_info_regs = NULL;

   switch (sctx->chip_class) {
   case GFX10_3:
   case GFX10:
      thread_trace_info_regs = gfx10_thread_trace_info_regs;
      break;
   case GFX9:
      thread_trace_info_regs = gfx9_thread_trace_info_regs;
      break;
   default:
      unreachable("Unsupported chip_class");
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
   uint64_t info_va = ac_thread_trace_get_info_va(va, se_index);

   radeon_begin(cs);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
                      COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
                      COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, thread_trace_info_regs[i] >> 2);
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, (info_va + i * 4));
      radeon_emit(cs, (info_va + i * 4) >> 32);
   }
   radeon_end();
}
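
/* Note on the COPY_DATA packet above: COPY_DATA_SRC_SEL(COPY_DATA_PERF) makes
 * the CP read a perf/privileged register, and COPY_DATA_WR_CONFIRM makes it
 * wait for each write to land, so the info struct is coherent in memory
 * before the driver maps and reads it back.
 */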

static void
si_emit_thread_trace_stop(struct si_context *sctx,
                          struct radeon_cmdbuf *cs,
                          uint32_t queue_family_index)
{
   unsigned max_se = sctx->screen->info.max_se;

   radeon_begin(cs);

   /* Stop the thread trace with a different event based on the queue. */
   if (queue_family_index == RING_COMPUTE) {
      radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
   radeon_end();

   for (unsigned se = 0; se < max_se; se++) {
      radeon_begin(cs);

      /* Target SEi and SH0. */
      radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) |
                             S_030800_SH_INDEX(0) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (sctx->chip_class >= GFX10) {
         /* Make sure to wait for the trace buffer. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs from the reference value */
         radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0); /* reference value */
         radeon_emit(cs, S_008D20_FINISH_DONE(1)); /* mask */
         radeon_emit(cs, 4); /* poll interval */

         /* Disable the thread trace mode. */
         radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          S_008D1C_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0); /* reference value */
         radeon_emit(cs, S_008D20_BUSY(1)); /* mask */
         radeon_emit(cs, 4); /* poll interval */
      } else {
         /* Disable the thread trace mode. */
         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE,
                                S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(cs, R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0); /* reference value */
         radeon_emit(cs, S_030CE8_BUSY(1)); /* mask */
         radeon_emit(cs, 4); /* poll interval */
      }
      radeon_end();

      si_copy_thread_trace_info_regs(sctx, cs, se);
   }

   /* Restore global broadcasting. */
   radeon_begin_again(cs);
   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));
   radeon_end();
}
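
/* Summary of the stop sequence above: emit the stop/finish events, then for
 * each SE (on GFX10+) wait for FINISH_DONE, disable the trace, wait for BUSY
 * to clear, and only then copy the info registers back, so the WPTR/STATUS
 * values read by si_get_thread_trace() are final.
 */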

static void
si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;

   radeon_begin(cs);

   switch (family) {
   case RING_GFX:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RING_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   }
   radeon_end();

   ws->cs_add_buffer(cs,
                     sctx->thread_trace->bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM,
                     0);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before starting SQTT. */
   sctx->flags |=
      SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
      SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_inhibit_clockgating(sctx, cs, true);

   /* Enable SQG events that collect thread trace data. */
   si_emit_spi_config_cntl(sctx, cs, true);

   si_emit_thread_trace_start(sctx, cs, family);
}
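
/* The start/stop IBs each begin with a PKT3_CONTEXT_CONTROL preamble (gfx)
 * or a padding NOP (compute), presumably because they are flushed as
 * standalone IBs by si_begin_thread_trace/si_end_thread_trace rather than
 * being appended to the application's command stream.
 */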

static void
si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;

   radeon_begin(cs);

   switch (family) {
   case RING_GFX:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RING_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   }
   radeon_end();

   ws->cs_add_buffer(cs,
                     sctx->thread_trace->bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM,
                     0);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before stopping SQTT. */
   sctx->flags |=
      SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
      SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_emit_thread_trace_stop(sctx, cs, family);

   /* Restore previous state by disabling SQG events. */
   si_emit_spi_config_cntl(sctx, cs, false);

   si_inhibit_clockgating(sctx, cs, false);
}

static void
si_thread_trace_init_cs(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;

   /* Thread trace start CS (only handles RING_GFX). */
   sctx->thread_trace->start_cs[RING_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
   if (!ws->cs_create(sctx->thread_trace->start_cs[RING_GFX],
                      sctx->ctx, RING_GFX, NULL, NULL, 0)) {
      free(sctx->thread_trace->start_cs[RING_GFX]);
      sctx->thread_trace->start_cs[RING_GFX] = NULL;
      return;
   }

   si_thread_trace_start(sctx, RING_GFX, sctx->thread_trace->start_cs[RING_GFX]);

   /* Thread trace stop CS. */
   sctx->thread_trace->stop_cs[RING_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
   if (!ws->cs_create(sctx->thread_trace->stop_cs[RING_GFX],
                      sctx->ctx, RING_GFX, NULL, NULL, 0)) {
      free(sctx->thread_trace->start_cs[RING_GFX]);
      sctx->thread_trace->start_cs[RING_GFX] = NULL;
      free(sctx->thread_trace->stop_cs[RING_GFX]);
      sctx->thread_trace->stop_cs[RING_GFX] = NULL;
      return;
   }

   si_thread_trace_stop(sctx, RING_GFX, sctx->thread_trace->stop_cs[RING_GFX]);
}
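
/* Both IBs are recorded once here and then replayed for every capture by
 * si_begin_thread_trace/si_end_thread_trace below; nothing frame-specific
 * is baked into them besides the trace BO addresses.
 */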

static void
si_begin_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->thread_trace->start_cs[RING_GFX];
   sctx->ws->cs_flush(cs, 0, NULL);
}

static void
si_end_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->thread_trace->stop_cs[RING_GFX];
   sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);
}
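
/* The stop flush stores its fence in last_sqtt_fence so that
 * si_handle_thread_trace() can wait for the stop IB (and thus the final
 * SQTT writes) to complete before mapping the buffer.
 */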

static bool
si_get_thread_trace(struct si_context *sctx,
                    struct ac_thread_trace *thread_trace)
{
   unsigned max_se = sctx->screen->info.max_se;

   memset(thread_trace, 0, sizeof(*thread_trace));
   thread_trace->num_traces = max_se;

   sctx->thread_trace->ptr = sctx->ws->buffer_map(sctx->ws, sctx->thread_trace->bo,
                                                  NULL,
                                                  PIPE_MAP_READ);

   if (!sctx->thread_trace->ptr)
      return false;

   void *thread_trace_ptr = sctx->thread_trace->ptr;

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t info_offset = ac_thread_trace_get_info_offset(se);
      uint64_t data_offset = ac_thread_trace_get_data_offset(&sctx->screen->info, sctx->thread_trace, se);
      void *info_ptr = thread_trace_ptr + info_offset;
      void *data_ptr = thread_trace_ptr + data_offset;
      struct ac_thread_trace_info *info =
         (struct ac_thread_trace_info *)info_ptr;

      struct ac_thread_trace_se thread_trace_se = {0};

      if (!ac_is_thread_trace_complete(&sctx->screen->info, sctx->thread_trace, info)) {
         uint32_t expected_size =
            ac_get_expected_buffer_size(&sctx->screen->info, info);
         uint32_t available_size = (info->cur_offset * 32) / 1024;

         fprintf(stderr, "Failed to get the thread trace "
                         "because the buffer is too small. The "
                         "hardware needs %d KB but the "
                         "buffer size is %d KB.\n",
                 expected_size, available_size);
         fprintf(stderr, "Please update the buffer size with "
                         "AMD_THREAD_TRACE_BUFFER_SIZE=<size_in_kbytes>\n");
         return false;
      }

      thread_trace_se.data_ptr = data_ptr;
      thread_trace_se.info = *info;
      thread_trace_se.shader_engine = se;

      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);

      /* For GFX10+ compute_unit really means WGP */
      thread_trace_se.compute_unit =
         sctx->screen->info.chip_class >= GFX10 ? (first_active_cu / 2) : first_active_cu;

      thread_trace->traces[se] = thread_trace_se;
   }

   thread_trace->data = sctx->thread_trace;
   return true;
}
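
/* The "available size" math above implies info->cur_offset counts 32-byte
 * units, hence (cur_offset * 32) / 1024 to report KB in the error message.
 */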

bool
si_init_thread_trace(struct si_context *sctx)
{
   static bool warn_once = true;
   if (warn_once) {
      fprintf(stderr, "*************************************************\n");
      fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
      fprintf(stderr, "*************************************************\n");
      warn_once = false;
   }

   sctx->thread_trace = CALLOC_STRUCT(ac_thread_trace_data);

   if (sctx->chip_class < GFX8) {
      fprintf(stderr, "GPU hardware not supported: refer to "
                      "the RGP documentation for the list of "
                      "supported GPUs!\n");
      return false;
   }

   if (sctx->chip_class > GFX10_3) {
      fprintf(stderr, "radeonsi: Thread trace is not supported "
                      "for that GPU!\n");
      return false;
   }

   /* Default buffer size set to 1MB per SE. */
   sctx->thread_trace->buffer_size = debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 1024) * 1024;
   sctx->thread_trace->start_frame = 10;

   const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
   if (trigger) {
      sctx->thread_trace->start_frame = atoi(trigger);
      if (sctx->thread_trace->start_frame <= 0) {
         /* This isn't a frame number, must be a file. */
         sctx->thread_trace->trigger_file = strdup(trigger);
         sctx->thread_trace->start_frame = -1;
      }
   }

   if (!si_thread_trace_init_bo(sctx))
      return false;

   list_inithead(&sctx->thread_trace->rgp_pso_correlation.record);
   simple_mtx_init(&sctx->thread_trace->rgp_pso_correlation.lock, mtx_plain);

   list_inithead(&sctx->thread_trace->rgp_loader_events.record);
   simple_mtx_init(&sctx->thread_trace->rgp_loader_events.lock, mtx_plain);

   list_inithead(&sctx->thread_trace->rgp_code_object.record);
   simple_mtx_init(&sctx->thread_trace->rgp_code_object.lock, mtx_plain);

   si_thread_trace_init_cs(sctx);

   sctx->sqtt_next_event = EventInvalid;

   return true;
}
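
/* Example environment setups (buffer size is in KB, per SE):
 *
 *   AMD_THREAD_TRACE_BUFFER_SIZE=32768 AMD_THREAD_TRACE_TRIGGER=150 <app>
 *      32 MB per SE, capture starting at frame 150.
 *
 *   AMD_THREAD_TRACE_TRIGGER=/tmp/sqtt_trigger <app>
 *      capture whenever that file is created (the file is removed on use,
 *      see si_handle_thread_trace).
 *
 * Any trigger value whose atoi() is <= 0 is treated as a trigger-file path.
 */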

void
si_destroy_thread_trace(struct si_context *sctx)
{
   struct si_screen *sscreen = sctx->screen;
   struct pb_buffer *bo = sctx->thread_trace->bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   if (sctx->thread_trace->trigger_file)
      free(sctx->thread_trace->trigger_file);

   sscreen->ws->cs_destroy(sctx->thread_trace->start_cs[RING_GFX]);
   sscreen->ws->cs_destroy(sctx->thread_trace->stop_cs[RING_GFX]);

   struct rgp_pso_correlation *pso_correlation = &sctx->thread_trace->rgp_pso_correlation;
   struct rgp_loader_events *loader_events = &sctx->thread_trace->rgp_loader_events;
   struct rgp_code_object *code_object = &sctx->thread_trace->rgp_code_object;
   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
                            &pso_correlation->record, list) {
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_pso_correlation.lock);

   list_for_each_entry_safe(struct rgp_loader_events_record, record,
                            &loader_events->record, list) {
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_loader_events.lock);

   list_for_each_entry_safe(struct rgp_code_object_record, record,
                            &code_object->record, list) {
      uint32_t mask = record->shader_stages_mask;
      int i;

      /* Free the disassembly. */
      while (mask) {
         i = u_bit_scan(&mask);
         free(record->shader_data[i].code);
      }
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_code_object.lock);

   free(sctx->thread_trace);
   sctx->thread_trace = NULL;
}

static uint64_t num_frames = 0;

void
si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   /* Should we enable SQTT yet? */
   if (!sctx->thread_trace_enabled) {
      bool frame_trigger = num_frames == sctx->thread_trace->start_frame;
      bool file_trigger = false;
      if (sctx->thread_trace->trigger_file &&
          access(sctx->thread_trace->trigger_file, W_OK) == 0) {
         if (unlink(sctx->thread_trace->trigger_file) == 0) {
            file_trigger = true;
         } else {
            /* Do not enable tracing if we cannot remove the file,
             * because by then we'll trace every frame.
             */
            fprintf(stderr, "radeonsi: could not remove thread trace trigger file, ignoring\n");
         }
      }

      if (frame_trigger || file_trigger) {
         /* Wait for last submission */
         sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence, PIPE_TIMEOUT_INFINITE);

         /* Start SQTT */
         si_begin_thread_trace(sctx, rcs);

         sctx->thread_trace_enabled = true;
         sctx->thread_trace->start_frame = -1;

         /* Force shader update to make sure si_sqtt_describe_pipeline_bind is called
          * for the current "pipeline".
          */
         sctx->do_update_shaders = true;
      }
   } else {
      struct ac_thread_trace thread_trace = {0};

      /* Stop SQTT */
      si_end_thread_trace(sctx, rcs);
      sctx->thread_trace_enabled = false;
      sctx->thread_trace->start_frame = -1;
      assert(sctx->last_sqtt_fence);

      /* Wait for SQTT to finish and read back the bo */
      if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence, PIPE_TIMEOUT_INFINITE) &&
          si_get_thread_trace(sctx, &thread_trace)) {
         ac_dump_rgp_capture(&sctx->screen->info, &thread_trace);
      } else {
         fprintf(stderr, "Failed to read the trace\n");
      }
   }

   num_frames++;
}

static void
si_emit_thread_trace_userdata(struct si_context *sctx,
                              struct radeon_cmdbuf *cs,
                              const void *data, uint32_t num_dwords)
{
   const uint32_t *dwords = (uint32_t *)data;

   radeon_begin(cs);

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      /* Without the perfctr bit the CP might not always pass the
       * write on correctly. */
      radeon_set_uconfig_reg_seq(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count, sctx->chip_class >= GFX10);

      radeon_emit_array(cs, dwords, count);

      dwords += count;
      num_dwords -= count;
   }
   radeon_end();
}
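
/* Markers are smuggled into the trace as writes to the
 * SQ_THREAD_TRACE_USERDATA_2/3 registers: SQTT records such register writes
 * as tokens that RGP later decodes. The loop above emits two dwords per
 * packet because USERDATA_2 and USERDATA_3 are consecutive registers.
 */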

static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable)
{
   radeon_begin(cs);

   if (sctx->chip_class >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
                                 S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
                                 S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (sctx->chip_class >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(cs, R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(cs, R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
                                       S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
   radeon_end();
}

static uint32_t num_events = 0;

void
si_sqtt_write_event_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                           enum rgp_sqtt_marker_event_type api_type,
                           uint32_t vertex_offset_user_data,
                           uint32_t instance_offset_user_data,
                           uint32_t draw_index_user_data)
{
   struct rgp_sqtt_marker_event marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.api_type = api_type == EventInvalid ? EventCmdDraw : api_type;
   marker.cmd_id = num_events++;
   marker.cb_id = 0;

   if (vertex_offset_user_data == UINT_MAX ||
       instance_offset_user_data == UINT_MAX) {
      vertex_offset_user_data = 0;
      instance_offset_user_data = 0;
   }

   if (draw_index_user_data == UINT_MAX)
      draw_index_user_data = vertex_offset_user_data;

   marker.vertex_offset_reg_idx = vertex_offset_user_data;
   marker.instance_offset_reg_idx = instance_offset_user_data;
   marker.draw_index_reg_idx = draw_index_user_data;

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);

   sctx->sqtt_next_event = EventInvalid;
}

void
si_write_event_with_dims_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                enum rgp_sqtt_marker_event_type api_type,
                                uint32_t x, uint32_t y, uint32_t z)
{
   struct rgp_sqtt_marker_event_with_dims marker = {0};

   marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.event.api_type = api_type;
   marker.event.cmd_id = num_events++;
   marker.event.cb_id = 0;
   marker.event.has_thread_dims = 1;

   marker.thread_x = x;
   marker.thread_y = y;
   marker.thread_z = z;

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   sctx->sqtt_next_event = EventInvalid;
}

void
si_sqtt_describe_barrier_start(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct rgp_sqtt_marker_barrier_start marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
   marker.cb_id = 0;
   marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void
si_sqtt_describe_barrier_end(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                             unsigned flags)
{
   struct rgp_sqtt_marker_barrier_end marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;
   marker.cb_id = 0;

   if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH)
      marker.vs_partial_flush = true;
   if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH)
      marker.ps_partial_flush = true;
   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH)
      marker.cs_partial_flush = true;

   if (flags & SI_CONTEXT_PFP_SYNC_ME)
      marker.pfp_sync_me = true;

   if (flags & SI_CONTEXT_INV_VCACHE)
      marker.inval_tcp = true;
   if (flags & SI_CONTEXT_INV_ICACHE)
      marker.inval_sqI = true;
   if (flags & SI_CONTEXT_INV_SCACHE)
      marker.inval_sqK = true;
   if (flags & SI_CONTEXT_INV_L2)
      marker.inval_tcc = true;

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
      marker.inval_cb = true;
      marker.flush_cb = true;
   }
   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
      marker.inval_db = true;
      marker.flush_db = true;
   }

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void
si_write_user_event(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                    enum rgp_sqtt_marker_user_event_type type,
                    const char *str, int len)
{
   if (type == UserEventPop) {
      assert(str == NULL);
      struct rgp_sqtt_marker_user_event marker = { 0 };
      marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.data_type = type;

      si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   } else {
      assert(str != NULL);
      struct rgp_sqtt_marker_user_event_with_length marker = { 0 };
      marker.user_event.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.user_event.data_type = type;
      len = MIN2(1024, len);
      marker.length = align(len, 4);

      uint8_t *buffer = alloca(sizeof(marker) + marker.length);
      memcpy(buffer, &marker, sizeof(marker));
      memcpy(buffer + sizeof(marker), str, len);
      buffer[sizeof(marker) + len - 1] = '\0';

      si_emit_thread_trace_userdata(sctx, rcs, buffer, sizeof(marker) / 4 + marker.length / 4);
   }
}

bool
si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data,
                               uint64_t pipeline_hash)
{
   simple_mtx_lock(&thread_trace_data->rgp_pso_correlation.lock);
   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
                            &thread_trace_data->rgp_pso_correlation.record, list) {
      if (record->pipeline_hash[0] == pipeline_hash) {
         simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);
         return true;
      }
   }
   simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);

   return false;
}

static enum rgp_hardware_stages
si_sqtt_pipe_to_rgp_shader_stage(struct si_shader_key *key, enum pipe_shader_type stage)
{
   switch (stage) {
   case PIPE_SHADER_VERTEX:
      if (key->as_ls)
         return RGP_HW_STAGE_LS;
      else if (key->as_es)
         return RGP_HW_STAGE_ES;
      else if (key->as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_TESS_CTRL:
      return RGP_HW_STAGE_HS;
   case PIPE_SHADER_TESS_EVAL:
      if (key->as_es)
         return RGP_HW_STAGE_ES;
      else if (key->as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_GEOMETRY:
      return RGP_HW_STAGE_GS;
   case PIPE_SHADER_FRAGMENT:
      return RGP_HW_STAGE_PS;
   case PIPE_SHADER_COMPUTE:
      return RGP_HW_STAGE_CS;
   default:
      unreachable("invalid mesa shader stage");
   }
}
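
/* This mapping reflects where radeonsi actually places API stages on the
 * hardware: as_ls/as_es mark the first half of merged LS-HS / ES-GS shaders,
 * and NGG pipelines (as_ngg) run vertex and tess-eval work on the HW GS
 * stage, which is why those cases report RGP_HW_STAGE_GS.
 */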

static bool
si_sqtt_add_code_object(struct si_context *sctx,
                        uint64_t pipeline_hash,
                        bool is_compute)
{
   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
   struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object;
   struct rgp_code_object_record *record;

   record = malloc(sizeof(struct rgp_code_object_record));
   if (!record)
      return false;

   record->shader_stages_mask = 0;
   record->num_shaders_combined = 0;
   record->pipeline_hash[0] = pipeline_hash;
   record->pipeline_hash[1] = pipeline_hash;

   for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) {
      struct si_shader *shader;
      enum rgp_hardware_stages hw_stage;

      if (is_compute) {
         if (i != PIPE_SHADER_COMPUTE)
            continue;
         shader = &sctx->cs_shader_state.program->shader;
         hw_stage = RGP_HW_STAGE_CS;
      } else if (i != PIPE_SHADER_COMPUTE) {
         if (!sctx->shaders[i].cso || !sctx->shaders[i].current)
            continue;
         shader = sctx->shaders[i].current;
         hw_stage = si_sqtt_pipe_to_rgp_shader_stage(&shader->key, i);
      } else {
         continue;
      }

      uint8_t *code = malloc(shader->binary.uploaded_code_size);
      if (!code) {
         free(record);
         return false;
      }
      memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);

      uint64_t va = shader->bo->gpu_address;
      record->shader_data[i].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
      record->shader_data[i].hash[1] = record->shader_data[i].hash[0];
      record->shader_data[i].code_size = shader->binary.uploaded_code_size;
      record->shader_data[i].code = code;
      record->shader_data[i].vgpr_count = shader->config.num_vgprs;
      record->shader_data[i].sgpr_count = shader->config.num_sgprs;
      record->shader_data[i].base_address = va & 0xffffffffffff;
      record->shader_data[i].elf_symbol_offset = 0;
      record->shader_data[i].hw_stage = hw_stage;
      record->shader_data[i].is_combined = false;

      record->shader_stages_mask |= (1 << i);
      record->num_shaders_combined++;
   }

   simple_mtx_lock(&code_object->lock);
   list_addtail(&record->list, &code_object->record);
   code_object->record_count++;
   simple_mtx_unlock(&code_object->lock);

   return true;
}

bool
si_sqtt_register_pipeline(struct si_context *sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute)
{
   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;

   assert(!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_hash));

   bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline_hash);
   if (!result)
      return false;

   result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline_hash, base_address);
   if (!result)
      return false;

   return si_sqtt_add_code_object(sctx, pipeline_hash, is_compute);
}

void
si_sqtt_describe_pipeline_bind(struct si_context *sctx,
                               uint64_t pipeline_hash,
                               int bind_point)
{
   struct rgp_sqtt_marker_pipeline_bind marker = {0};
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   if (likely(!sctx->thread_trace_enabled)) {
      return;
   }

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;
   marker.cb_id = 0;
   marker.bind_point = bind_point;
   marker.api_pso_hash[0] = pipeline_hash;
   marker.api_pso_hash[1] = pipeline_hash >> 32;

   si_emit_thread_trace_userdata(sctx, cs, &marker, sizeof(marker) / 4);
}