Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/si_sqtt.c
4570 views
/*1* Copyright 2020 Advanced Micro Devices, Inc.2* All Rights Reserved.3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* on the rights to use, copy, modify, merge, publish, distribute, sub8* license, and/or sell copies of the Software, and to permit persons to whom9* the Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL18* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,19* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR20* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE21* USE OR OTHER DEALINGS IN THE SOFTWARE.22*23*/242526#include "si_pipe.h"27#include "si_build_pm4.h"28#include "si_compute.h"2930#include "ac_rgp.h"31#include "ac_sqtt.h"32#include "util/u_memory.h"3334static void35si_emit_spi_config_cntl(struct si_context* sctx,36struct radeon_cmdbuf *cs, bool enable);3738static bool39si_thread_trace_init_bo(struct si_context *sctx)40{41unsigned max_se = sctx->screen->info.max_se;42struct radeon_winsys *ws = sctx->ws;43uint64_t size;4445/* The buffer size and address need to be aligned in HW regs. Align the46* size as early as possible so that we do all the allocation & addressing47* correctly. */48sctx->thread_trace->buffer_size = align64(sctx->thread_trace->buffer_size,491u << SQTT_BUFFER_ALIGN_SHIFT);5051/* Compute total size of the thread trace BO for all SEs. */52size = align64(sizeof(struct ac_thread_trace_info) * max_se,531 << SQTT_BUFFER_ALIGN_SHIFT);54size += sctx->thread_trace->buffer_size * (uint64_t)max_se;5556sctx->thread_trace->bo =57ws->buffer_create(ws, size, 4096,58RADEON_DOMAIN_VRAM,59RADEON_FLAG_NO_INTERPROCESS_SHARING |60RADEON_FLAG_GTT_WC |61RADEON_FLAG_NO_SUBALLOC);62if (!sctx->thread_trace->bo)63return false;6465return true;66}6768static void69si_emit_thread_trace_start(struct si_context* sctx,70struct radeon_cmdbuf *cs,71uint32_t queue_family_index)72{73struct si_screen *sscreen = sctx->screen;74uint32_t shifted_size = sctx->thread_trace->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;75unsigned max_se = sscreen->info.max_se;7677radeon_begin(cs);7879for (unsigned se = 0; se < max_se; se++) {80uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);81uint64_t data_va = ac_thread_trace_get_data_va(&sctx->screen->info, sctx->thread_trace, va, se);82uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;8384/* Target SEx and SH0. */85radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,86S_030800_SE_INDEX(se) |87S_030800_SH_INDEX(0) |88S_030800_INSTANCE_BROADCAST_WRITES(1));8990/* Select the first active CUs */91int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);9293if (sctx->chip_class >= GFX10) {94/* Order seems important for the following 2 registers. */95radeon_set_privileged_config_reg(cs, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,96S_008D04_SIZE(shifted_size) |97S_008D04_BASE_HI(shifted_va >> 32));9899radeon_set_privileged_config_reg(cs, R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);100101int wgp = first_active_cu / 2;102radeon_set_privileged_config_reg(cs, R_008D14_SQ_THREAD_TRACE_MASK,103S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */104S_008D14_SA_SEL(0) |105S_008D14_WGP_SEL(wgp) |106S_008D14_SIMD_SEL(0));107108radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,109S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC |110V_008D18_REG_INCLUDE_SHDEC |111V_008D18_REG_INCLUDE_GFXUDEC |112V_008D18_REG_INCLUDE_CONTEXT |113V_008D18_REG_INCLUDE_COMP |114V_008D18_REG_INCLUDE_CONFIG) |115S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));116117/* Should be emitted last (it enables thread traces). */118radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,119S_008D1C_MODE(1) |120S_008D1C_HIWATER(5) |121S_008D1C_UTIL_TIMER(1) |122S_008D1C_RT_FREQ(2) | /* 4096 clk */123S_008D1C_DRAW_EVENT_EN(1) |124S_008D1C_REG_STALL_EN(1) |125S_008D1C_SPI_STALL_EN(1) |126S_008D1C_SQ_STALL_EN(1) |127S_008D1C_REG_DROP_ON_STALL(0) |128S_008D1C_LOWATER_OFFSET(129sctx->chip_class >= GFX10_3 ? 4 : 0));130} else {131/* Order seems important for the following 4 registers. */132radeon_set_uconfig_reg(cs, R_030CDC_SQ_THREAD_TRACE_BASE2,133S_030CDC_ADDR_HI(shifted_va >> 32));134135radeon_set_uconfig_reg(cs, R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);136137radeon_set_uconfig_reg(cs, R_030CC4_SQ_THREAD_TRACE_SIZE,138S_030CC4_SIZE(shifted_size));139140radeon_set_uconfig_reg(cs, R_030CD4_SQ_THREAD_TRACE_CTRL,141S_030CD4_RESET_BUFFER(1));142143uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) |144S_030CC8_SH_SEL(0) |145S_030CC8_SIMD_EN(0xf) |146S_030CC8_VM_ID_MASK(0) |147S_030CC8_REG_STALL_EN(1) |148S_030CC8_SPI_STALL_EN(1) |149S_030CC8_SQ_STALL_EN(1);150151radeon_set_uconfig_reg(cs, R_030CC8_SQ_THREAD_TRACE_MASK,152thread_trace_mask);153154/* Trace all tokens and registers. */155radeon_set_uconfig_reg(cs, R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,156S_030CCC_TOKEN_MASK(0xbfff) |157S_030CCC_REG_MASK(0xff) |158S_030CCC_REG_DROP_ON_STALL(0));159160/* Enable SQTT perf counters for all CUs. */161radeon_set_uconfig_reg(cs, R_030CD0_SQ_THREAD_TRACE_PERF_MASK,162S_030CD0_SH0_MASK(0xffff) |163S_030CD0_SH1_MASK(0xffff));164165radeon_set_uconfig_reg(cs, R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);166167radeon_set_uconfig_reg(cs, R_030CEC_SQ_THREAD_TRACE_HIWATER,168S_030CEC_HIWATER(4));169170if (sctx->chip_class == GFX9) {171/* Reset thread trace status errors. */172radeon_set_uconfig_reg(cs, R_030CE8_SQ_THREAD_TRACE_STATUS,173S_030CE8_UTC_ERROR(0));174}175176/* Enable the thread trace mode. */177uint32_t thread_trace_mode =178S_030CD8_MASK_PS(1) |179S_030CD8_MASK_VS(1) |180S_030CD8_MASK_GS(1) |181S_030CD8_MASK_ES(1) |182S_030CD8_MASK_HS(1) |183S_030CD8_MASK_LS(1) |184S_030CD8_MASK_CS(1) |185S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */186S_030CD8_MODE(1);187188if (sctx->chip_class == GFX9) {189/* Count SQTT traffic in TCC perf counters. */190thread_trace_mode |= S_030CD8_TC_PERF_EN(1);191}192193radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE,194thread_trace_mode);195}196}197198/* Restore global broadcasting. */199radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,200S_030800_SE_BROADCAST_WRITES(1) |201S_030800_SH_BROADCAST_WRITES(1) |202S_030800_INSTANCE_BROADCAST_WRITES(1));203204/* Start the thread trace with a different event based on the queue. */205if (queue_family_index == RING_COMPUTE) {206radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE,207S_00B878_THREAD_TRACE_ENABLE(1));208} else {209radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));210radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));211}212radeon_end();213}214215static const uint32_t gfx9_thread_trace_info_regs[] =216{217R_030CE4_SQ_THREAD_TRACE_WPTR,218R_030CE8_SQ_THREAD_TRACE_STATUS,219R_030CF0_SQ_THREAD_TRACE_CNTR,220};221222static const uint32_t gfx10_thread_trace_info_regs[] =223{224R_008D10_SQ_THREAD_TRACE_WPTR,225R_008D20_SQ_THREAD_TRACE_STATUS,226R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,227};228229static void230si_copy_thread_trace_info_regs(struct si_context* sctx,231struct radeon_cmdbuf *cs,232unsigned se_index)233{234const uint32_t *thread_trace_info_regs = NULL;235236switch (sctx->chip_class) {237case GFX10_3:238case GFX10:239thread_trace_info_regs = gfx10_thread_trace_info_regs;240break;241case GFX9:242thread_trace_info_regs = gfx9_thread_trace_info_regs;243break;244default:245unreachable("Unsupported chip_class");246}247248/* Get the VA where the info struct is stored for this SE. */249uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);250uint64_t info_va = ac_thread_trace_get_info_va(va, se_index);251252radeon_begin(cs);253254/* Copy back the info struct one DWORD at a time. */255for (unsigned i = 0; i < 3; i++) {256radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));257radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |258COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |259COPY_DATA_WR_CONFIRM);260radeon_emit(cs, thread_trace_info_regs[i] >> 2);261radeon_emit(cs, 0); /* unused */262radeon_emit(cs, (info_va + i * 4));263radeon_emit(cs, (info_va + i * 4) >> 32);264}265radeon_end();266}267268269270static void271si_emit_thread_trace_stop(struct si_context *sctx,272struct radeon_cmdbuf *cs,273uint32_t queue_family_index)274{275unsigned max_se = sctx->screen->info.max_se;276277radeon_begin(cs);278279/* Stop the thread trace with a different event based on the queue. */280if (queue_family_index == RING_COMPUTE) {281radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE,282S_00B878_THREAD_TRACE_ENABLE(0));283} else {284radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));285radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));286}287288radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));289radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));290radeon_end();291292for (unsigned se = 0; se < max_se; se++) {293radeon_begin(cs);294295/* Target SEi and SH0. */296radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,297S_030800_SE_INDEX(se) |298S_030800_SH_INDEX(0) |299S_030800_INSTANCE_BROADCAST_WRITES(1));300301if (sctx->chip_class >= GFX10) {302/* Make sure to wait for the trace buffer. */303radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));304radeon_emit(cs, WAIT_REG_MEM_NOT_EQUAL); /* wait until the register is equal to the reference value */305radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */306radeon_emit(cs, 0);307radeon_emit(cs, 0); /* reference value */308radeon_emit(cs, S_008D20_FINISH_DONE(1)); /* mask */309radeon_emit(cs, 4); /* poll interval */310311/* Disable the thread trace mode. */312radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,313S_008D1C_MODE(0));314315/* Wait for thread trace completion. */316radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));317radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */318radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */319radeon_emit(cs, 0);320radeon_emit(cs, 0); /* reference value */321radeon_emit(cs, S_008D20_BUSY(1)); /* mask */322radeon_emit(cs, 4); /* poll interval */323} else {324/* Disable the thread trace mode. */325radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE,326S_030CD8_MODE(0));327328/* Wait for thread trace completion. */329radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));330radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */331radeon_emit(cs, R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */332radeon_emit(cs, 0);333radeon_emit(cs, 0); /* reference value */334radeon_emit(cs, S_030CE8_BUSY(1)); /* mask */335radeon_emit(cs, 4); /* poll interval */336}337radeon_end();338339si_copy_thread_trace_info_regs(sctx, cs, se);340}341342/* Restore global broadcasting. */343radeon_begin_again(cs);344radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,345S_030800_SE_BROADCAST_WRITES(1) |346S_030800_SH_BROADCAST_WRITES(1) |347S_030800_INSTANCE_BROADCAST_WRITES(1));348radeon_end();349}350351static void352si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)353{354struct radeon_winsys *ws = sctx->ws;355356radeon_begin(cs);357358switch (family) {359case RING_GFX:360radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));361radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));362radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));363break;364case RING_COMPUTE:365radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));366radeon_emit(cs, 0);367break;368}369radeon_end();370371ws->cs_add_buffer(cs,372sctx->thread_trace->bo,373RADEON_USAGE_READWRITE,374RADEON_DOMAIN_VRAM,3750);376377si_cp_dma_wait_for_idle(sctx, cs);378379/* Make sure to wait-for-idle before starting SQTT. */380sctx->flags |=381SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |382SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |383SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;384sctx->emit_cache_flush(sctx, cs);385386si_inhibit_clockgating(sctx, cs, true);387388/* Enable SQG events that collects thread trace data. */389si_emit_spi_config_cntl(sctx, cs, true);390391si_emit_thread_trace_start(sctx, cs, family);392}393394static void395si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)396{397struct radeon_winsys *ws = sctx->ws;398399radeon_begin(cs);400401switch (family) {402case RING_GFX:403radeon_emit(sctx->thread_trace->stop_cs[family], PKT3(PKT3_CONTEXT_CONTROL, 1, 0));404radeon_emit(sctx->thread_trace->stop_cs[family], CC0_UPDATE_LOAD_ENABLES(1));405radeon_emit(sctx->thread_trace->stop_cs[family], CC1_UPDATE_SHADOW_ENABLES(1));406break;407case RING_COMPUTE:408radeon_emit(sctx->thread_trace->stop_cs[family], PKT3(PKT3_NOP, 0, 0));409radeon_emit(sctx->thread_trace->stop_cs[family], 0);410break;411}412radeon_end();413414ws->cs_add_buffer(cs,415sctx->thread_trace->bo,416RADEON_USAGE_READWRITE,417RADEON_DOMAIN_VRAM,4180);419420si_cp_dma_wait_for_idle(sctx, cs);421422/* Make sure to wait-for-idle before stopping SQTT. */423sctx->flags |=424SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |425SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |426SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;427sctx->emit_cache_flush(sctx, cs);428429si_emit_thread_trace_stop(sctx, cs, family);430431/* Restore previous state by disabling SQG events. */432si_emit_spi_config_cntl(sctx, cs, false);433434si_inhibit_clockgating(sctx, cs, false);435}436437438static void439si_thread_trace_init_cs(struct si_context *sctx)440{441struct radeon_winsys *ws = sctx->ws;442443/* Thread trace start CS (only handles RING_GFX). */444sctx->thread_trace->start_cs[RING_GFX] = CALLOC_STRUCT(radeon_cmdbuf);445if (!ws->cs_create(sctx->thread_trace->start_cs[RING_GFX],446sctx->ctx, RING_GFX, NULL, NULL, 0)) {447free(sctx->thread_trace->start_cs[RING_GFX]);448sctx->thread_trace->start_cs[RING_GFX] = NULL;449return;450}451452si_thread_trace_start(sctx, RING_GFX, sctx->thread_trace->start_cs[RING_GFX]);453454/* Thread trace stop CS. */455sctx->thread_trace->stop_cs[RING_GFX] = CALLOC_STRUCT(radeon_cmdbuf);456if (!ws->cs_create(sctx->thread_trace->stop_cs[RING_GFX],457sctx->ctx, RING_GFX, NULL, NULL, 0)) {458free(sctx->thread_trace->start_cs[RING_GFX]);459sctx->thread_trace->start_cs[RING_GFX] = NULL;460free(sctx->thread_trace->stop_cs[RING_GFX]);461sctx->thread_trace->stop_cs[RING_GFX] = NULL;462return;463}464465si_thread_trace_stop(sctx, RING_GFX, sctx->thread_trace->stop_cs[RING_GFX]);466}467468static void469si_begin_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)470{471struct radeon_cmdbuf *cs = sctx->thread_trace->start_cs[RING_GFX];472sctx->ws->cs_flush(cs, 0, NULL);473}474475static void476si_end_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)477{478struct radeon_cmdbuf *cs = sctx->thread_trace->stop_cs[RING_GFX];479sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);480}481482static bool483si_get_thread_trace(struct si_context *sctx,484struct ac_thread_trace *thread_trace)485{486unsigned max_se = sctx->screen->info.max_se;487488memset(thread_trace, 0, sizeof(*thread_trace));489thread_trace->num_traces = max_se;490491sctx->thread_trace->ptr = sctx->ws->buffer_map(sctx->ws, sctx->thread_trace->bo,492NULL,493PIPE_MAP_READ);494495if (!sctx->thread_trace->ptr)496return false;497498void *thread_trace_ptr = sctx->thread_trace->ptr;499500for (unsigned se = 0; se < max_se; se++) {501uint64_t info_offset = ac_thread_trace_get_info_offset(se);502uint64_t data_offset = ac_thread_trace_get_data_offset(&sctx->screen->info, sctx->thread_trace, se);503void *info_ptr = thread_trace_ptr + info_offset;504void *data_ptr = thread_trace_ptr + data_offset;505struct ac_thread_trace_info *info =506(struct ac_thread_trace_info *)info_ptr;507508struct ac_thread_trace_se thread_trace_se = {0};509510if (!ac_is_thread_trace_complete(&sctx->screen->info, sctx->thread_trace, info)) {511uint32_t expected_size =512ac_get_expected_buffer_size(&sctx->screen->info, info);513uint32_t available_size = (info->cur_offset * 32) / 1024;514515fprintf(stderr, "Failed to get the thread trace "516"because the buffer is too small. The "517"hardware needs %d KB but the "518"buffer size is %d KB.\n",519expected_size, available_size);520fprintf(stderr, "Please update the buffer size with "521"AMD_THREAD_TRACE_BUFFER_SIZE=<size_in_kbytes>\n");522return false;523}524525thread_trace_se.data_ptr = data_ptr;526thread_trace_se.info = *info;527thread_trace_se.shader_engine = se;528529int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);530531/* For GFX10+ compute_unit really means WGP */532thread_trace_se.compute_unit =533sctx->screen->info.chip_class >= GFX10 ? (first_active_cu / 2) : first_active_cu;534535thread_trace->traces[se] = thread_trace_se;536}537538thread_trace->data = sctx->thread_trace;539return true;540}541542543bool544si_init_thread_trace(struct si_context *sctx)545{546static bool warn_once = true;547if (warn_once) {548fprintf(stderr, "*************************************************\n");549fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");550fprintf(stderr, "*************************************************\n");551warn_once = false;552}553554sctx->thread_trace = CALLOC_STRUCT(ac_thread_trace_data);555556if (sctx->chip_class < GFX8) {557fprintf(stderr, "GPU hardware not supported: refer to "558"the RGP documentation for the list of "559"supported GPUs!\n");560return false;561}562563if (sctx->chip_class > GFX10_3) {564fprintf(stderr, "radeonsi: Thread trace is not supported "565"for that GPU!\n");566return false;567}568569/* Default buffer size set to 1MB per SE. */570sctx->thread_trace->buffer_size = debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 1024) * 1024;571sctx->thread_trace->start_frame = 10;572573const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");574if (trigger) {575sctx->thread_trace->start_frame = atoi(trigger);576if (sctx->thread_trace->start_frame <= 0) {577/* This isn't a frame number, must be a file */578sctx->thread_trace->trigger_file = strdup(trigger);579sctx->thread_trace->start_frame = -1;580}581}582583if (!si_thread_trace_init_bo(sctx))584return false;585586list_inithead(&sctx->thread_trace->rgp_pso_correlation.record);587simple_mtx_init(&sctx->thread_trace->rgp_pso_correlation.lock, mtx_plain);588589list_inithead(&sctx->thread_trace->rgp_loader_events.record);590simple_mtx_init(&sctx->thread_trace->rgp_loader_events.lock, mtx_plain);591592list_inithead(&sctx->thread_trace->rgp_code_object.record);593simple_mtx_init(&sctx->thread_trace->rgp_code_object.lock, mtx_plain);594595si_thread_trace_init_cs(sctx);596597sctx->sqtt_next_event = EventInvalid;598599return true;600}601602void603si_destroy_thread_trace(struct si_context *sctx)604{605struct si_screen *sscreen = sctx->screen;606struct pb_buffer *bo = sctx->thread_trace->bo;607radeon_bo_reference(sctx->screen->ws, &bo, NULL);608609if (sctx->thread_trace->trigger_file)610free(sctx->thread_trace->trigger_file);611612sscreen->ws->cs_destroy(sctx->thread_trace->start_cs[RING_GFX]);613sscreen->ws->cs_destroy(sctx->thread_trace->stop_cs[RING_GFX]);614615struct rgp_pso_correlation *pso_correlation = &sctx->thread_trace->rgp_pso_correlation;616struct rgp_loader_events *loader_events = &sctx->thread_trace->rgp_loader_events;617struct rgp_code_object *code_object = &sctx->thread_trace->rgp_code_object;618list_for_each_entry_safe(struct rgp_pso_correlation_record, record,619&pso_correlation->record, list) {620list_del(&record->list);621free(record);622}623simple_mtx_destroy(&sctx->thread_trace->rgp_pso_correlation.lock);624625list_for_each_entry_safe(struct rgp_loader_events_record, record,626&loader_events->record, list) {627list_del(&record->list);628free(record);629}630simple_mtx_destroy(&sctx->thread_trace->rgp_loader_events.lock);631632list_for_each_entry_safe(struct rgp_code_object_record, record,633&code_object->record, list) {634uint32_t mask = record->shader_stages_mask;635int i;636637/* Free the disassembly. */638while (mask) {639i = u_bit_scan(&mask);640free(record->shader_data[i].code);641}642list_del(&record->list);643free(record);644}645simple_mtx_destroy(&sctx->thread_trace->rgp_code_object.lock);646647free(sctx->thread_trace);648sctx->thread_trace = NULL;649}650651static uint64_t num_frames = 0;652653void654si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)655{656/* Should we enable SQTT yet? */657if (!sctx->thread_trace_enabled) {658bool frame_trigger = num_frames == sctx->thread_trace->start_frame;659bool file_trigger = false;660if (sctx->thread_trace->trigger_file &&661access(sctx->thread_trace->trigger_file, W_OK) == 0) {662if (unlink(sctx->thread_trace->trigger_file) == 0) {663file_trigger = true;664} else {665/* Do not enable tracing if we cannot remove the file,666* because by then we'll trace every frame.667*/668fprintf(stderr, "radeonsi: could not remove thread trace trigger file, ignoring\n");669}670}671672if (frame_trigger || file_trigger) {673/* Wait for last submission */674sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence, PIPE_TIMEOUT_INFINITE);675676/* Start SQTT */677si_begin_thread_trace(sctx, rcs);678679sctx->thread_trace_enabled = true;680sctx->thread_trace->start_frame = -1;681682/* Force shader update to make sure si_sqtt_describe_pipeline_bind is called683* for the current "pipeline".684*/685sctx->do_update_shaders = true;686}687} else {688struct ac_thread_trace thread_trace = {0};689690/* Stop SQTT */691si_end_thread_trace(sctx, rcs);692sctx->thread_trace_enabled = false;693sctx->thread_trace->start_frame = -1;694assert (sctx->last_sqtt_fence);695696/* Wait for SQTT to finish and read back the bo */697if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence, PIPE_TIMEOUT_INFINITE) &&698si_get_thread_trace(sctx, &thread_trace)) {699ac_dump_rgp_capture(&sctx->screen->info, &thread_trace);700} else {701fprintf(stderr, "Failed to read the trace\n");702}703}704705num_frames++;706}707708709static void710si_emit_thread_trace_userdata(struct si_context* sctx,711struct radeon_cmdbuf *cs,712const void *data, uint32_t num_dwords)713{714const uint32_t *dwords = (uint32_t *)data;715716radeon_begin(cs);717718while (num_dwords > 0) {719uint32_t count = MIN2(num_dwords, 2);720721/* Without the perfctr bit the CP might not always pass the722* write on correctly. */723radeon_set_uconfig_reg_seq(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count, sctx->chip_class >= GFX10);724725radeon_emit_array(cs, dwords, count);726727dwords += count;728num_dwords -= count;729}730radeon_end();731}732733static void734si_emit_spi_config_cntl(struct si_context* sctx,735struct radeon_cmdbuf *cs, bool enable)736{737radeon_begin(cs);738739if (sctx->chip_class >= GFX9) {740uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |741S_031100_EXP_PRIORITY_ORDER(3) |742S_031100_ENABLE_SQG_TOP_EVENTS(enable) |743S_031100_ENABLE_SQG_BOP_EVENTS(enable);744745if (sctx->chip_class >= GFX10)746spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);747748radeon_set_uconfig_reg(cs, R_031100_SPI_CONFIG_CNTL, spi_config_cntl);749} else {750/* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */751radeon_set_privileged_config_reg(cs, R_009100_SPI_CONFIG_CNTL,752S_009100_ENABLE_SQG_TOP_EVENTS(enable) |753S_009100_ENABLE_SQG_BOP_EVENTS(enable));754}755radeon_end();756}757758static uint32_t num_events = 0;759void760si_sqtt_write_event_marker(struct si_context* sctx, struct radeon_cmdbuf *rcs,761enum rgp_sqtt_marker_event_type api_type,762uint32_t vertex_offset_user_data,763uint32_t instance_offset_user_data,764uint32_t draw_index_user_data)765{766struct rgp_sqtt_marker_event marker = {0};767768marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;769marker.api_type = api_type == EventInvalid ? EventCmdDraw : api_type;770marker.cmd_id = num_events++;771marker.cb_id = 0;772773if (vertex_offset_user_data == UINT_MAX ||774instance_offset_user_data == UINT_MAX) {775vertex_offset_user_data = 0;776instance_offset_user_data = 0;777}778779if (draw_index_user_data == UINT_MAX)780draw_index_user_data = vertex_offset_user_data;781782marker.vertex_offset_reg_idx = vertex_offset_user_data;783marker.instance_offset_reg_idx = instance_offset_user_data;784marker.draw_index_reg_idx = draw_index_user_data;785786si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);787788sctx->sqtt_next_event = EventInvalid;789}790791void792si_write_event_with_dims_marker(struct si_context* sctx, struct radeon_cmdbuf *rcs,793enum rgp_sqtt_marker_event_type api_type,794uint32_t x, uint32_t y, uint32_t z)795{796struct rgp_sqtt_marker_event_with_dims marker = {0};797798marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;799marker.event.api_type = api_type;800marker.event.cmd_id = num_events++;801marker.event.cb_id = 0;802marker.event.has_thread_dims = 1;803804marker.thread_x = x;805marker.thread_y = y;806marker.thread_z = z;807808si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);809sctx->sqtt_next_event = EventInvalid;810}811812void813si_sqtt_describe_barrier_start(struct si_context* sctx, struct radeon_cmdbuf *rcs)814{815struct rgp_sqtt_marker_barrier_start marker = {0};816817marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;818marker.cb_id = 0;819marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */820821si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);822}823824void825si_sqtt_describe_barrier_end(struct si_context* sctx, struct radeon_cmdbuf *rcs,826unsigned flags)827{828struct rgp_sqtt_marker_barrier_end marker = {0};829830marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;831marker.cb_id = 0;832833if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH)834marker.vs_partial_flush = true;835if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH)836marker.ps_partial_flush = true;837if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH)838marker.cs_partial_flush = true;839840if (flags & SI_CONTEXT_PFP_SYNC_ME)841marker.pfp_sync_me = true;842843if (flags & SI_CONTEXT_INV_VCACHE)844marker.inval_tcp = true;845if (flags & SI_CONTEXT_INV_ICACHE)846marker.inval_sqI = true;847if (flags & SI_CONTEXT_INV_SCACHE)848marker.inval_sqK = true;849if (flags & SI_CONTEXT_INV_L2)850marker.inval_tcc = true;851852if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {853marker.inval_cb = true;854marker.flush_cb = true;855}856if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {857marker.inval_db = true;858marker.flush_db = true;859}860861si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);862}863864void865si_write_user_event(struct si_context* sctx, struct radeon_cmdbuf *rcs,866enum rgp_sqtt_marker_user_event_type type,867const char *str, int len)868{869if (type == UserEventPop) {870assert (str == NULL);871struct rgp_sqtt_marker_user_event marker = { 0 };872marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;873marker.data_type = type;874875si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);876} else {877assert (str != NULL);878struct rgp_sqtt_marker_user_event_with_length marker = { 0 };879marker.user_event.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;880marker.user_event.data_type = type;881len = MIN2(1024, len);882marker.length = align(len, 4);883884uint8_t *buffer = alloca(sizeof(marker) + marker.length);885memcpy(buffer, &marker, sizeof(marker));886memcpy(buffer + sizeof(marker), str, len);887buffer[sizeof(marker) + len - 1] = '\0';888889si_emit_thread_trace_userdata(sctx, rcs, buffer, sizeof(marker) / 4 + marker.length / 4);890}891}892893894bool895si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data,896uint64_t pipeline_hash)897{898simple_mtx_lock(&thread_trace_data->rgp_pso_correlation.lock);899list_for_each_entry_safe(struct rgp_pso_correlation_record, record,900&thread_trace_data->rgp_pso_correlation.record, list) {901if (record->pipeline_hash[0] == pipeline_hash) {902simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);903return true;904}905906}907simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);908909return false;910}911912913914static enum rgp_hardware_stages915si_sqtt_pipe_to_rgp_shader_stage(struct si_shader_key* key, enum pipe_shader_type stage)916{917switch (stage) {918case PIPE_SHADER_VERTEX:919if (key->as_ls)920return RGP_HW_STAGE_LS;921else if (key->as_es)922return RGP_HW_STAGE_ES;923else if (key->as_ngg)924return RGP_HW_STAGE_GS;925else926return RGP_HW_STAGE_VS;927case PIPE_SHADER_TESS_CTRL:928return RGP_HW_STAGE_HS;929case PIPE_SHADER_TESS_EVAL:930if (key->as_es)931return RGP_HW_STAGE_ES;932else if (key->as_ngg)933return RGP_HW_STAGE_GS;934else935return RGP_HW_STAGE_VS;936case PIPE_SHADER_GEOMETRY:937return RGP_HW_STAGE_GS;938case PIPE_SHADER_FRAGMENT:939return RGP_HW_STAGE_PS;940case PIPE_SHADER_COMPUTE:941return RGP_HW_STAGE_CS;942default:943unreachable("invalid mesa shader stage");944}945}946947948static bool949si_sqtt_add_code_object(struct si_context* sctx,950uint64_t pipeline_hash,951bool is_compute)952{953struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;954struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object;955struct rgp_code_object_record *record;956957record = malloc(sizeof(struct rgp_code_object_record));958if (!record)959return false;960961record->shader_stages_mask = 0;962record->num_shaders_combined = 0;963record->pipeline_hash[0] = pipeline_hash;964record->pipeline_hash[1] = pipeline_hash;965966for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) {967struct si_shader *shader;968enum rgp_hardware_stages hw_stage;969970if (is_compute) {971if (i != PIPE_SHADER_COMPUTE)972continue;973shader = &sctx->cs_shader_state.program->shader;974hw_stage = RGP_HW_STAGE_CS;975} else if (i != PIPE_SHADER_COMPUTE) {976if (!sctx->shaders[i].cso || !sctx->shaders[i].current)977continue;978shader = sctx->shaders[i].current;979hw_stage = si_sqtt_pipe_to_rgp_shader_stage(&shader->key, i);980} else {981continue;982}983984uint8_t *code = malloc(shader->binary.uploaded_code_size);985if (!code) {986free(record);987return false;988}989memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);990991uint64_t va = shader->bo->gpu_address;992record->shader_data[i].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);993record->shader_data[i].hash[1] = record->shader_data[i].hash[0];994record->shader_data[i].code_size = shader->binary.uploaded_code_size;995record->shader_data[i].code = code;996record->shader_data[i].vgpr_count = shader->config.num_vgprs;997record->shader_data[i].sgpr_count = shader->config.num_sgprs;998record->shader_data[i].base_address = va & 0xffffffffffff;999record->shader_data[i].elf_symbol_offset = 0;1000record->shader_data[i].hw_stage = hw_stage;1001record->shader_data[i].is_combined = false;10021003record->shader_stages_mask |= (1 << i);1004record->num_shaders_combined++;1005}10061007simple_mtx_lock(&code_object->lock);1008list_addtail(&record->list, &code_object->record);1009code_object->record_count++;1010simple_mtx_unlock(&code_object->lock);10111012return true;1013}10141015bool1016si_sqtt_register_pipeline(struct si_context* sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute)1017{1018struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;10191020assert (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_hash));10211022bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline_hash);1023if (!result)1024return false;10251026result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline_hash, base_address);1027if (!result)1028return false;10291030return si_sqtt_add_code_object(sctx, pipeline_hash, is_compute);1031}10321033void1034si_sqtt_describe_pipeline_bind(struct si_context* sctx,1035uint64_t pipeline_hash,1036int bind_point)1037{1038struct rgp_sqtt_marker_pipeline_bind marker = {0};1039struct radeon_cmdbuf *cs = &sctx->gfx_cs;10401041if (likely(!sctx->thread_trace_enabled)) {1042return;1043}10441045marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;1046marker.cb_id = 0;1047marker.bind_point = bind_point;1048marker.api_pso_hash[0] = pipeline_hash;1049marker.api_pso_hash[1] = pipeline_hash >> 32;10501051si_emit_thread_trace_userdata(sctx, cs, &marker, sizeof(marker) / 4);1052}105310541055