/* Path: src/freedreno/computerator/a6xx.c (branch 21.2-virgl) */
/*1* Copyright © 2020 Google, Inc.2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,19* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE20* SOFTWARE.21*/2223#include "ir3/ir3_compiler.h"2425#include "util/u_math.h"2627#include "adreno_pm4.xml.h"28#include "adreno_common.xml.h"29#include "a6xx.xml.h"3031#include "ir3_asm.h"32#include "main.h"3334struct a6xx_backend {35struct backend base;3637struct ir3_compiler *compiler;38struct fd_device *dev;3940unsigned seqno;41struct fd_bo *control_mem;4243struct fd_bo *query_mem;44const struct perfcntr *perfcntrs;45unsigned num_perfcntrs;46};47define_cast(backend, a6xx_backend);4849/*50* Data structures shared with GPU:51*/5253/* This struct defines the layout of the fd6_context::control buffer: */54struct fd6_control {55uint32_t seqno; /* seqno for async CP_EVENT_WRITE, etc */56uint32_t _pad0;57volatile uint32_t vsc_overflow;58uint32_t _pad1;59/* flag set from cmdstream when VSC overflow detected: */60uint32_t vsc_scratch;61uint32_t _pad2;62uint32_t 
_pad3;63uint32_t _pad4;6465/* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */66struct {67uint32_t offset;68uint32_t pad[7];69} flush_base[4];70};7172#define control_ptr(a6xx_backend, member) \73(a6xx_backend)->control_mem, offsetof(struct fd6_control, member), 0, 07475struct PACKED fd6_query_sample {76uint64_t start;77uint64_t result;78uint64_t stop;79};8081/* offset of a single field of an array of fd6_query_sample: */82#define query_sample_idx(a6xx_backend, idx, field) \83(a6xx_backend)->query_mem, \84(idx * sizeof(struct fd6_query_sample)) + \85offsetof(struct fd6_query_sample, field), \860, 08788/*89* Backend implementation:90*/9192static struct kernel *93a6xx_assemble(struct backend *b, FILE *in)94{95struct a6xx_backend *a6xx_backend = to_a6xx_backend(b);96struct ir3_kernel *ir3_kernel = ir3_asm_assemble(a6xx_backend->compiler, in);97ir3_kernel->backend = b;98return &ir3_kernel->base;99}100101static void102a6xx_disassemble(struct kernel *kernel, FILE *out)103{104ir3_asm_disassemble(to_ir3_kernel(kernel), out);105}106107static void108cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)109{110struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);111struct ir3_shader_variant *v = ir3_kernel->v;112const struct ir3_info *i = &v->info;113enum a6xx_threadsize thrsz = i->double_threadsize ? 
THREAD128 : THREAD64;114115OUT_PKT4(ring, REG_A6XX_SP_MODE_CONTROL, 1);116OUT_RING(ring, A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);117118OUT_PKT4(ring, REG_A6XX_SP_PERFCTR_ENABLE, 1);119OUT_RING(ring, A6XX_SP_PERFCTR_ENABLE_CS);120121OUT_PKT4(ring, REG_A6XX_SP_FLOAT_CNTL, 1);122OUT_RING(ring, 0);123124OUT_PKT4(ring, REG_A6XX_HLSQ_INVALIDATE_CMD, 1);125OUT_RING(126ring,127A6XX_HLSQ_INVALIDATE_CMD_VS_STATE | A6XX_HLSQ_INVALIDATE_CMD_HS_STATE |128A6XX_HLSQ_INVALIDATE_CMD_DS_STATE | A6XX_HLSQ_INVALIDATE_CMD_GS_STATE |129A6XX_HLSQ_INVALIDATE_CMD_FS_STATE | A6XX_HLSQ_INVALIDATE_CMD_CS_STATE |130A6XX_HLSQ_INVALIDATE_CMD_CS_IBO | A6XX_HLSQ_INVALIDATE_CMD_GFX_IBO);131132unsigned constlen = align(v->constlen, 4);133OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL, 1);134OUT_RING(ring,135A6XX_HLSQ_CS_CNTL_CONSTLEN(constlen) | A6XX_HLSQ_CS_CNTL_ENABLED);136137OUT_PKT4(ring, REG_A6XX_SP_CS_CONFIG, 2);138OUT_RING(ring, A6XX_SP_CS_CONFIG_ENABLED |139A6XX_SP_CS_CONFIG_NIBO(kernel->num_bufs) |140A6XX_SP_CS_CONFIG_NTEX(v->num_samp) |141A6XX_SP_CS_CONFIG_NSAMP(v->num_samp)); /* SP_VS_CONFIG */142OUT_RING(ring, v->instrlen); /* SP_VS_INSTRLEN */143144OUT_PKT4(ring, REG_A6XX_SP_CS_CTRL_REG0, 1);145OUT_RING(ring,146A6XX_SP_CS_CTRL_REG0_THREADSIZE(thrsz) |147A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(i->max_reg + 1) |148A6XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT(i->max_half_reg + 1) |149COND(v->mergedregs, A6XX_SP_CS_CTRL_REG0_MERGEDREGS) |150A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(v)));151152OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);153OUT_RING(ring, 0x41);154155uint32_t local_invocation_id, work_group_id;156local_invocation_id =157ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);158work_group_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);159160OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL_0, 2);161OUT_RING(ring, A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |162A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |163A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) 
|164A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));165OUT_RING(ring, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |166A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));167168OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2);169OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_OBJ_START_LO/HI */170171OUT_PKT4(ring, REG_A6XX_SP_CS_INSTRLEN, 1);172OUT_RING(ring, v->instrlen);173174OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2);175OUT_RELOC(ring, v->bo, 0, 0, 0);176177OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);178OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |179CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |180CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |181CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |182CP_LOAD_STATE6_0_NUM_UNIT(v->instrlen));183OUT_RELOC(ring, v->bo, 0, 0, 0);184}185186static void187emit_const(struct fd_ringbuffer *ring, uint32_t regid, uint32_t sizedwords,188const uint32_t *dwords)189{190uint32_t align_sz;191192debug_assert((regid % 4) == 0);193194align_sz = align(sizedwords, 4);195196OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3 + align_sz);197OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid / 4) |198CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |199CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |200CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |201CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(sizedwords, 4)));202OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));203OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));204205for (uint32_t i = 0; i < sizedwords; i++) {206OUT_RING(ring, dwords[i]);207}208209/* Zero-pad to multiple of 4 dwords */210for (uint32_t i = sizedwords; i < align_sz; i++) {211OUT_RING(ring, 0);212}213}214215static void216cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel,217uint32_t grid[3])218{219struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);220struct ir3_shader_variant *v = ir3_kernel->v;221222const struct ir3_const_state *const_state = ir3_const_state(v);223uint32_t base = const_state->offsets.immediate;224int size = DIV_ROUND_UP(const_state->immediates_count, 
4);225226if (ir3_kernel->info.numwg != INVALID_REG) {227assert((ir3_kernel->info.numwg & 0x3) == 0);228int idx = ir3_kernel->info.numwg >> 2;229const_state->immediates[idx * 4 + 0] = grid[0];230const_state->immediates[idx * 4 + 1] = grid[1];231const_state->immediates[idx * 4 + 2] = grid[2];232}233234for (int i = 0; i < MAX_BUFS; i++) {235if (kernel->buf_addr_regs[i] != INVALID_REG) {236assert((kernel->buf_addr_regs[i] & 0x3) == 0);237int idx = kernel->buf_addr_regs[i] >> 2;238239uint64_t iova = fd_bo_get_iova(kernel->bufs[i]);240241const_state->immediates[idx * 4 + 1] = iova >> 32;242const_state->immediates[idx * 4 + 0] = (iova << 32) >> 32;243}244}245246/* truncate size to avoid writing constants that shader247* does not use:248*/249size = MIN2(size + base, v->constlen) - base;250251/* convert out of vec4: */252base *= 4;253size *= 4;254255if (size > 0) {256emit_const(ring, base, size, const_state->immediates);257}258}259260static void261cs_ibo_emit(struct fd_ringbuffer *ring, struct fd_submit *submit,262struct kernel *kernel)263{264struct fd_ringbuffer *state = fd_submit_new_ringbuffer(265submit, kernel->num_bufs * 16 * 4, FD_RINGBUFFER_STREAMING);266267for (unsigned i = 0; i < kernel->num_bufs; i++) {268/* size is encoded with low 15b in WIDTH and high bits in HEIGHT,269* in units of elements:270*/271unsigned sz = kernel->buf_sizes[i];272unsigned width = sz & MASK(15);273unsigned height = sz >> 15;274275OUT_RING(state, A6XX_IBO_0_FMT(FMT6_32_UINT) | A6XX_IBO_0_TILE_MODE(0));276OUT_RING(state, A6XX_IBO_1_WIDTH(width) | A6XX_IBO_1_HEIGHT(height));277OUT_RING(state, A6XX_IBO_2_PITCH(0) | A6XX_IBO_2_UNK4 | A6XX_IBO_2_UNK31 |278A6XX_IBO_2_TYPE(A6XX_TEX_1D));279OUT_RING(state, A6XX_IBO_3_ARRAY_PITCH(0));280OUT_RELOC(state, kernel->bufs[i], 0, 0, 0);281OUT_RING(state, 0x00000000);282OUT_RING(state, 0x00000000);283OUT_RING(state, 0x00000000);284OUT_RING(state, 0x00000000);285OUT_RING(state, 0x00000000);286OUT_RING(state, 0x00000000);287OUT_RING(state, 
0x00000000);288OUT_RING(state, 0x00000000);289OUT_RING(state, 0x00000000);290OUT_RING(state, 0x00000000);291}292293OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);294OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |295CP_LOAD_STATE6_0_STATE_TYPE(ST6_IBO) |296CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |297CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |298CP_LOAD_STATE6_0_NUM_UNIT(kernel->num_bufs));299OUT_RB(ring, state);300301OUT_PKT4(ring, REG_A6XX_SP_CS_IBO, 2);302OUT_RB(ring, state);303304OUT_PKT4(ring, REG_A6XX_SP_CS_IBO_COUNT, 1);305OUT_RING(ring, kernel->num_bufs);306307fd_ringbuffer_del(state);308}309310static inline unsigned311event_write(struct fd_ringbuffer *ring, struct kernel *kernel,312enum vgt_event_type evt, bool timestamp)313{314unsigned seqno = 0;315316OUT_PKT7(ring, CP_EVENT_WRITE, timestamp ? 4 : 1);317OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(evt));318if (timestamp) {319struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);320struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);321seqno = ++a6xx_backend->seqno;322OUT_RELOC(ring, control_ptr(a6xx_backend, seqno)); /* ADDR_LO/HI */323OUT_RING(ring, seqno);324}325326return seqno;327}328329static inline void330cache_flush(struct fd_ringbuffer *ring, struct kernel *kernel)331{332struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);333struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);334unsigned seqno;335336seqno = event_write(ring, kernel, RB_DONE_TS, true);337338OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);339OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |340CP_WAIT_REG_MEM_0_POLL_MEMORY);341OUT_RELOC(ring, control_ptr(a6xx_backend, seqno));342OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno));343OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0));344OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));345346seqno = event_write(ring, kernel, CACHE_FLUSH_TS, true);347348OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4);349OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0));350OUT_RELOC(ring, 
control_ptr(a6xx_backend, seqno));351OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno));352}353354static void355a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3],356struct fd_submit *submit)357{358struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);359struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);360struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(361submit, 0, FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);362363cs_program_emit(ring, kernel);364cs_const_emit(ring, kernel, grid);365cs_ibo_emit(ring, submit, kernel);366367OUT_PKT7(ring, CP_SET_MARKER, 1);368OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));369370const unsigned *local_size = kernel->local_size;371const unsigned *num_groups = grid;372373unsigned work_dim = 0;374for (int i = 0; i < 3; i++) {375if (!grid[i])376break;377work_dim++;378}379380OUT_PKT4(ring, REG_A6XX_HLSQ_CS_NDRANGE_0, 7);381OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_0_KERNELDIM(work_dim) |382A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) |383A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) |384A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1));385OUT_RING(ring,386A6XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X(local_size[0] * num_groups[0]));387OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_2_GLOBALOFF_X */388OUT_RING(ring,389A6XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y(local_size[1] * num_groups[1]));390OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_4_GLOBALOFF_Y */391OUT_RING(ring,392A6XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z(local_size[2] * num_groups[2]));393OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_6_GLOBALOFF_Z */394395OUT_PKT4(ring, REG_A6XX_HLSQ_CS_KERNEL_GROUP_X, 3);396OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_X */397OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */398OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */399400if (a6xx_backend->num_perfcntrs > 0) {401a6xx_backend->query_mem = fd_bo_new(402a6xx_backend->dev,403a6xx_backend->num_perfcntrs * sizeof(struct fd6_query_sample), 0, "query");404405/* configure the performance counters 
to count the requested406* countables:407*/408for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {409const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];410411OUT_PKT4(ring, counter->select_reg, 1);412OUT_RING(ring, counter->selector);413}414415OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);416417/* and snapshot the start values: */418for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {419const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];420421OUT_PKT7(ring, CP_REG_TO_MEM, 3);422OUT_RING(ring, CP_REG_TO_MEM_0_64B |423CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));424OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, start));425}426}427428OUT_PKT7(ring, CP_EXEC_CS, 4);429OUT_RING(ring, 0x00000000);430OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(grid[0]));431OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(grid[1]));432OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(grid[2]));433434OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);435436if (a6xx_backend->num_perfcntrs > 0) {437/* snapshot the end values: */438for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {439const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];440441OUT_PKT7(ring, CP_REG_TO_MEM, 3);442OUT_RING(ring, CP_REG_TO_MEM_0_64B |443CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));444OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, stop));445}446447/* and compute the result: */448for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {449/* result += stop - start: */450OUT_PKT7(ring, CP_MEM_TO_MEM, 9);451OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);452OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, result)); /* dst */453OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, result)); /* srcA */454OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, stop)); /* srcB */455OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, start)); /* srcC */456}457}458459cache_flush(ring, kernel);460}461462static void463a6xx_set_perfcntrs(struct backend *b, const struct perfcntr *perfcntrs,464unsigned 
num_perfcntrs)465{466struct a6xx_backend *a6xx_backend = to_a6xx_backend(b);467468a6xx_backend->perfcntrs = perfcntrs;469a6xx_backend->num_perfcntrs = num_perfcntrs;470}471472static void473a6xx_read_perfcntrs(struct backend *b, uint64_t *results)474{475struct a6xx_backend *a6xx_backend = to_a6xx_backend(b);476477fd_bo_cpu_prep(a6xx_backend->query_mem, NULL, FD_BO_PREP_READ);478struct fd6_query_sample *samples = fd_bo_map(a6xx_backend->query_mem);479480for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {481results[i] = samples[i].result;482}483}484485struct backend *486a6xx_init(struct fd_device *dev, uint32_t gpu_id)487{488struct a6xx_backend *a6xx_backend = calloc(1, sizeof(*a6xx_backend));489490a6xx_backend->base = (struct backend){491.assemble = a6xx_assemble,492.disassemble = a6xx_disassemble,493.emit_grid = a6xx_emit_grid,494.set_perfcntrs = a6xx_set_perfcntrs,495.read_perfcntrs = a6xx_read_perfcntrs,496};497498a6xx_backend->compiler = ir3_compiler_create(dev, gpu_id, false);499a6xx_backend->dev = dev;500501a6xx_backend->control_mem =502fd_bo_new(dev, 0x1000, 0, "control");503504return &a6xx_backend->base;505}506507508