/* Source path: blob/21.2-virgl/src/gallium/drivers/radeonsi/si_state_draw.cpp (4570 views) */
/*1* Copyright 2012 Advanced Micro Devices, Inc.2* All Rights Reserved.3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* on the rights to use, copy, modify, merge, publish, distribute, sub8* license, and/or sell copies of the Software, and to permit persons to whom9* the Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL18* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,19* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR20* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE21* USE OR OTHER DEALINGS IN THE SOFTWARE.22*/2324#include "ac_sqtt.h"25#include "si_build_pm4.h"26#include "util/u_index_modify.h"27#include "util/u_prim.h"28#include "util/u_upload_mgr.h"2930#if (GFX_VER == 6)31#define GFX(name) name##GFX632#elif (GFX_VER == 7)33#define GFX(name) name##GFX734#elif (GFX_VER == 8)35#define GFX(name) name##GFX836#elif (GFX_VER == 9)37#define GFX(name) name##GFX938#elif (GFX_VER == 10)39#define GFX(name) name##GFX1040#elif (GFX_VER == 103)41#define GFX(name) name##GFX10_342#else43#error "Unknown gfx version"44#endif4546/* special primitive types */47#define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX4849ALWAYS_INLINE50static unsigned si_conv_pipe_prim(unsigned mode)51{52static const unsigned prim_conv[] = {53[PIPE_PRIM_POINTS] = V_008958_DI_PT_POINTLIST,54[PIPE_PRIM_LINES] = V_008958_DI_PT_LINELIST,55[PIPE_PRIM_LINE_LOOP] 
= V_008958_DI_PT_LINELOOP,56[PIPE_PRIM_LINE_STRIP] = V_008958_DI_PT_LINESTRIP,57[PIPE_PRIM_TRIANGLES] = V_008958_DI_PT_TRILIST,58[PIPE_PRIM_TRIANGLE_STRIP] = V_008958_DI_PT_TRISTRIP,59[PIPE_PRIM_TRIANGLE_FAN] = V_008958_DI_PT_TRIFAN,60[PIPE_PRIM_QUADS] = V_008958_DI_PT_QUADLIST,61[PIPE_PRIM_QUAD_STRIP] = V_008958_DI_PT_QUADSTRIP,62[PIPE_PRIM_POLYGON] = V_008958_DI_PT_POLYGON,63[PIPE_PRIM_LINES_ADJACENCY] = V_008958_DI_PT_LINELIST_ADJ,64[PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_008958_DI_PT_LINESTRIP_ADJ,65[PIPE_PRIM_TRIANGLES_ADJACENCY] = V_008958_DI_PT_TRILIST_ADJ,66[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_008958_DI_PT_TRISTRIP_ADJ,67[PIPE_PRIM_PATCHES] = V_008958_DI_PT_PATCH,68[SI_PRIM_RECTANGLE_LIST] = V_008958_DI_PT_RECTLIST};69assert(mode < ARRAY_SIZE(prim_conv));70return prim_conv[mode];71}7273static void si_prefetch_shader_async(struct si_context *sctx, struct si_pm4_state *state)74{75struct pipe_resource *bo = &state->shader->bo->b.b;7677si_cp_dma_prefetch(sctx, bo, 0, bo->width0);78}7980enum si_L2_prefetch_mode {81PREFETCH_BEFORE_DRAW = 1,82PREFETCH_AFTER_DRAW,83PREFETCH_ALL,84};8586/**87* Prefetch shaders.88*/89template<chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,90si_L2_prefetch_mode mode>91static void si_prefetch_shaders(struct si_context *sctx)92{93unsigned mask = sctx->prefetch_L2_mask;9495/* GFX6 doesn't support the L2 prefetch. */96if (GFX_VERSION < GFX7 || !mask)97return;9899/* Prefetch shaders and VBO descriptors to TC L2. */100if (GFX_VERSION >= GFX9) {101/* Choose the right spot for the VBO prefetch. 
*/102if (HAS_TESS) {103if (mode != PREFETCH_AFTER_DRAW) {104if (mask & SI_PREFETCH_HS)105si_prefetch_shader_async(sctx, sctx->queued.named.hs);106107if (mode == PREFETCH_BEFORE_DRAW)108return;109}110111if ((HAS_GS || NGG) && mask & SI_PREFETCH_GS)112si_prefetch_shader_async(sctx, sctx->queued.named.gs);113if (!NGG && mask & SI_PREFETCH_VS)114si_prefetch_shader_async(sctx, sctx->queued.named.vs);115} else if (HAS_GS || NGG) {116if (mode != PREFETCH_AFTER_DRAW) {117if (mask & SI_PREFETCH_GS)118si_prefetch_shader_async(sctx, sctx->queued.named.gs);119120if (mode == PREFETCH_BEFORE_DRAW)121return;122}123124if (!NGG && mask & SI_PREFETCH_VS)125si_prefetch_shader_async(sctx, sctx->queued.named.vs);126} else {127if (mode != PREFETCH_AFTER_DRAW) {128if (mask & SI_PREFETCH_VS)129si_prefetch_shader_async(sctx, sctx->queued.named.vs);130131if (mode == PREFETCH_BEFORE_DRAW)132return;133}134}135} else {136/* GFX6-GFX8 */137/* Choose the right spot for the VBO prefetch. */138if (HAS_TESS) {139if (mode != PREFETCH_AFTER_DRAW) {140if (mask & SI_PREFETCH_LS)141si_prefetch_shader_async(sctx, sctx->queued.named.ls);142143if (mode == PREFETCH_BEFORE_DRAW)144return;145}146147if (mask & SI_PREFETCH_HS)148si_prefetch_shader_async(sctx, sctx->queued.named.hs);149if (mask & SI_PREFETCH_ES)150si_prefetch_shader_async(sctx, sctx->queued.named.es);151if (mask & SI_PREFETCH_GS)152si_prefetch_shader_async(sctx, sctx->queued.named.gs);153if (mask & SI_PREFETCH_VS)154si_prefetch_shader_async(sctx, sctx->queued.named.vs);155} else if (HAS_GS) {156if (mode != PREFETCH_AFTER_DRAW) {157if (mask & SI_PREFETCH_ES)158si_prefetch_shader_async(sctx, sctx->queued.named.es);159160if (mode == PREFETCH_BEFORE_DRAW)161return;162}163164if (mask & SI_PREFETCH_GS)165si_prefetch_shader_async(sctx, sctx->queued.named.gs);166if (mask & SI_PREFETCH_VS)167si_prefetch_shader_async(sctx, sctx->queued.named.vs);168} else {169if (mode != PREFETCH_AFTER_DRAW) {170if (mask & SI_PREFETCH_VS)171si_prefetch_shader_async(sctx, 
sctx->queued.named.vs);172173if (mode == PREFETCH_BEFORE_DRAW)174return;175}176}177}178179if (mask & SI_PREFETCH_PS)180si_prefetch_shader_async(sctx, sctx->queued.named.ps);181182/* This must be cleared only when AFTER_DRAW is true. */183sctx->prefetch_L2_mask = 0;184}185186/**187* This calculates the LDS size for tessellation shaders (VS, TCS, TES).188* LS.LDS_SIZE is shared by all 3 shader stages.189*190* The information about LDS and other non-compile-time parameters is then191* written to userdata SGPRs.192*/193static void si_emit_derived_tess_state(struct si_context *sctx,194unsigned num_tcs_input_cp,195unsigned *num_patches)196{197struct si_shader *ls_current;198struct si_shader_selector *ls;199/* The TES pointer will only be used for sctx->last_tcs.200* It would be wrong to think that TCS = TES. */201struct si_shader_selector *tcs =202sctx->shader.tcs.cso ? sctx->shader.tcs.cso : sctx->shader.tes.cso;203unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;204bool has_primid_instancing_bug = sctx->chip_class == GFX6 && sctx->screen->info.max_se == 1;205unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];206207/* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. 
*/208if (sctx->chip_class >= GFX9) {209if (sctx->shader.tcs.cso)210ls_current = sctx->shader.tcs.current;211else212ls_current = sctx->fixed_func_tcs_shader.current;213214ls = ls_current->key.part.tcs.ls;215} else {216ls_current = sctx->shader.vs.current;217ls = sctx->shader.vs.cso;218}219220if (sctx->last_ls == ls_current && sctx->last_tcs == tcs &&221sctx->last_tes_sh_base == tes_sh_base && sctx->last_num_tcs_input_cp == num_tcs_input_cp &&222(!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid))) {223*num_patches = sctx->last_num_patches;224return;225}226227sctx->last_ls = ls_current;228sctx->last_tcs = tcs;229sctx->last_tes_sh_base = tes_sh_base;230sctx->last_num_tcs_input_cp = num_tcs_input_cp;231sctx->last_tess_uses_primid = tess_uses_primid;232233/* This calculates how shader inputs and outputs among VS, TCS, and TES234* are laid out in LDS. */235unsigned num_tcs_inputs = util_last_bit64(ls->outputs_written);236unsigned num_tcs_output_cp, num_tcs_outputs, num_tcs_patch_outputs;237238if (sctx->shader.tcs.cso) {239num_tcs_outputs = util_last_bit64(tcs->outputs_written);240num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out;241num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);242} else {243/* No TCS. Route varyings from LS to TES. */244num_tcs_outputs = num_tcs_inputs;245num_tcs_output_cp = num_tcs_input_cp;246num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */247}248249unsigned input_vertex_size = ls->lshs_vertex_stride;250unsigned output_vertex_size = num_tcs_outputs * 16;251unsigned input_patch_size;252253/* Allocate LDS for TCS inputs only if it's used. 
*/254if (!ls_current->key.opt.same_patch_vertices ||255tcs->info.base.inputs_read & ~tcs->tcs_vgpr_only_inputs)256input_patch_size = num_tcs_input_cp * input_vertex_size;257else258input_patch_size = 0;259260unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;261unsigned output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;262unsigned lds_per_patch;263264/* Compute the LDS size per patch.265*266* LDS is used to store TCS outputs if they are read, and to store tess267* factors if they are not defined in all invocations.268*/269if (tcs->info.base.outputs_read ||270tcs->info.base.patch_outputs_read ||271!tcs->info.tessfactors_are_def_in_all_invocs) {272lds_per_patch = input_patch_size + output_patch_size;273} else {274/* LDS will only store TCS inputs. The offchip buffer will only store TCS outputs. */275lds_per_patch = MAX2(input_patch_size, output_patch_size);276}277278/* Ensure that we only need 4 waves per CU, so that we don't need to check279* resource usage (such as whether we have enough VGPRs to fit the whole280* threadgroup into the CU). It also ensures that the number of tcs in and out281* vertices per threadgroup are at most 256, which is the hw limit.282*/283unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);284*num_patches = 256 / max_verts_per_patch;285286/* Not necessary for correctness, but higher numbers are slower.287* The hardware can do more, but the radeonsi shader constant is288* limited to 6 bits.289*/290*num_patches = MIN2(*num_patches, 64); /* e.g. 
64 triangles in exactly 3 waves */291292/* When distributed tessellation is unsupported, switch between SEs293* at a higher frequency to manually balance the workload between SEs.294*/295if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1)296*num_patches = MIN2(*num_patches, 16); /* recommended */297298/* Make sure the output data fits in the offchip buffer */299*num_patches =300MIN2(*num_patches, (sctx->screen->tess_offchip_block_dw_size * 4) / output_patch_size);301302/* Make sure that the data fits in LDS. This assumes the shaders only303* use LDS for the inputs and outputs.304*305* The maximum allowed LDS size is 32K. Higher numbers can hang.306* Use 16K as the maximum, so that we can fit 2 workgroups on the same CU.307*/308ASSERTED unsigned max_lds_size = 32 * 1024; /* hw limit */309unsigned target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */310*num_patches = MIN2(*num_patches, target_lds_size / lds_per_patch);311*num_patches = MAX2(*num_patches, 1);312assert(*num_patches * lds_per_patch <= max_lds_size);313314/* Make sure that vector lanes are fully occupied by cutting off the last wave315* if it's only partially filled.316*/317unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch;318unsigned wave_size = sctx->screen->ge_wave_size;319320if (temp_verts_per_tg > wave_size &&321(wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8)))322*num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;323324if (sctx->chip_class == GFX6) {325/* GFX6 bug workaround, related to power management. Limit LS-HS326* threadgroups to only one wave.327*/328unsigned one_wave = wave_size / max_verts_per_patch;329*num_patches = MIN2(*num_patches, one_wave);330}331332/* The VGT HS block increments the patch ID unconditionally333* within a single threadgroup. 
This results in incorrect334* patch IDs when instanced draws are used.335*336* The intended solution is to restrict threadgroups to337* a single instance by setting SWITCH_ON_EOI, which338* should cause IA to split instances up. However, this339* doesn't work correctly on GFX6 when there is no other340* SE to switch to.341*/342if (has_primid_instancing_bug && tess_uses_primid)343*num_patches = 1;344345sctx->last_num_patches = *num_patches;346347unsigned output_patch0_offset = input_patch_size * *num_patches;348unsigned perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;349350/* Compute userdata SGPRs. */351assert(((input_vertex_size / 4) & ~0xff) == 0);352assert(((output_vertex_size / 4) & ~0xff) == 0);353assert(((input_patch_size / 4) & ~0x1fff) == 0);354assert(((output_patch_size / 4) & ~0x1fff) == 0);355assert(((output_patch0_offset / 16) & ~0xffff) == 0);356assert(((perpatch_output_offset / 16) & ~0xffff) == 0);357assert(num_tcs_input_cp <= 32);358assert(num_tcs_output_cp <= 32);359assert(*num_patches <= 64);360assert(((pervertex_output_patch_size * *num_patches) & ~0x1fffff) == 0);361362uint64_t ring_va = (unlikely(sctx->ws->cs_is_secure(&sctx->gfx_cs)) ?363si_resource(sctx->tess_rings_tmz) : si_resource(sctx->tess_rings))->gpu_address;364assert((ring_va & u_bit_consecutive(0, 19)) == 0);365366unsigned tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) |367S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4);368unsigned tcs_out_layout = (output_patch_size / 4) | (num_tcs_input_cp << 13) | ring_va;369unsigned tcs_out_offsets = (output_patch0_offset / 16) | ((perpatch_output_offset / 16) << 16);370unsigned offchip_layout =371(*num_patches - 1) | ((num_tcs_output_cp - 1) << 6) |372((pervertex_output_patch_size * *num_patches) << 11);373374/* Compute the LDS size. 
*/375unsigned lds_size = lds_per_patch * *num_patches;376377if (sctx->chip_class >= GFX7) {378assert(lds_size <= 65536);379lds_size = align(lds_size, 512) / 512;380} else {381assert(lds_size <= 32768);382lds_size = align(lds_size, 256) / 256;383}384385/* Set SI_SGPR_VS_STATE_BITS. */386sctx->current_vs_state &= C_VS_STATE_LS_OUT_PATCH_SIZE & C_VS_STATE_LS_OUT_VERTEX_SIZE;387sctx->current_vs_state |= tcs_in_layout;388389/* We should be able to support in-shader LDS use with LLVM >= 9390* by just adding the lds_sizes together, but it has never391* been tested. */392assert(ls_current->config.lds_size == 0);393394struct radeon_cmdbuf *cs = &sctx->gfx_cs;395radeon_begin(cs);396397if (sctx->chip_class >= GFX9) {398unsigned hs_rsrc2 = ls_current->config.rsrc2;399400if (sctx->chip_class >= GFX10)401hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size);402else403hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size);404405radeon_set_sh_reg(cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2);406407/* Set userdata SGPRs for merged LS-HS. */408radeon_set_sh_reg_seq(409cs, R_00B430_SPI_SHADER_USER_DATA_LS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3);410radeon_emit(cs, offchip_layout);411radeon_emit(cs, tcs_out_offsets);412radeon_emit(cs, tcs_out_layout);413} else {414unsigned ls_rsrc2 = ls_current->config.rsrc2;415416si_multiwave_lds_size_workaround(sctx->screen, &lds_size);417ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);418419/* Due to a hw bug, RSRC2_LS must be written twice with another420* LS register written in between. */421if (sctx->chip_class == GFX7 && sctx->family != CHIP_HAWAII)422radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);423radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);424radeon_emit(cs, ls_current->config.rsrc1);425radeon_emit(cs, ls_rsrc2);426427/* Set userdata SGPRs for TCS. 
*/428radeon_set_sh_reg_seq(429cs, R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);430radeon_emit(cs, offchip_layout);431radeon_emit(cs, tcs_out_offsets);432radeon_emit(cs, tcs_out_layout);433radeon_emit(cs, tcs_in_layout);434}435436/* Set userdata SGPRs for TES. */437radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2);438radeon_emit(cs, offchip_layout);439radeon_emit(cs, ring_va);440radeon_end();441442unsigned ls_hs_config =443S_028B58_NUM_PATCHES(*num_patches) |444S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |445S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);446447if (sctx->last_ls_hs_config != ls_hs_config) {448radeon_begin(cs);449if (sctx->chip_class >= GFX7) {450radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);451} else {452radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);453}454radeon_end_update_context_roll(sctx);455sctx->last_ls_hs_config = ls_hs_config;456}457}458459static unsigned si_num_prims_for_vertices(enum pipe_prim_type prim,460unsigned count, unsigned vertices_per_patch)461{462switch (prim) {463case PIPE_PRIM_PATCHES:464return count / vertices_per_patch;465case PIPE_PRIM_POLYGON:466/* It's a triangle fan with different edge flags. */467return count >= 3 ? count - 2 : 0;468case SI_PRIM_RECTANGLE_LIST:469return count / 3;470default:471return u_decomposed_prims_for_vertices(prim, count);472}473}474475static unsigned si_get_init_multi_vgt_param(struct si_screen *sscreen, union si_vgt_param_key *key)476{477STATIC_ASSERT(sizeof(union si_vgt_param_key) == 2);478unsigned max_primgroup_in_wave = 2;479480/* SWITCH_ON_EOP(0) is always preferable. */481bool wd_switch_on_eop = false;482bool ia_switch_on_eop = false;483bool ia_switch_on_eoi = false;484bool partial_vs_wave = false;485bool partial_es_wave = false;486487if (key->u.uses_tess) {488/* SWITCH_ON_EOI must be set if PrimID is used. 
*/489if (key->u.tess_uses_prim_id)490ia_switch_on_eoi = true;491492/* Bug with tessellation and GS on Bonaire and older 2 SE chips. */493if ((sscreen->info.family == CHIP_TAHITI || sscreen->info.family == CHIP_PITCAIRN ||494sscreen->info.family == CHIP_BONAIRE) &&495key->u.uses_gs)496partial_vs_wave = true;497498/* Needed for 028B6C_DISTRIBUTION_MODE != 0. (implies >= GFX8) */499if (sscreen->info.has_distributed_tess) {500if (key->u.uses_gs) {501if (sscreen->info.chip_class == GFX8)502partial_es_wave = true;503} else {504partial_vs_wave = true;505}506}507}508509/* This is a hardware requirement. */510if (key->u.line_stipple_enabled || (sscreen->debug_flags & DBG(SWITCH_ON_EOP))) {511ia_switch_on_eop = true;512wd_switch_on_eop = true;513}514515if (sscreen->info.chip_class >= GFX7) {516/* WD_SWITCH_ON_EOP has no effect on GPUs with less than517* 4 shader engines. Set 1 to pass the assertion below.518* The other cases are hardware requirements.519*520* Polaris supports primitive restart with WD_SWITCH_ON_EOP=0521* for points, line strips, and tri strips.522*/523if (sscreen->info.max_se <= 2 || key->u.prim == PIPE_PRIM_POLYGON ||524key->u.prim == PIPE_PRIM_LINE_LOOP || key->u.prim == PIPE_PRIM_TRIANGLE_FAN ||525key->u.prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY ||526(key->u.primitive_restart &&527(sscreen->info.family < CHIP_POLARIS10 ||528(key->u.prim != PIPE_PRIM_POINTS && key->u.prim != PIPE_PRIM_LINE_STRIP &&529key->u.prim != PIPE_PRIM_TRIANGLE_STRIP))) ||530key->u.count_from_stream_output)531wd_switch_on_eop = true;532533/* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.534* We don't know that for indirect drawing, so treat it as535* always problematic. 
*/536if (sscreen->info.family == CHIP_HAWAII && key->u.uses_instancing)537wd_switch_on_eop = true;538539/* Performance recommendation for 4 SE Gfx7-8 parts if540* instances are smaller than a primgroup.541* Assume indirect draws always use small instances.542* This is needed for good VS wave utilization.543*/544if (sscreen->info.chip_class <= GFX8 && sscreen->info.max_se == 4 &&545key->u.multi_instances_smaller_than_primgroup)546wd_switch_on_eop = true;547548/* Required on GFX7 and later. */549if (sscreen->info.max_se == 4 && !wd_switch_on_eop)550ia_switch_on_eoi = true;551552/* HW engineers suggested that PARTIAL_VS_WAVE_ON should be set553* to work around a GS hang.554*/555if (key->u.uses_gs &&556(sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI ||557sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 ||558sscreen->info.family == CHIP_POLARIS12 || sscreen->info.family == CHIP_VEGAM))559partial_vs_wave = true;560561/* Required by Hawaii and, for some special cases, by GFX8. */562if (ia_switch_on_eoi &&563(sscreen->info.family == CHIP_HAWAII ||564(sscreen->info.chip_class == GFX8 && (key->u.uses_gs || max_primgroup_in_wave != 2))))565partial_vs_wave = true;566567/* Instancing bug on Bonaire. */568if (sscreen->info.family == CHIP_BONAIRE && ia_switch_on_eoi && key->u.uses_instancing)569partial_vs_wave = true;570571/* This only applies to Polaris10 and later 4 SE chips.572* wd_switch_on_eop is already true on all other chips.573*/574if (!wd_switch_on_eop && key->u.primitive_restart)575partial_vs_wave = true;576577/* If the WD switch is false, the IA switch must be false too. */578assert(wd_switch_on_eop || !ia_switch_on_eop);579}580581/* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. 
*/582if (sscreen->info.chip_class <= GFX8 && ia_switch_on_eoi)583partial_es_wave = true;584585return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |586S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |587S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |588S_028AA8_WD_SWITCH_ON_EOP(sscreen->info.chip_class >= GFX7 ? wd_switch_on_eop : 0) |589/* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */590S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->info.chip_class == GFX8 ? max_primgroup_in_wave591: 0) |592S_030960_EN_INST_OPT_BASIC(sscreen->info.chip_class >= GFX9) |593S_030960_EN_INST_OPT_ADV(sscreen->info.chip_class >= GFX9);594}595596static void si_init_ia_multi_vgt_param_table(struct si_context *sctx)597{598for (int prim = 0; prim <= SI_PRIM_RECTANGLE_LIST; prim++)599for (int uses_instancing = 0; uses_instancing < 2; uses_instancing++)600for (int multi_instances = 0; multi_instances < 2; multi_instances++)601for (int primitive_restart = 0; primitive_restart < 2; primitive_restart++)602for (int count_from_so = 0; count_from_so < 2; count_from_so++)603for (int line_stipple = 0; line_stipple < 2; line_stipple++)604for (int uses_tess = 0; uses_tess < 2; uses_tess++)605for (int tess_uses_primid = 0; tess_uses_primid < 2; tess_uses_primid++)606for (int uses_gs = 0; uses_gs < 2; uses_gs++) {607union si_vgt_param_key key;608609key.index = 0;610key.u.prim = prim;611key.u.uses_instancing = uses_instancing;612key.u.multi_instances_smaller_than_primgroup = multi_instances;613key.u.primitive_restart = primitive_restart;614key.u.count_from_stream_output = count_from_so;615key.u.line_stipple_enabled = line_stipple;616key.u.uses_tess = uses_tess;617key.u.tess_uses_prim_id = tess_uses_primid;618key.u.uses_gs = uses_gs;619620sctx->ia_multi_vgt_param[key.index] =621si_get_init_multi_vgt_param(sctx->screen, &key);622}623}624625static bool si_is_line_stipple_enabled(struct si_context *sctx)626{627struct si_state_rasterizer *rs = 
sctx->queued.named.rasterizer;628629return rs->line_stipple_enable && sctx->current_rast_prim != PIPE_PRIM_POINTS &&630(rs->polygon_mode_is_lines || util_prim_is_lines(sctx->current_rast_prim));631}632633static bool num_instanced_prims_less_than(const struct pipe_draw_indirect_info *indirect,634enum pipe_prim_type prim,635unsigned min_vertex_count,636unsigned instance_count,637unsigned num_prims,638ubyte vertices_per_patch)639{640if (indirect) {641return indirect->buffer ||642(instance_count > 1 && indirect->count_from_stream_output);643} else {644return instance_count > 1 &&645si_num_prims_for_vertices(prim, min_vertex_count, vertices_per_patch) < num_prims;646}647}648649template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS> ALWAYS_INLINE650static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,651const struct pipe_draw_indirect_info *indirect,652enum pipe_prim_type prim, unsigned num_patches,653unsigned instance_count, bool primitive_restart,654unsigned min_vertex_count, ubyte vertices_per_patch)655{656union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;657unsigned primgroup_size;658unsigned ia_multi_vgt_param;659660if (HAS_TESS) {661primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */662} else if (HAS_GS) {663primgroup_size = 64; /* recommended with a GS */664} else {665primgroup_size = 128; /* recommended without a GS and tess */666}667668key.u.prim = prim;669key.u.uses_instancing = (indirect && indirect->buffer) || instance_count > 1;670key.u.multi_instances_smaller_than_primgroup =671num_instanced_prims_less_than(indirect, prim, min_vertex_count, instance_count,672primgroup_size, vertices_per_patch);673key.u.primitive_restart = primitive_restart;674key.u.count_from_stream_output = indirect && indirect->count_from_stream_output;675key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx);676677ia_multi_vgt_param =678sctx->ia_multi_vgt_param[key.index] | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 
1);679680if (HAS_GS) {681/* GS requirement. */682if (GFX_VERSION <= GFX8 &&683SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)684ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1);685686/* GS hw bug with single-primitive instances and SWITCH_ON_EOI.687* The hw doc says all multi-SE chips are affected, but Vulkan688* only applies it to Hawaii. Do what Vulkan does.689*/690if (GFX_VERSION == GFX7 &&691sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&692num_instanced_prims_less_than(indirect, prim, min_vertex_count, instance_count, 2,693vertices_per_patch))694sctx->flags |= SI_CONTEXT_VGT_FLUSH;695}696697return ia_multi_vgt_param;698}699700ALWAYS_INLINE701static unsigned si_conv_prim_to_gs_out(unsigned mode)702{703static const int prim_conv[] = {704[PIPE_PRIM_POINTS] = V_028A6C_POINTLIST,705[PIPE_PRIM_LINES] = V_028A6C_LINESTRIP,706[PIPE_PRIM_LINE_LOOP] = V_028A6C_LINESTRIP,707[PIPE_PRIM_LINE_STRIP] = V_028A6C_LINESTRIP,708[PIPE_PRIM_TRIANGLES] = V_028A6C_TRISTRIP,709[PIPE_PRIM_TRIANGLE_STRIP] = V_028A6C_TRISTRIP,710[PIPE_PRIM_TRIANGLE_FAN] = V_028A6C_TRISTRIP,711[PIPE_PRIM_QUADS] = V_028A6C_TRISTRIP,712[PIPE_PRIM_QUAD_STRIP] = V_028A6C_TRISTRIP,713[PIPE_PRIM_POLYGON] = V_028A6C_TRISTRIP,714[PIPE_PRIM_LINES_ADJACENCY] = V_028A6C_LINESTRIP,715[PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_LINESTRIP,716[PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_TRISTRIP,717[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_TRISTRIP,718[PIPE_PRIM_PATCHES] = V_028A6C_POINTLIST,719[SI_PRIM_RECTANGLE_LIST] = V_028A6C_RECTLIST,720};721assert(mode < ARRAY_SIZE(prim_conv));722723return prim_conv[mode];724}725726/* rast_prim is the primitive type after GS. 
*/727template<chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE728static void si_emit_rasterizer_prim_state(struct si_context *sctx)729{730struct radeon_cmdbuf *cs = &sctx->gfx_cs;731enum pipe_prim_type rast_prim = sctx->current_rast_prim;732struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;733734radeon_begin(cs);735736if (unlikely(si_is_line_stipple_enabled(sctx))) {737/* For lines, reset the stipple pattern at each primitive. Otherwise,738* reset the stipple pattern at each packet (line strips, line loops).739*/740bool reset_per_prim = rast_prim == PIPE_PRIM_LINES ||741rast_prim == PIPE_PRIM_LINES_ADJACENCY;742/* 0 = no reset, 1 = reset per prim, 2 = reset per packet */743unsigned value =744rs->pa_sc_line_stipple | S_028A0C_AUTO_RESET_CNTL(reset_per_prim ? 1 : 2);745746radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE, SI_TRACKED_PA_SC_LINE_STIPPLE,747value);748}749750unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim);751if (unlikely(gs_out_prim != sctx->last_gs_out_prim && (NGG || HAS_GS))) {752radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);753sctx->last_gs_out_prim = gs_out_prim;754}755756if (GFX_VERSION == GFX9)757radeon_end_update_context_roll(sctx);758else759radeon_end();760761if (NGG) {762struct si_shader *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current;763764if (hw_vs->uses_vs_state_provoking_vertex) {765unsigned vtx_index = rs->flatshade_first ? 
0 : gs_out_prim;766767sctx->current_vs_state &= C_VS_STATE_PROVOKING_VTX_INDEX;768sctx->current_vs_state |= S_VS_STATE_PROVOKING_VTX_INDEX(vtx_index);769}770771if (hw_vs->uses_vs_state_outprim) {772sctx->current_vs_state &= C_VS_STATE_OUTPRIM;773sctx->current_vs_state |= S_VS_STATE_OUTPRIM(gs_out_prim);774}775}776}777778template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>779ALWAYS_INLINE780static void si_emit_vs_state(struct si_context *sctx, unsigned index_size)781{782if (sctx->num_vs_blit_sgprs) {783/* Re-emit the state after we leave u_blitter. */784sctx->last_vs_state = ~0;785return;786}787788if (sctx->shader.vs.cso->info.uses_base_vertex) {789sctx->current_vs_state &= C_VS_STATE_INDEXED;790sctx->current_vs_state |= S_VS_STATE_INDEXED(!!index_size);791}792793if (sctx->current_vs_state != sctx->last_vs_state) {794struct radeon_cmdbuf *cs = &sctx->gfx_cs;795796/* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */797unsigned vs_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,798PIPE_SHADER_VERTEX);799radeon_begin(cs);800radeon_set_sh_reg(cs, vs_base + SI_SGPR_VS_STATE_BITS * 4,801sctx->current_vs_state);802803/* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage804* before the rasterizer.805*806* For TES or the GS copy shader without NGG:807*/808if (vs_base != R_00B130_SPI_SHADER_USER_DATA_VS_0) {809radeon_set_sh_reg(cs, R_00B130_SPI_SHADER_USER_DATA_VS_0 + SI_SGPR_VS_STATE_BITS * 4,810sctx->current_vs_state);811}812813/* For NGG: */814if (GFX_VERSION >= GFX10 && vs_base != R_00B230_SPI_SHADER_USER_DATA_GS_0) {815radeon_set_sh_reg(cs, R_00B230_SPI_SHADER_USER_DATA_GS_0 + SI_SGPR_VS_STATE_BITS * 4,816sctx->current_vs_state);817}818radeon_end();819820sctx->last_vs_state = sctx->current_vs_state;821}822}823824ALWAYS_INLINE825static bool si_prim_restart_index_changed(struct si_context *sctx, bool primitive_restart,826unsigned restart_index)827{828return primitive_restart && (restart_index != 
sctx->last_restart_index ||829sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN);830}831832template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS> ALWAYS_INLINE833static void si_emit_ia_multi_vgt_param(struct si_context *sctx,834const struct pipe_draw_indirect_info *indirect,835enum pipe_prim_type prim, unsigned num_patches,836unsigned instance_count, bool primitive_restart,837unsigned min_vertex_count, ubyte vertices_per_patch)838{839struct radeon_cmdbuf *cs = &sctx->gfx_cs;840unsigned ia_multi_vgt_param;841842ia_multi_vgt_param =843si_get_ia_multi_vgt_param<GFX_VERSION, HAS_TESS, HAS_GS>844(sctx, indirect, prim, num_patches, instance_count, primitive_restart,845min_vertex_count, vertices_per_patch);846847/* Draw state. */848if (ia_multi_vgt_param != sctx->last_multi_vgt_param) {849radeon_begin(cs);850851if (GFX_VERSION == GFX9)852radeon_set_uconfig_reg_idx(cs, sctx->screen, GFX_VERSION,853R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);854else if (GFX_VERSION >= GFX7)855radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);856else857radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);858859radeon_end();860861sctx->last_multi_vgt_param = ia_multi_vgt_param;862}863}864865/* GFX10 removed IA_MULTI_VGT_PARAM in exchange for GE_CNTL.866* We overload last_multi_vgt_param.867*/868template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE869static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches)870{871union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;872unsigned ge_cntl;873874if (NGG) {875if (HAS_TESS) {876ge_cntl = S_03096C_PRIM_GRP_SIZE(num_patches) |877S_03096C_VERT_GRP_SIZE(0) |878S_03096C_BREAK_WAVE_AT_EOI(key.u.tess_uses_prim_id);879} else {880ge_cntl = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->ge_cntl;881}882} else {883unsigned primgroup_size;884unsigned vertgroup_size;885886if (HAS_TESS) 
{887primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */888vertgroup_size = 0;889} else if (HAS_GS) {890unsigned vgt_gs_onchip_cntl = sctx->shader.gs.current->ctx_reg.gs.vgt_gs_onchip_cntl;891primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);892vertgroup_size = G_028A44_ES_VERTS_PER_SUBGRP(vgt_gs_onchip_cntl);893} else {894primgroup_size = 128; /* recommended without a GS and tess */895vertgroup_size = 0;896}897898ge_cntl = S_03096C_PRIM_GRP_SIZE(primgroup_size) | S_03096C_VERT_GRP_SIZE(vertgroup_size) |899S_03096C_BREAK_WAVE_AT_EOI(key.u.uses_tess && key.u.tess_uses_prim_id);900}901902ge_cntl |= S_03096C_PACKET_TO_ONE_PA(si_is_line_stipple_enabled(sctx));903904if (ge_cntl != sctx->last_multi_vgt_param) {905struct radeon_cmdbuf *cs = &sctx->gfx_cs;906907radeon_begin(cs);908radeon_set_uconfig_reg(cs, R_03096C_GE_CNTL, ge_cntl);909radeon_end();910sctx->last_multi_vgt_param = ge_cntl;911}912}913914template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE915static void si_emit_draw_registers(struct si_context *sctx,916const struct pipe_draw_indirect_info *indirect,917enum pipe_prim_type prim, unsigned num_patches,918unsigned instance_count, ubyte vertices_per_patch,919bool primitive_restart, unsigned restart_index,920unsigned min_vertex_count)921{922struct radeon_cmdbuf *cs = &sctx->gfx_cs;923924if (GFX_VERSION >= GFX10)925gfx10_emit_ge_cntl<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx, num_patches);926else927si_emit_ia_multi_vgt_param<GFX_VERSION, HAS_TESS, HAS_GS>928(sctx, indirect, prim, num_patches, instance_count, primitive_restart,929min_vertex_count, vertices_per_patch);930931radeon_begin(cs);932933if (prim != sctx->last_prim) {934unsigned vgt_prim = si_conv_pipe_prim(prim);935936if (GFX_VERSION >= GFX10)937radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim);938else if (GFX_VERSION >= GFX7)939radeon_set_uconfig_reg_idx(cs, sctx->screen, GFX_VERSION, 
R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim);940else941radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim);942943sctx->last_prim = prim;944}945946/* Primitive restart. */947if (primitive_restart != sctx->last_primitive_restart_en) {948if (GFX_VERSION >= GFX9)949radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart);950else951radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart);952953sctx->last_primitive_restart_en = primitive_restart;954}955if (si_prim_restart_index_changed(sctx, primitive_restart, restart_index)) {956radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, restart_index);957sctx->last_restart_index = restart_index;958if (GFX_VERSION == GFX9)959sctx->context_roll = true;960}961radeon_end();962}963964#define EMIT_SQTT_END_DRAW do { \965if (GFX_VERSION >= GFX9 && unlikely(sctx->thread_trace_enabled)) { \966radeon_begin(&sctx->gfx_cs); \967radeon_emit(&sctx->gfx_cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); \968radeon_emit(&sctx->gfx_cs, \969EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | \970EVENT_INDEX(0)); \971radeon_end(); \972} \973} while (0)974975template <chip_class GFX_VERSION, si_has_ngg NGG, si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>976static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info,977unsigned drawid_base,978const struct pipe_draw_indirect_info *indirect,979const struct pipe_draw_start_count_bias *draws,980unsigned num_draws, unsigned total_count,981struct pipe_resource *indexbuf, unsigned index_size,982unsigned index_offset, unsigned instance_count,983bool dispatch_prim_discard_cs, unsigned original_index_size)984{985struct radeon_cmdbuf *cs = &sctx->gfx_cs;986987if (unlikely(sctx->thread_trace_enabled)) {988si_sqtt_write_event_marker(sctx, &sctx->gfx_cs, sctx->sqtt_next_event,989UINT_MAX, UINT_MAX, UINT_MAX);990}991992uint32_t use_opaque = 0;993994if (indirect && indirect->count_from_stream_output) {995struct si_streamout_target *t 
= (struct si_streamout_target *)indirect->count_from_stream_output;996997radeon_begin(cs);998radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, t->stride_in_dw);999radeon_end();10001001si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_REG, NULL,1002R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, COPY_DATA_SRC_MEM,1003t->buf_filled_size, t->buf_filled_size_offset);1004use_opaque = S_0287F0_USE_OPAQUE(1);1005indirect = NULL;1006}10071008uint32_t index_max_size = 0;1009uint64_t index_va = 0;10101011radeon_begin(cs);10121013/* draw packet */1014if (index_size) {1015/* Register shadowing doesn't shadow INDEX_TYPE. */1016if (index_size != sctx->last_index_size || sctx->shadowed_regs) {1017unsigned index_type;10181019/* Index type computation. When we look at how we need to translate index_size,1020* we can see that we just need 2 shifts to get the hw value.1021*1022* 1 = 001b --> 10b = 21023* 2 = 010b --> 00b = 01024* 4 = 100b --> 01b = 11025*/1026index_type = ((index_size >> 2) | (index_size << 1)) & 0x3;10271028if (GFX_VERSION <= GFX7 && SI_BIG_ENDIAN) {1029/* GFX7 doesn't support ubyte indices. */1030index_type |= index_size == 2 ? V_028A7C_VGT_DMA_SWAP_16_BIT1031: V_028A7C_VGT_DMA_SWAP_32_BIT;1032}10331034if (GFX_VERSION >= GFX9) {1035radeon_set_uconfig_reg_idx(cs, sctx->screen, GFX_VERSION,1036R_03090C_VGT_INDEX_TYPE, 2, index_type);1037} else {1038radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));1039radeon_emit(cs, index_type);1040}10411042sctx->last_index_size = index_size;1043}10441045/* If !ALLOW_PRIM_DISCARD_CS, index_size == original_index_size. 
*/1046if (!ALLOW_PRIM_DISCARD_CS || original_index_size) {1047index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size);1048/* Skip draw calls with 0-sized index buffers.1049* They cause a hang on some chips, like Navi10-14.1050*/1051if (!index_max_size) {1052radeon_end();1053return;1054}10551056index_va = si_resource(indexbuf)->gpu_address + index_offset;10571058radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ,1059RADEON_PRIO_INDEX_BUFFER);1060}1061} else {1062/* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,1063* so the state must be re-emitted before the next indexed draw.1064*/1065if (GFX_VERSION >= GFX7)1066sctx->last_index_size = -1;1067}10681069unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX];1070bool render_cond_bit = sctx->render_cond_enabled;10711072if (indirect) {1073assert(num_draws == 1);1074uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address;10751076assert(indirect_va % 8 == 0);10771078si_invalidate_draw_constants(sctx);10791080radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));1081radeon_emit(cs, 1);1082radeon_emit(cs, indirect_va);1083radeon_emit(cs, indirect_va >> 32);10841085radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indirect->buffer),1086RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);10871088unsigned di_src_sel = index_size ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;10891090assert(indirect->offset % 4 == 0);10911092if (index_size) {1093radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));1094radeon_emit(cs, index_va);1095radeon_emit(cs, index_va >> 32);10961097radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));1098radeon_emit(cs, index_max_size);1099}11001101if (!sctx->screen->has_draw_indirect_multi) {1102radeon_emit(cs, PKT3(index_size ? 
PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3,1103render_cond_bit));1104radeon_emit(cs, indirect->offset);1105radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);1106radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);1107radeon_emit(cs, di_src_sel);1108} else {1109uint64_t count_va = 0;11101111if (indirect->indirect_draw_count) {1112struct si_resource *params_buf = si_resource(indirect->indirect_draw_count);11131114radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, params_buf, RADEON_USAGE_READ,1115RADEON_PRIO_DRAW_INDIRECT);11161117count_va = params_buf->gpu_address + indirect->indirect_draw_count_offset;1118}11191120radeon_emit(cs,1121PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8,1122render_cond_bit));1123radeon_emit(cs, indirect->offset);1124radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);1125radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);1126radeon_emit(cs, ((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) |1127S_2C3_DRAW_INDEX_ENABLE(sctx->shader.vs.cso->info.uses_drawid) |1128S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count));1129radeon_emit(cs, indirect->draw_count);1130radeon_emit(cs, count_va);1131radeon_emit(cs, count_va >> 32);1132radeon_emit(cs, indirect->stride);1133radeon_emit(cs, di_src_sel);1134}1135} else {1136/* Register shadowing requires that we always emit PKT3_NUM_INSTANCES. */1137if (sctx->shadowed_regs ||1138sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN ||1139sctx->last_instance_count != instance_count) {1140radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));1141radeon_emit(cs, instance_count);1142sctx->last_instance_count = instance_count;1143}11441145/* Base vertex and start instance. */1146int base_vertex = original_index_size ? 
draws[0].index_bias : draws[0].start;11471148bool set_draw_id = sctx->vs_uses_draw_id;1149bool set_base_instance = sctx->vs_uses_base_instance;11501151if (sctx->num_vs_blit_sgprs) {1152/* Re-emit draw constants after we leave u_blitter. */1153si_invalidate_draw_sh_constants(sctx);11541155/* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */1156radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, sctx->num_vs_blit_sgprs);1157radeon_emit_array(cs, sctx->vs_blit_sh_data, sctx->num_vs_blit_sgprs);1158} else if (base_vertex != sctx->last_base_vertex ||1159sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||1160(set_base_instance &&1161(info->start_instance != sctx->last_start_instance ||1162sctx->last_start_instance == SI_START_INSTANCE_UNKNOWN)) ||1163(set_draw_id &&1164(drawid_base != sctx->last_drawid ||1165sctx->last_drawid == SI_DRAW_ID_UNKNOWN)) ||1166sh_base_reg != sctx->last_sh_base_reg) {1167if (set_base_instance) {1168radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3);1169radeon_emit(cs, base_vertex);1170radeon_emit(cs, drawid_base);1171radeon_emit(cs, info->start_instance);11721173sctx->last_start_instance = info->start_instance;1174sctx->last_drawid = drawid_base;1175} else if (set_draw_id) {1176radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2);1177radeon_emit(cs, base_vertex);1178radeon_emit(cs, drawid_base);11791180sctx->last_drawid = drawid_base;1181} else {1182radeon_set_sh_reg(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, base_vertex);1183}11841185sctx->last_base_vertex = base_vertex;1186sctx->last_sh_base_reg = sh_base_reg;1187}11881189/* Don't update draw_id in the following code if it doesn't increment. 
*/1190bool increment_draw_id = num_draws > 1 && set_draw_id && info->increment_draw_id;11911192if (index_size) {1193if (ALLOW_PRIM_DISCARD_CS && dispatch_prim_discard_cs) {1194radeon_end();11951196si_dispatch_prim_discard_cs_and_draw(sctx, info, draws, num_draws,1197original_index_size, total_count, index_va,1198index_max_size);1199EMIT_SQTT_END_DRAW;1200return;1201}12021203/* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs1204* can be changed between draws, and GS fast launch must be disabled.1205* NOT_EOP doesn't work on gfx9 and older.1206*1207* Instead of doing this, which evaluates the case conditions repeatedly:1208* for (all draws) {1209* if (case1);1210* else;1211* }1212*1213* Use this structuring to evaluate the case conditions once:1214* if (case1) for (all draws);1215* else for (all draws);1216*1217*/1218bool index_bias_varies = num_draws > 1 && info->index_bias_varies;12191220if (increment_draw_id) {1221if (index_bias_varies) {1222for (unsigned i = 0; i < num_draws; i++) {1223uint64_t va = index_va + draws[i].start * index_size;12241225if (i > 0) {1226radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2);1227radeon_emit(cs, draws[i].index_bias);1228radeon_emit(cs, drawid_base + i);1229}12301231radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));1232radeon_emit(cs, index_max_size);1233radeon_emit(cs, va);1234radeon_emit(cs, va >> 32);1235radeon_emit(cs, draws[i].count);1236radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */1237}1238if (num_draws > 1) {1239sctx->last_base_vertex = draws[num_draws - 1].index_bias;1240sctx->last_drawid = drawid_base + num_draws - 1;1241}1242} else {1243/* Only DrawID varies. 
*/1244for (unsigned i = 0; i < num_draws; i++) {1245uint64_t va = index_va + draws[i].start * index_size;12461247if (i > 0)1248radeon_set_sh_reg(cs, sh_base_reg + SI_SGPR_DRAWID * 4, drawid_base + i);12491250radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));1251radeon_emit(cs, index_max_size);1252radeon_emit(cs, va);1253radeon_emit(cs, va >> 32);1254radeon_emit(cs, draws[i].count);1255radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */1256}1257if (num_draws > 1)1258sctx->last_drawid = drawid_base + num_draws - 1;1259}1260} else {1261if (info->index_bias_varies) {1262/* Only BaseVertex varies. */1263for (unsigned i = 0; i < num_draws; i++) {1264uint64_t va = index_va + draws[i].start * index_size;12651266if (i > 0)1267radeon_set_sh_reg(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, draws[i].index_bias);12681269radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));1270radeon_emit(cs, index_max_size);1271radeon_emit(cs, va);1272radeon_emit(cs, va >> 32);1273radeon_emit(cs, draws[i].count);1274radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */1275}1276if (num_draws > 1)1277sctx->last_base_vertex = draws[num_draws - 1].index_bias;1278} else {1279/* DrawID and BaseVertex are constant. 
*/1280if (GFX_VERSION == GFX10) {1281/* GFX10 has a bug that consecutive draw packets with NOT_EOP must not have1282* count == 0 in the last draw (which doesn't set NOT_EOP).1283*1284* So remove all trailing draws with count == 0.1285*/1286while (num_draws > 1 && !draws[num_draws - 1].count)1287num_draws--;1288}12891290for (unsigned i = 0; i < num_draws; i++) {1291uint64_t va = index_va + draws[i].start * index_size;12921293radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));1294radeon_emit(cs, index_max_size);1295radeon_emit(cs, va);1296radeon_emit(cs, va >> 32);1297radeon_emit(cs, draws[i].count);1298radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA |1299S_0287F0_NOT_EOP(GFX_VERSION >= GFX10 && i < num_draws - 1));1300}1301}1302}1303} else {1304/* Set the index buffer for fast launch. The VS prolog will load the indices. */1305if (NGG && sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) {1306index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size);13071308radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf),1309RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);1310uint64_t base_index_va = si_resource(indexbuf)->gpu_address + index_offset;13111312for (unsigned i = 0; i < num_draws; i++) {1313uint64_t index_va = base_index_va + draws[i].start * original_index_size;13141315radeon_set_sh_reg_seq(cs, R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS, 2);1316radeon_emit(cs, index_va);1317radeon_emit(cs, index_va >> 32);13181319if (i > 0) {1320if (increment_draw_id) {1321unsigned draw_id = drawid_base + i;13221323radeon_set_sh_reg(cs, sh_base_reg + SI_SGPR_DRAWID * 4, draw_id);1324sctx->last_drawid = draw_id;1325}1326}13271328/* TODO: Do index buffer bounds checking? We don't do it in this case. 
*/1329radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));1330radeon_emit(cs, draws[i].count);1331radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);1332}1333radeon_end();13341335EMIT_SQTT_END_DRAW;1336return;1337}13381339for (unsigned i = 0; i < num_draws; i++) {1340if (i > 0) {1341if (increment_draw_id) {1342unsigned draw_id = drawid_base + i;13431344radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2);1345radeon_emit(cs, draws[i].start);1346radeon_emit(cs, draw_id);13471348sctx->last_drawid = draw_id;1349} else {1350radeon_set_sh_reg(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, draws[i].start);1351}1352}13531354radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));1355radeon_emit(cs, draws[i].count);1356radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);1357}1358if (num_draws > 1 && !sctx->num_vs_blit_sgprs)1359sctx->last_base_vertex = draws[num_draws - 1].start;1360}1361}1362radeon_end();13631364EMIT_SQTT_END_DRAW;1365}13661367template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE1368static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx)1369{1370unsigned count = sctx->num_vertex_elements;1371bool pointer_dirty, user_sgprs_dirty;13721373assert(count <= SI_MAX_ATTRIBS);13741375if (sctx->vertex_buffers_dirty) {1376assert(count);13771378struct si_vertex_elements *velems = sctx->vertex_elements;1379unsigned alloc_size = velems->vb_desc_list_alloc_size;1380uint32_t *ptr;13811382if (alloc_size) {1383/* Vertex buffer descriptors are the only ones which are uploaded directly1384* and don't go through si_upload_graphics_shader_descriptors.1385*/1386u_upload_alloc(sctx->b.const_uploader, 0, alloc_size,1387si_optimal_tcc_alignment(sctx, alloc_size), &sctx->vb_descriptors_offset,1388(struct pipe_resource **)&sctx->vb_descriptors_buffer, (void **)&ptr);1389if (!sctx->vb_descriptors_buffer) {1390sctx->vb_descriptors_offset = 0;1391sctx->vb_descriptors_gpu_list = 
NULL;1392return false;1393}13941395sctx->vb_descriptors_gpu_list = ptr;1396radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->vb_descriptors_buffer,1397RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);1398/* GFX6 doesn't support the L2 prefetch. */1399if (GFX_VERSION >= GFX7)1400si_cp_dma_prefetch(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,1401alloc_size);1402} else {1403si_resource_reference(&sctx->vb_descriptors_buffer, NULL);1404}14051406unsigned first_vb_use_mask = velems->first_vb_use_mask;1407unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs;14081409for (unsigned i = 0; i < count; i++) {1410struct pipe_vertex_buffer *vb;1411struct si_resource *buf;1412unsigned vbo_index = velems->vertex_buffer_index[i];1413uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4]1414: &ptr[(i - num_vbos_in_user_sgprs) * 4];14151416vb = &sctx->vertex_buffer[vbo_index];1417buf = si_resource(vb->buffer.resource);1418if (!buf) {1419memset(desc, 0, 16);1420continue;1421}14221423int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[i];14241425if (offset >= buf->b.b.width0) {1426assert(offset < buf->b.b.width0);1427memset(desc, 0, 16);1428continue;1429}14301431uint64_t va = buf->gpu_address + offset;14321433int64_t num_records = (int64_t)buf->b.b.width0 - offset;1434if (GFX_VERSION != GFX8 && vb->stride) {1435/* Round up by rounding down and adding 1 */1436num_records = (num_records - velems->format_size[i]) / vb->stride + 1;1437}1438assert(num_records >= 0 && num_records <= UINT_MAX);14391440uint32_t rsrc_word3 = velems->rsrc_word3[i];14411442/* OOB_SELECT chooses the out-of-bounds check:1443* - 1: index >= NUM_RECORDS (Structured)1444* - 3: offset >= NUM_RECORDS (Raw)1445*/1446if (GFX_VERSION >= GFX10)1447rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? 
V_008F0C_OOB_SELECT_STRUCTURED1448: V_008F0C_OOB_SELECT_RAW);14491450desc[0] = va;1451desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride);1452desc[2] = num_records;1453desc[3] = rsrc_word3;14541455if (first_vb_use_mask & (1 << i)) {1456radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(vb->buffer.resource),1457RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);1458}1459}14601461sctx->vertex_buffers_dirty = false;14621463pointer_dirty = alloc_size != 0;1464user_sgprs_dirty = num_vbos_in_user_sgprs > 0;1465} else {1466pointer_dirty = sctx->vertex_buffer_pointer_dirty;1467user_sgprs_dirty = sctx->vertex_buffer_user_sgprs_dirty;1468}14691470if (pointer_dirty || user_sgprs_dirty) {1471struct radeon_cmdbuf *cs = &sctx->gfx_cs;1472unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs;1473unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,1474PIPE_SHADER_VERTEX);1475assert(count);14761477radeon_begin(cs);14781479/* Set the pointer to vertex buffer descriptors. */1480if (pointer_dirty && count > num_vbos_in_user_sgprs) {1481/* Find the location of the VB descriptor pointer. */1482unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR;1483if (GFX_VERSION >= GFX9) {1484if (HAS_TESS)1485sh_dw_offset = GFX9_TCS_NUM_USER_SGPR;1486else if (HAS_GS)1487sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR;1488}14891490radeon_set_sh_reg(cs, sh_base + sh_dw_offset * 4,1491sctx->vb_descriptors_buffer->gpu_address +1492sctx->vb_descriptors_offset);1493sctx->vertex_buffer_pointer_dirty = false;1494}14951496/* Set VB descriptors in user SGPRs. 
*/1497if (user_sgprs_dirty) {1498assert(num_vbos_in_user_sgprs);14991500unsigned num_sgprs = MIN2(count, num_vbos_in_user_sgprs) * 4;15011502radeon_set_sh_reg_seq(cs, sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4, num_sgprs);1503radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_sgprs);1504sctx->vertex_buffer_user_sgprs_dirty = false;1505}1506radeon_end();1507}15081509return true;1510}15111512static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_draw_info *info,1513const struct pipe_draw_indirect_info *indirect,1514const struct pipe_draw_start_count_bias *draws,1515unsigned num_draws, unsigned *start, unsigned *count)1516{1517if (indirect && !indirect->count_from_stream_output) {1518unsigned indirect_count;1519struct pipe_transfer *transfer;1520unsigned begin, end;1521unsigned map_size;1522unsigned *data;15231524if (indirect->indirect_draw_count) {1525data = (unsigned*)1526pipe_buffer_map_range(&sctx->b, indirect->indirect_draw_count,1527indirect->indirect_draw_count_offset, sizeof(unsigned),1528PIPE_MAP_READ, &transfer);15291530indirect_count = *data;15311532pipe_buffer_unmap(&sctx->b, transfer);1533} else {1534indirect_count = indirect->draw_count;1535}15361537if (!indirect_count) {1538*start = *count = 0;1539return;1540}15411542map_size = (indirect_count - 1) * indirect->stride + 3 * sizeof(unsigned);1543data = (unsigned*)1544pipe_buffer_map_range(&sctx->b, indirect->buffer, indirect->offset, map_size,1545PIPE_MAP_READ, &transfer);15461547begin = UINT_MAX;1548end = 0;15491550for (unsigned i = 0; i < indirect_count; ++i) {1551unsigned count = data[0];1552unsigned start = data[2];15531554if (count > 0) {1555begin = MIN2(begin, start);1556end = MAX2(end, start + count);1557}15581559data += indirect->stride / sizeof(unsigned);1560}15611562pipe_buffer_unmap(&sctx->b, transfer);15631564if (begin < end) {1565*start = begin;1566*count = end - begin;1567} else {1568*start = *count = 0;1569}1570} else {1571unsigned min_element = 
UINT_MAX;1572unsigned max_element = 0;15731574for (unsigned i = 0; i < num_draws; i++) {1575min_element = MIN2(min_element, draws[i].start);1576max_element = MAX2(max_element, draws[i].start + draws[i].count);1577}15781579*start = min_element;1580*count = max_element - min_element;1581}1582}15831584template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>1585static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,1586const struct pipe_draw_indirect_info *indirect,1587enum pipe_prim_type prim, unsigned instance_count,1588unsigned min_vertex_count, bool primitive_restart,1589unsigned skip_atom_mask)1590{1591unsigned num_patches = 0;15921593si_emit_rasterizer_prim_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx);1594if (HAS_TESS)1595si_emit_derived_tess_state(sctx, info->vertices_per_patch, &num_patches);15961597/* Emit state atoms. */1598unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;1599if (mask) {1600do {1601sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);1602} while (mask);16031604sctx->dirty_atoms &= skip_atom_mask;1605}16061607/* Emit states. */1608mask = sctx->dirty_states;1609if (mask) {1610do {1611unsigned i = u_bit_scan(&mask);1612struct si_pm4_state *state = sctx->queued.array[i];16131614/* All places should unset dirty_states if this doesn't pass. */1615assert(state && state != sctx->emitted.array[i]);16161617si_pm4_emit(sctx, state);1618sctx->emitted.array[i] = state;1619} while (mask);16201621sctx->dirty_states = 0;1622}16231624/* Emit draw states. 
*/1625si_emit_vs_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx, info->index_size);1626si_emit_draw_registers<GFX_VERSION, HAS_TESS, HAS_GS, NGG>1627(sctx, indirect, prim, num_patches, instance_count, info->vertices_per_patch,1628primitive_restart, info->restart_index, min_vertex_count);1629}16301631static bool si_all_vs_resources_read_only(struct si_context *sctx, struct pipe_resource *indexbuf)1632{1633struct radeon_winsys *ws = sctx->ws;1634struct radeon_cmdbuf *cs = &sctx->gfx_cs;1635struct si_descriptors *buffers =1636&sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)];1637struct si_shader_selector *vs = sctx->shader.vs.cso;1638struct si_vertex_elements *velems = sctx->vertex_elements;1639unsigned num_velems = velems->count;1640unsigned num_images = vs->info.base.num_images;16411642/* Index buffer. */1643if (indexbuf && ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, RADEON_USAGE_WRITE))1644goto has_write_reference;16451646/* Vertex buffers. */1647for (unsigned i = 0; i < num_velems; i++) {1648if (!((1 << i) & velems->first_vb_use_mask))1649continue;16501651unsigned vb_index = velems->vertex_buffer_index[i];1652struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource;1653if (!res)1654continue;16551656if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))1657goto has_write_reference;1658}16591660/* Constant and shader buffers. */1661for (unsigned i = 0; i < buffers->num_active_slots; i++) {1662unsigned index = buffers->first_active_slot + i;1663struct pipe_resource *res = sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index];1664if (!res)1665continue;16661667if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))1668goto has_write_reference;1669}16701671/* Samplers. 
*/1672if (vs->info.base.textures_used[0]) {1673unsigned num_samplers = BITSET_LAST_BIT(vs->info.base.textures_used);16741675for (unsigned i = 0; i < num_samplers; i++) {1676struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i];1677if (!view)1678continue;16791680if (ws->cs_is_buffer_referenced(cs, si_resource(view->texture)->buf, RADEON_USAGE_WRITE))1681goto has_write_reference;1682}1683}16841685/* Images. */1686if (num_images) {1687for (unsigned i = 0; i < num_images; i++) {1688struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource;1689if (!res)1690continue;16911692if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))1693goto has_write_reference;1694}1695}16961697return true;16981699has_write_reference:1700/* If the current gfx IB has enough packets, flush it to remove write1701* references to buffers.1702*/1703if (cs->prev_dw + cs->current.cdw > 2048) {1704si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);1705assert(si_all_vs_resources_read_only(sctx, indexbuf));1706return true;1707}1708return false;1709}17101711static ALWAYS_INLINE bool pd_msg(const char *s)1712{1713if (SI_PRIM_DISCARD_DEBUG)1714printf("PD failed: %s\n", s);1715return false;1716}17171718#define DRAW_CLEANUP do { \1719if (index_size && indexbuf != info->index.resource) \1720pipe_resource_reference(&indexbuf, NULL); \1721} while (0)17221723template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,1724si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>1725static void si_draw_vbo(struct pipe_context *ctx,1726const struct pipe_draw_info *info,1727unsigned drawid_offset,1728const struct pipe_draw_indirect_info *indirect,1729const struct pipe_draw_start_count_bias *draws,1730unsigned num_draws)1731{1732/* Keep code that uses the least number of local variables as close to the beginning1733* of this function as possible to minimize register pressure.1734*1735* It doesn't matter where we 
return due to invalid parameters because such cases1736* shouldn't occur in practice.1737*/1738struct si_context *sctx = (struct si_context *)ctx;17391740/* Recompute and re-emit the texture resource states if needed. */1741unsigned dirty_tex_counter = p_atomic_read(&sctx->screen->dirty_tex_counter);1742if (unlikely(dirty_tex_counter != sctx->last_dirty_tex_counter)) {1743sctx->last_dirty_tex_counter = dirty_tex_counter;1744sctx->framebuffer.dirty_cbufs |= ((1 << sctx->framebuffer.state.nr_cbufs) - 1);1745sctx->framebuffer.dirty_zsbuf = true;1746si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);1747si_update_all_texture_descriptors(sctx);1748}17491750unsigned dirty_buf_counter = p_atomic_read(&sctx->screen->dirty_buf_counter);1751if (unlikely(dirty_buf_counter != sctx->last_dirty_buf_counter)) {1752sctx->last_dirty_buf_counter = dirty_buf_counter;1753/* Rebind all buffers unconditionally. */1754si_rebind_buffer(sctx, NULL);1755}17561757si_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS));1758si_need_gfx_cs_space(sctx, num_draws);17591760if (HAS_TESS) {1761struct si_shader_selector *tcs = sctx->shader.tcs.cso;17621763/* The rarely occuring tcs == NULL case is not optimized. */1764bool same_patch_vertices =1765GFX_VERSION >= GFX9 &&1766tcs && info->vertices_per_patch == tcs->info.base.tess.tcs_vertices_out;17671768if (sctx->same_patch_vertices != same_patch_vertices) {1769sctx->same_patch_vertices = same_patch_vertices;1770sctx->do_update_shaders = true;1771}17721773if (GFX_VERSION == GFX9 && sctx->screen->info.has_ls_vgpr_init_bug) {1774/* Determine whether the LS VGPR fix should be applied.1775*1776* It is only required when num input CPs > num output CPs,1777* which cannot happen with the fixed function TCS. 
We should1778* also update this bit when switching from TCS to fixed1779* function TCS.1780*/1781bool ls_vgpr_fix =1782tcs && info->vertices_per_patch > tcs->info.base.tess.tcs_vertices_out;17831784if (ls_vgpr_fix != sctx->ls_vgpr_fix) {1785sctx->ls_vgpr_fix = ls_vgpr_fix;1786sctx->do_update_shaders = true;1787}1788}1789}17901791enum pipe_prim_type prim = info->mode;1792unsigned instance_count = info->instance_count;17931794/* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is1795* no workaround for indirect draws, but we can at least skip1796* direct draws.1797* 'instance_count == 0' seems to be problematic on Renoir chips (#4866),1798* so simplify the condition and drop these draws for all <= GFX9 chips.1799*/1800if (GFX_VERSION <= GFX9 && unlikely(!indirect && !instance_count))1801return;18021803struct si_shader_selector *vs = sctx->shader.vs.cso;1804if (unlikely(!vs || sctx->num_vertex_elements < vs->num_vs_inputs ||1805!sctx->shader.ps.cso || (HAS_TESS != (prim == PIPE_PRIM_PATCHES)))) {1806assert(0);1807return;1808}18091810if (GFX_VERSION <= GFX9 && HAS_GS) {1811/* Determine whether the GS triangle strip adjacency fix should1812* be applied. Rotate every other triangle if triangle strips with1813* adjacency are fed to the GS. This doesn't work if primitive1814* restart occurs after an odd number of triangles.1815*/1816bool gs_tri_strip_adj_fix =1817!HAS_TESS && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;18181819if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {1820sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;1821sctx->do_update_shaders = true;1822}1823}18241825struct pipe_resource *indexbuf = info->index.resource;1826unsigned index_size = info->index_size;1827unsigned index_offset = indirect && indirect->buffer ? draws[0].start * index_size : 0;18281829if (index_size) {1830/* Translate or upload, if needed. */1831/* 8-bit indices are supported on GFX8. 
*/1832if (GFX_VERSION <= GFX7 && index_size == 1) {1833unsigned start, count, start_offset, size, offset;1834void *ptr;18351836si_get_draw_start_count(sctx, info, indirect, draws, num_draws, &start, &count);1837start_offset = start * 2;1838size = count * 2;18391840indexbuf = NULL;1841u_upload_alloc(ctx->stream_uploader, start_offset, size,1842si_optimal_tcc_alignment(sctx, size), &offset, &indexbuf, &ptr);1843if (unlikely(!indexbuf))1844return;18451846util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0, index_offset + start, count, ptr);18471848/* info->start will be added by the drawing code */1849index_offset = offset - start_offset;1850index_size = 2;1851} else if (info->has_user_indices) {1852unsigned start_offset;18531854assert(!indirect);1855assert(num_draws == 1);1856start_offset = draws[0].start * index_size;18571858indexbuf = NULL;1859u_upload_data(ctx->stream_uploader, start_offset, draws[0].count * index_size,1860sctx->screen->info.tcc_cache_line_size,1861(char *)info->index.user + start_offset, &index_offset, &indexbuf);1862if (unlikely(!indexbuf))1863return;18641865/* info->start will be added by the drawing code */1866index_offset -= start_offset;1867} else if (GFX_VERSION <= GFX7 && si_resource(indexbuf)->TC_L2_dirty) {1868/* GFX8 reads index buffers through TC L2, so it doesn't1869* need this. */1870sctx->flags |= SI_CONTEXT_WB_L2;1871si_resource(indexbuf)->TC_L2_dirty = false;1872}1873}18741875unsigned min_direct_count = 0;1876unsigned total_direct_count = 0;18771878if (indirect) {1879/* Add the buffer size for memory checking in need_cs_space. */1880if (indirect->buffer)1881si_context_add_resource_size(sctx, indirect->buffer);18821883/* Indirect buffers use TC L2 on GFX9, but not older hw. 
*/1884if (GFX_VERSION <= GFX8) {1885if (indirect->buffer && si_resource(indirect->buffer)->TC_L2_dirty) {1886sctx->flags |= SI_CONTEXT_WB_L2;1887si_resource(indirect->buffer)->TC_L2_dirty = false;1888}18891890if (indirect->indirect_draw_count &&1891si_resource(indirect->indirect_draw_count)->TC_L2_dirty) {1892sctx->flags |= SI_CONTEXT_WB_L2;1893si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;1894}1895}1896} else {1897total_direct_count = min_direct_count = draws[0].count;18981899for (unsigned i = 1; i < num_draws; i++) {1900unsigned count = draws[i].count;19011902total_direct_count += count;1903min_direct_count = MIN2(min_direct_count, count);1904}1905}19061907struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;1908bool primitive_restart =1909info->primitive_restart &&1910(!sctx->screen->options.prim_restart_tri_strips_only ||1911(prim != PIPE_PRIM_TRIANGLE_STRIP && prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY));1912bool dispatch_prim_discard_cs = false;1913bool prim_discard_cs_instancing = false;1914unsigned original_index_size = index_size;19151916/* Determine if we can use the primitive discard compute shader. */1917/* TODO: this requires that primitives can be drawn out of order, so check depth/stencil/blend states. */1918if (ALLOW_PRIM_DISCARD_CS &&1919(total_direct_count > sctx->prim_discard_vertex_count_threshold1920? (sctx->compute_num_verts_rejected += total_direct_count, true)1921: /* Add, then return true. */1922(sctx->compute_num_verts_ineligible += total_direct_count,1923false)) && /* Add, then return false. */1924(!primitive_restart || pd_msg("primitive restart")) &&1925/* Supported prim types. */1926(1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP)) &&1927/* Instancing is limited to 16-bit indices, because InstanceID is packed into VertexID. */1928/* Instanced index_size == 0 requires that start + count < USHRT_MAX, so just reject it. 
*/1929(instance_count == 1 ||1930(instance_count <= USHRT_MAX && index_size && index_size <= 2) ||1931pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced")) &&1932((drawid_offset == 0 && (num_draws == 1 || !info->increment_draw_id)) ||1933!sctx->shader.vs.cso->info.uses_drawid || pd_msg("draw_id > 0")) &&1934(!sctx->render_cond || pd_msg("render condition")) &&1935/* Forced enablement ignores pipeline statistics queries. */1936(sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) ||1937(!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||1938pd_msg("pipestat or primgen query")) &&1939(!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&1940(!sctx->shader.ps.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&1941!rs->polygon_mode_enabled &&1942#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */1943(!sctx->shader.vs.cso->info.uses_bindless_images || pd_msg("uses bindless images")) &&1944(!sctx->shader.vs.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) &&1945(!sctx->shader.vs.cso->info.base.writes_memory || pd_msg("writes memory")) &&1946(!sctx->shader.vs.cso->info.writes_viewport_index || pd_msg("writes viewport index")) &&1947!sctx->shader.vs.cso->info.base.vs.window_space_position &&1948!sctx->shader.vs.cso->so.num_outputs &&1949#else1950(sctx->shader.vs.cso->prim_discard_cs_allowed ||1951pd_msg("VS shader uses unsupported features")) &&1952#endif1953/* Check that all buffers are used for read only, because compute1954* dispatches can run ahead. */1955(si_all_vs_resources_read_only(sctx, index_size ? 
indexbuf : NULL) ||1956pd_msg("write reference"))) {1957switch (si_prepare_prim_discard_or_split_draw(sctx, info, drawid_offset, draws, num_draws,1958total_direct_count)) {1959case SI_PRIM_DISCARD_ENABLED:1960original_index_size = index_size;1961prim_discard_cs_instancing = instance_count > 1;1962dispatch_prim_discard_cs = true;19631964/* The compute shader changes/lowers the following: */1965prim = PIPE_PRIM_TRIANGLES;1966index_size = 4;1967instance_count = 1;1968sctx->compute_num_verts_rejected -= total_direct_count;1969sctx->compute_num_verts_accepted += total_direct_count;1970break;1971case SI_PRIM_DISCARD_DISABLED:1972break;1973case SI_PRIM_DISCARD_DRAW_SPLIT:1974case SI_PRIM_DISCARD_MULTI_DRAW_SPLIT:1975sctx->compute_num_verts_rejected -= total_direct_count;1976/* The multi draw was split into multiple ones and executed. Return. */1977DRAW_CLEANUP;1978return;1979}1980}19811982if (ALLOW_PRIM_DISCARD_CS &&1983prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) {1984sctx->prim_discard_cs_instancing = prim_discard_cs_instancing;1985sctx->do_update_shaders = true;1986}19871988/* Set the rasterization primitive type.1989*1990* This must be done after si_decompress_textures, which can call1991* draw_vbo recursively, and before si_update_shaders, which uses1992* current_rast_prim for this draw_vbo call.1993*/1994if (!HAS_GS && !HAS_TESS) {1995enum pipe_prim_type rast_prim;19961997if (util_rast_prim_is_triangles(prim)) {1998rast_prim = PIPE_PRIM_TRIANGLES;1999} else {2000/* Only possibilities, POINTS, LINE*, RECTANGLES */2001rast_prim = prim;2002}20032004if (rast_prim != sctx->current_rast_prim) {2005if (util_prim_is_points_or_lines(sctx->current_rast_prim) !=2006util_prim_is_points_or_lines(rast_prim))2007si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);20082009sctx->current_rast_prim = rast_prim;2010sctx->do_update_shaders = true;2011}2012}20132014/* Update NGG culling settings. 
*/2015uint8_t old_ngg_culling = sctx->ngg_culling;2016if (GFX_VERSION >= GFX10) {2017struct si_shader_selector *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso;20182019if (NGG && !HAS_GS && !dispatch_prim_discard_cs &&2020/* Tessellation sets ngg_cull_vert_threshold to UINT_MAX if the prim type2021* is not triangles, so this check is only needed without tessellation. */2022(HAS_TESS || sctx->current_rast_prim == PIPE_PRIM_TRIANGLES) &&2023total_direct_count > hw_vs->ngg_cull_vert_threshold) {2024uint8_t ngg_culling = sctx->viewport0_y_inverted ? rs->ngg_cull_flags_y_inverted :2025rs->ngg_cull_flags;20262027/* Use NGG fast launch for certain primitive types.2028* A draw must have at least 1 full primitive.2029* The fast launch doesn't work with tessellation.2030*2031* Small instances (including small draws) don't perform well with fast launch.2032* It's better to use normal launch with NOT_EOP for small draws, and it's2033* always better to use normal launch for small instances.2034*/2035if (!HAS_TESS && ngg_culling && min_direct_count >= 64 &&2036!(sctx->screen->debug_flags & DBG(NO_FAST_LAUNCH))) {2037if (prim == PIPE_PRIM_TRIANGLES && !index_size) {2038ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;2039} else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {2040if (!index_size) {2041ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP;2042} else if (!primitive_restart) {2043ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP |2044SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(MIN2(index_size, 3));2045/* The index buffer will be emulated. */2046index_size = 0;2047}2048}2049}20502051if (ngg_culling != old_ngg_culling) {2052/* If shader compilation is not ready, this setting will be rejected. 
*/2053sctx->ngg_culling = ngg_culling;2054sctx->do_update_shaders = true;2055}2056} else if (old_ngg_culling) {2057sctx->ngg_culling = 0;2058sctx->do_update_shaders = true;2059}2060}20612062if (unlikely(sctx->do_update_shaders)) {2063if (unlikely(!si_update_shaders(sctx))) {2064DRAW_CLEANUP;2065return;2066}20672068/* Insert a VGT_FLUSH when enabling fast launch changes to prevent hangs.2069* See issues #2418, #2426, #24342070*2071* This is the setting that is used by the draw.2072*/2073if (GFX_VERSION >= GFX10) {2074uint8_t ngg_culling = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->key.opt.ngg_culling;2075if (GFX_VERSION == GFX10 &&2076!(old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) &&2077ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)2078sctx->flags |= SI_CONTEXT_VGT_FLUSH;20792080if (old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) &&2081!(ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))) {2082/* Need to re-set these, because we have bound an index buffer there. */2083sctx->shader_pointers_dirty |=2084(1u << si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_GEOMETRY)) |2085(1u << si_sampler_and_image_descriptors_idx(PIPE_SHADER_GEOMETRY));2086si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);2087}20882089/* Set this to the correct value determined by si_update_shaders. 
*/2090sctx->ngg_culling = ngg_culling;2091}2092}20932094/* Since we've called si_context_add_resource_size for vertex buffers,2095* this must be called after si_need_cs_space, because we must let2096* need_cs_space flush before we add buffers to the buffer list.2097*2098* This must be done after si_update_shaders because si_update_shaders can2099* flush the CS when enabling tess and GS rings.2100*/2101if (sctx->bo_list_add_all_gfx_resources)2102si_gfx_resources_add_all_to_bo_list(sctx);21032104/* Graphics shader descriptors must be uploaded after si_update_shaders because2105* it binds tess and GS ring buffers.2106*/2107if (unlikely(!si_upload_graphics_shader_descriptors(sctx))) {2108DRAW_CLEANUP;2109return;2110}21112112/* Vega10/Raven scissor bug workaround. When any context register is2113* written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR2114* registers must be written too.2115*/2116unsigned masked_atoms = 0;2117bool gfx9_scissor_bug = false;21182119if (GFX_VERSION == GFX9 && sctx->screen->info.has_gfx9_scissor_bug) {2120masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);2121gfx9_scissor_bug = true;21222123if ((indirect && indirect->count_from_stream_output) ||2124sctx->dirty_atoms & si_atoms_that_always_roll_context() ||2125sctx->dirty_states & si_states_that_always_roll_context())2126sctx->context_roll = true;2127}21282129/* Use optimal packet order based on whether we need to sync the pipeline. */2130if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB |2131SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |2132SI_CONTEXT_VS_PARTIAL_FLUSH))) {2133/* If we have to wait for idle, set all states first, so that all2134* SET packets are processed in parallel with previous draw calls.2135* Then draw and prefetch at the end. 
This ensures that the time2136* the CUs are idle is very short.2137*/2138if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND))2139masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);21402141/* Emit all states except possibly render condition. */2142si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG>2143(sctx, info, indirect, prim, instance_count, min_direct_count,2144primitive_restart, masked_atoms);2145sctx->emit_cache_flush(sctx, &sctx->gfx_cs);2146/* <-- CUs are idle here. */21472148/* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.2149* It should done after cache flushing.2150*/2151if (unlikely((!si_upload_and_prefetch_VB_descriptors<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) {2152DRAW_CLEANUP;2153return;2154}21552156if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {2157sctx->atoms.s.render_cond.emit(sctx);2158sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);2159}21602161if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&2162(sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {2163sctx->atoms.s.scissors.emit(sctx);2164sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);2165}2166assert(sctx->dirty_atoms == 0);21672168si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>2169(sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,2170index_size, index_offset, instance_count, dispatch_prim_discard_cs,2171original_index_size);2172/* <-- CUs are busy here. */21732174/* Start prefetches after the draw has been started. 
Both will run2175* in parallel, but starting the draw first is more important.2176*/2177si_prefetch_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_ALL>(sctx);2178} else {2179/* If we don't wait for idle, start prefetches first, then set2180* states, and draw at the end.2181*/2182if (sctx->flags)2183sctx->emit_cache_flush(sctx, &sctx->gfx_cs);21842185/* Only prefetch the API VS and VBO descriptors. */2186si_prefetch_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_BEFORE_DRAW>(sctx);21872188/* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.2189* It should done after cache flushing and after the VS prefetch.2190*/2191if (unlikely((!si_upload_and_prefetch_VB_descriptors<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) {2192DRAW_CLEANUP;2193return;2194}21952196si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG>2197(sctx, info, indirect, prim, instance_count, min_direct_count,2198primitive_restart, masked_atoms);21992200if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&2201(sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {2202sctx->atoms.s.scissors.emit(sctx);2203sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);2204}2205assert(sctx->dirty_atoms == 0);22062207si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>2208(sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,2209index_size, index_offset, instance_count, dispatch_prim_discard_cs,2210original_index_size);22112212/* Prefetch the remaining shaders after the draw has been2213* started. 
*/
      si_prefetch_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_AFTER_DRAW>(sctx);
   }

   /* Clear the context roll flag after the draw call.
    * Only used by the gfx9 scissor bug.
    */
   if (GFX_VERSION == GFX9)
      sctx->context_roll = false;

   /* When CS tracing is active, record this draw in the trace/log. */
   if (unlikely(sctx->current_saved_cs)) {
      si_trace_emit(sctx);
      si_log_draw_state(sctx, sctx->log);
   }

   /* Workaround for a VGT hang when streamout is enabled.
    * It must be done after drawing. */
   if (((GFX_VERSION == GFX7 && sctx->family == CHIP_HAWAII) ||
        (GFX_VERSION == GFX8 && (sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI))) &&
       si_get_strmout_en(sctx)) {
      sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
   }

   /* Bump draw statistics. Draws issued while decompression blits are running
    * are counted separately from application draws. */
   if (unlikely(sctx->decompression_enabled)) {
      sctx->num_decompress_calls++;
   } else {
      sctx->num_draw_calls++;
      if (primitive_restart)
         sctx->num_prim_restart_calls++;
   }

   /* A non-blitter draw with a bound ZS surface may overwrite cleared depth,
    * so drop the "depth cleared" bit for the bound mip level. */
   if (!sctx->blitter_running && sctx->framebuffer.state.zsbuf) {
      struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
      zstex->depth_cleared_level_mask &= ~BITFIELD_BIT(sctx->framebuffer.state.zsbuf->u.tex.level);
   }

   /* TODO: Set displayable_dcc_dirty if image stores are used. */

   DRAW_CLEANUP;
}

/* util_blitter "draw_rectangle" callback.
 *
 * Draws a screen-space rectangle as a single 3-vertex SI_PRIM_RECTANGLE_LIST
 * primitive. The coordinates and the optional per-draw attribute (color or
 * texcoords) are passed to the blit vertex shader through vs_blit_sh_data
 * (user SGPRs), so no vertex buffers are needed.
 *
 * Note: vertex_elements_cso and get_vs are unused here; the VS is chosen by
 * si_get_blitter_vs based on the attribute type and instance count.
 */
static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elements_cso,
                              blitter_get_vs_func get_vs, int x1, int y1, int x2, int y2,
                              float depth, unsigned num_instances, enum blitter_attrib_type type,
                              const union blitter_attrib *attrib)
{
   struct pipe_context *pipe = util_blitter_get_pipe(blitter);
   struct si_context *sctx = (struct si_context *)pipe;

   /* Pack position coordinates as signed int16. */
   sctx->vs_blit_sh_data[0] = (uint32_t)(x1 & 0xffff) | ((uint32_t)(y1 & 0xffff) << 16);
   sctx->vs_blit_sh_data[1] = (uint32_t)(x2 & 0xffff) | ((uint32_t)(y2 & 0xffff) << 16);
   sctx->vs_blit_sh_data[2] = fui(depth);

   /* Append the attribute payload (if any) after the position data. */
   switch (type) {
   case UTIL_BLITTER_ATTRIB_COLOR:
      memcpy(&sctx->vs_blit_sh_data[3], attrib->color, sizeof(float) * 4);
      break;
   case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
   case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
      memcpy(&sctx->vs_blit_sh_data[3], &attrib->texcoord, sizeof(attrib->texcoord));
      break;
   case UTIL_BLITTER_ATTRIB_NONE:;
   }

   pipe->bind_vs_state(pipe, si_get_blitter_vs(sctx, type, num_instances));

   struct pipe_draw_info info = {};
   struct pipe_draw_start_count_bias draw;

   info.mode = SI_PRIM_RECTANGLE_LIST;
   info.instance_count = num_instances;

   draw.start = 0;
   draw.count = 3;

   /* Don't set per-stage shader pointers for VS. */
   sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX);
   sctx->vertex_buffer_pointer_dirty = false;
   sctx->vertex_buffer_user_sgprs_dirty = false;

   pipe->draw_vbo(pipe, &info, 0, NULL, &draw, 1);
}

/* Register the si_draw_vbo template instantiation for one combination of
 * internal options, skipping combinations that are invalid for this chip
 * or pipeline (see the checks below).
 */
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS,
          si_has_ngg NGG, si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
static void si_init_draw_vbo(struct si_context *sctx)
{
   /* Prim discard CS is only useful on gfx7+ because gfx6 doesn't have async compute.
*/
   /* NOTE(review): the comment above says "gfx7+", but this check also rejects
    * GFX7 (GFX_VERSION < GFX8). One of the two looks stale — confirm which
    * bound is intended. */
   if (ALLOW_PRIM_DISCARD_CS && GFX_VERSION < GFX8)
      return;

   /* The prim discard compute path is not instantiated with tess or GS. */
   if (ALLOW_PRIM_DISCARD_CS && (HAS_TESS || HAS_GS))
      return;

   /* NGG only exists on gfx10+. */
   if (NGG && GFX_VERSION < GFX10)
      return;

   sctx->draw_vbo[HAS_TESS][HAS_GS][NGG][ALLOW_PRIM_DISCARD_CS] =
      si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, ALLOW_PRIM_DISCARD_CS>;
}

/* Instantiate si_draw_vbo for all NGG x prim-discard-CS combinations
 * of the given tess/GS pipeline configuration.
 */
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS>
static void si_init_draw_vbo_all_internal_options(struct si_context *sctx)
{
   si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_OFF>(sctx);
   si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_ON>(sctx);
   si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_OFF>(sctx);
   si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_ON>(sctx);
}

/* Instantiate si_draw_vbo for all tess x GS pipeline combinations. */
template <chip_class GFX_VERSION>
static void si_init_draw_vbo_all_pipeline_options(struct si_context *sctx)
{
   si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_OFF>(sctx);
   si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_ON>(sctx);
   si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_OFF>(sctx);
   si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_ON>(sctx);
}

/* Placeholder draw_vbo bound at context init (see si_init_draw_functions_).
 * It must never actually run: reaching it means a draw was issued with no
 * vertex shader bound. */
static void si_invalid_draw_vbo(struct pipe_context *pipe,
                                const struct pipe_draw_info *info,
                                unsigned drawid_offset,
                                const struct pipe_draw_indirect_info *indirect,
                                const struct pipe_draw_start_count_bias *draws,
                                unsigned num_draws)
{
   unreachable("vertex shader not bound");
}

/* Per-gfx-generation entry point (name expanded by the GFX() macro, e.g.
 * si_init_draw_functions_GFX9): instantiates every draw_vbo variant for this
 * chip class and wires up the blitter rectangle path.
 */
extern "C"
void GFX(si_init_draw_functions_)(struct si_context *sctx)
{
   assert(sctx->chip_class == GFX());

   si_init_draw_vbo_all_pipeline_options<GFX()>(sctx);

   /* Bind a fake draw_vbo, so that draw_vbo isn't NULL, which would skip
    * initialization of callbacks in upper layers (such as u_threaded_context).
    */
   sctx->b.draw_vbo = si_invalid_draw_vbo;
   sctx->blitter->draw_rectangle = si_draw_rectangle;

   si_init_ia_multi_vgt_param_table(sctx);
}