Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "ac_llvm_cull.h"
#include "si_build_pm4.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "util/fast_idiv_by_const.h"
#include "util/u_prim.h"
#include "util/u_suballoc.h"
#include "util/u_upload_mgr.h"

/* Based on:
 * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
 */

/* This file implements primitive culling using asynchronous compute.
 *
 * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it
 * in a compute shader. The shader processes 1 primitive/thread by invoking
 * the VS for each vertex to get the positions, decomposes strips
 * into triangles (if needed), eliminates primitive restart (if needed),
 * does (W<0) culling, face culling, view XY culling, zero-area and
 * small-primitive culling, and generates a new index buffer that doesn't
 * contain culled primitives.
 *
 * There is no primitive ordering. The generated index buffer will contain
 * primitives in a random order.
 *
 * IB = a GPU command buffer
 *
 * Both the compute and gfx IBs run in parallel sort of like CE and DE.
 * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND
 * doesn't continue if its word isn't 0x80000000. The vertex count is being
 * atomically incremented within the draw packet. A CS_DONE event will signal
 * the REWIND packet to continue. It's really a direct draw with command
 * buffer patching from the compute queue.
 *
 * The compute IB doesn't have to start when its corresponding gfx IB starts,
 * but can start sooner. The compute IB is signaled to start after the last
 * execution barrier in the *previous* gfx IB. This is handled as follows.
 * The kernel GPU scheduler starts the compute IB after the previous gfx IB has
 * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that
 * represents the barrier in the previous gfx IB.
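 *
 * A sketch of the handshake (see si_compute_signal_gfx and the packet
 * emission in si_dispatch_prim_discard_cs_and_draw below; this is an
 * illustration, not a packet spec):
 *
 *   gfx IB:     REWIND               ; CP stalls until the marker dword
 *                                    ; becomes REWIND_SIGNAL_BIT
 *               DRAW_INDEX_2         ; its index count starts at 0
 *   compute IB: DISPATCH_DIRECT      ; culls prims; atomically adds
 *                                    ; 3 * accepted prims to the draw's count
 *               RELEASE_MEM(CS_DONE) ; writes REWIND_SIGNAL_BIT to the
 *                                    ; REWIND marker, releasing the gfx IB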
 *
 * Features:
 * - Triangle strips are decomposed into an indexed triangle list.
 *   The decomposition differs based on the provoking vertex state.
 * - Instanced draws are converted into non-instanced draws for 16-bit indices.
 *   (InstanceID is stored in the high bits of VertexID and unpacked by VS)
 * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling).
 * - Back face culling, incl. culling zero-area / degenerate primitives.
 * - View XY culling.
 * - Small primitive culling for all MSAA modes and all quant modes.
 *
 * The following are not implemented:
 * - ClipVertex/ClipDistance/CullDistance-based culling.
 * - Scissor culling.
 * - HiZ culling.
 *
 * Limitations (and unimplemented features that may be possible to implement):
 * - Only triangles and triangle strips are supported.
 * - Primitive restart is not supported.
 * - Instancing is only supported with 16-bit indices and instance count <= 2^16.
 * - The instance divisor buffer is unavailable, so all divisors must be
 *   either 0 or 1.
 * - Multidraws where the vertex shader reads gl_DrawID are unsupported.
 * - No support for tessellation and geometry shaders.
 *   (patch elimination where tess factors are 0 would be possible to implement)
 * - The vertex shader must not contain memory stores.
 * - All VS resources must not have a write usage in the command buffer.
 * - Bindless textures and images must not occur in the vertex shader.
 *
 * User data SGPR layout:
 *   VERTEX_COUNTER: address of "count" in the draw packet incremented atomically by the shader.
 *   START_OUT_INDEX: output index buffer offset / 12
 *   START_IN_INDEX: input index buffer offset / index_size
 *   VS.BASE_VERTEX: same value as VS
 *   INDEX_BUFFERS: pointer to constants
 *     0..3: input index buffer - typed buffer view
 *     4..7: output index buffer - typed buffer view
 *     8..11: viewport state - scale.xy, translate.xy
 *   VS.VERTEX_BUFFERS: same value as VS
 *   VS.CONST_AND_SHADER_BUFFERS: same value as VS
 *   VS.SAMPLERS_AND_IMAGES: same value as VS
 *   VS.START_INSTANCE: same value as VS
 *   SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
 *   NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives
 *     per instance for instancing.
 *   NUM_PRIMS_UDIV_TERMS:
 *     - Bits [0:4]: "post_shift" for fast 31-bit division for instancing.
 *     - Bits [5:31]: The number of primitives per instance for computing the remainder.
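 *
 * For illustration, the packing of NUM_PRIMS_UDIV_TERMS done when emitting
 * user SGPRs below, and the matching unpack in the compute shader, are:
 *
 *   terms = post_shift | (prims_per_instance << 5);   // CPU, radeon_emit
 *   post_shift = terms & 0x1f;                        // shader, bits [0:4]
 *   prims_per_instance = terms >> 5;                  // shader, bits [5:31]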
 *
 * How to test primitive restart (the most complicated part because it needs
 * to get the primitive orientation right):
 *   Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave
 *   primitive orientation flips with small draw calls, which is what most tests use.
 *   You can also enable draw call splitting into draw calls with just 2 primitives.
 */

/* At least 256 is needed for the fastest wave launch rate from compute queues
 * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */
#define THREADGROUP_SIZE     256 /* high numbers limit available VGPRs */
#define THREADGROUPS_PER_CU  1   /* TGs to launch on 1 CU before going onto the next, max 8 */
#define MAX_WAVES_PER_SH     0   /* no limit */
#define INDEX_STORES_USE_SLC 1   /* don't cache indices if L2 is full */

/* Grouping compute dispatches for small draw calls: How many primitives from multiple
 * draw calls to process by compute before signaling the gfx IB. This reduces the number
 * of EOP events + REWIND packets, because they decrease performance.
 * This also determines the granularity of draw-level and packet-level splitting.
 */
#define PRIMS_PER_IB    (1024 * 1024) /* size per gfx IB */
#define PRIMS_PER_BATCH (128 * 1024)  /* size between REWIND packets */

/* Derived values. */
#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64)

#define REWIND_SIGNAL_BIT 0x80000000

static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr);

void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
                                         unsigned *prim_discard_vertex_count_threshold,
                                         unsigned *index_ring_size_per_ib)
{
   *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */

   if (sscreen->info.chip_class <= GFX7 || /* SI-CI support is not implemented */
       sscreen->debug_flags & DBG(NO_PD) || is_aux_context)
      return;

   /* TODO: enable this */
   bool enable_by_default = false;

   if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) ||
       (enable_by_default && sscreen->allow_draw_out_of_order &&
        sscreen->info.num_se >= 2)) {
      *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */

      if (sscreen->debug_flags & DBG(ALWAYS_PD))
         *prim_discard_vertex_count_threshold = 0; /* always enable */

      /* The total size is double this per context. Greater numbers allow bigger gfx IBs. */
      *index_ring_size_per_ib = PRIMS_PER_IB * 12; /* 3 32-bit indices per primitive. */
   }
}

static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr)
{
   uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32;
   ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, "");
   ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), "");
   return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
                            LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), "");
}

struct si_thread0_section {
   struct si_shader_context *ctx;
   LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
   LLVMValueRef saved_exec;
};

/* Enter a section that only executes on thread 0. */
static void si_enter_thread0_section(struct si_shader_context *ctx,
                                     struct si_thread0_section *section, LLVMValueRef thread_id,
                                     LLVMValueRef check_nonzero)
{
   section->ctx = ctx;
   section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0");

   /* This IF has 4 instructions:
    *   v_and_b32_e32 v, 63, v         ; get the thread ID
    *   v_cmp_eq_u32_e32 vcc, 0, v     ; thread ID == 0
    *   s_and_saveexec_b64 s, vcc
    *   s_cbranch_execz BB0_4
    *
    * It could just be s_and_saveexec_b64 s, 1.
    */
   LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, "");
   if (check_nonzero) {
      cond = LLVMBuildAnd(ctx->ac.builder, cond,
                          LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, check_nonzero,
                                        ctx->ac.i32_0, ""), "");
   }
   ac_build_ifcc(&ctx->ac, cond, 12601);
}

/* Exit a section that only executes on thread 0 and broadcast the result
 * to all threads. */
static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result)
{
   struct si_shader_context *ctx = section->ctx;

   LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);

   ac_build_endif(&ctx->ac, 12601);

   /* Broadcast the result from thread 0 to all threads. */
   *result =
      ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
}
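/* Usage sketch for the two helpers above (mirrors how
 * si_build_primitive_accepted uses them further down; "result" is just an
 * illustrative name):
 *
 *    struct si_thread0_section section;
 *    LLVMValueRef result;
 *
 *    si_enter_thread0_section(ctx, &section, thread_id, NULL);
 *    {
 *       result = ...; // e.g. a single atomic add executed by thread 0 only
 *    }
 *    si_exit_thread0_section(&section, &result); // result valid in all lanes
 */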
static void si_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted,
                                        void *data);

void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
{
   struct si_shader_key *key = &ctx->shader->key;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef vs = ctx->main_fn;

   /* Always inline the VS function. */
   ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
   LLVMSetLinkage(vs, LLVMPrivateLinkage);

   enum ac_arg_type const_desc_type;
   if (ctx->shader->selector->info.base.num_ubos == 1 &&
       ctx->shader->selector->info.base.num_ssbos == 0)
      const_desc_type = AC_ARG_CONST_FLOAT_PTR;
   else
      const_desc_type = AC_ARG_CONST_DESC_PTR;

   memset(&ctx->args, 0, sizeof(ctx->args));

   struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
   struct ac_arg param_vb_desc, param_const_desc, param_start_out_index;
   struct ac_arg param_base_vertex, param_start_instance, param_start_in_index;
   struct ac_arg param_block_id, param_local_id, param_smallprim_precision;
   struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
   struct ac_arg param_sampler_desc;

   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_counter);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_out_index);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_in_index);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &param_index_buffers_and_constants);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &param_vb_desc);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, &param_const_desc);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, &param_sampler_desc);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_instance);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &param_smallprim_precision);
   if (key->opt.cs_instancing) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
   }

   /* Block ID and thread ID inputs. */
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &param_local_id);

   /* Create the compute shader function. */
   gl_shader_stage old_stage = ctx->stage;
   ctx->stage = MESA_SHADER_COMPUTE;
   si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
   ctx->stage = old_stage;

   /* Assemble parameters for VS. */
   LLVMValueRef vs_params[16];
   unsigned num_vs_params = 0;
   unsigned param_vertex_id, param_instance_id;

   vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* INTERNAL RESOURCES */
   vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
   vs_params[num_vs_params++] =
      LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
   vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);

   vs_params[(param_vertex_id = num_vs_params++)] = NULL;   /* VertexID */
   vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
   vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */
   vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */

   assert(num_vs_params <= ARRAY_SIZE(vs_params));
   assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
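   /* For reference, the SGPR argument order above matches the user data
    * emitted with radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0) in
    * si_dispatch_prim_discard_cs_and_draw:
    *
    *   s0: VERTEX_COUNTER            s6:  VS.CONST_AND_SHADER_BUFFERS
    *   s1: START_OUT_INDEX           s7:  VS.SAMPLERS_AND_IMAGES
    *   s2: START_IN_INDEX            s8:  VS.START_INSTANCE
    *   s3: VS.BASE_VERTEX            s9:  SMALL_PRIM_CULLING_PRECISION
    *   s4: INDEX_BUFFERS             s10: NUM_PRIMS_UDIV_MULTIPLIER
    *   s5: VS.VERTEX_BUFFERS         s11: NUM_PRIMS_UDIV_TERMS
    *
    * (s10-s11 exist only with instancing, hence user_sgprs = 12 vs 10 below,
    * and they alone can be updated via R_00B928_COMPUTE_USER_DATA_10.)
    */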
   /* Load descriptors. (load 8 dwords at once) */
   LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];

   LLVMValueRef index_buffers_and_constants =
      ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
   tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
                              ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
   tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0);

   for (unsigned i = 0; i < 8; i++)
      desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);

   input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
   output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);

   /* Compute PrimID and InstanceID. */
   LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
                                                 LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0),
                                                 ac_get_arg(&ctx->ac, param_local_id));
   LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
   LLVMValueRef instance_id = ctx->ac.i32_0;

   if (key->opt.cs_instancing) {
      LLVMValueRef num_prims_udiv_terms = ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
      LLVMValueRef num_prims_udiv_multiplier =
         ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
      /* Unpack num_prims_udiv_terms. */
      LLVMValueRef post_shift =
         LLVMBuildAnd(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 0x1f, 0), "");
      LLVMValueRef prims_per_instance =
         LLVMBuildLShr(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 5, 0), "");
      /* Divide the total prim_id by the number of prims per instance. */
      instance_id =
         ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, post_shift);
      /* Compute the remainder. */
      prim_id = LLVMBuildSub(builder, prim_id,
                             LLVMBuildMul(builder, instance_id, prims_per_instance, ""), "");
   }

   /* Generate indices (like a non-indexed draw call). */
   LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)};
   unsigned vertices_per_prim = 3;

   switch (key->opt.cs_prim_type) {
   case PIPE_PRIM_TRIANGLES:
      for (unsigned i = 0; i < 3; i++) {
         index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0),
                                  LLVMConstInt(ctx->ac.i32, i, 0));
      }
      break;
   case PIPE_PRIM_TRIANGLE_STRIP:
      for (unsigned i = 0; i < 3; i++) {
         index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), "");
      }
      break;
   default:
      unreachable("unexpected primitive type");
   }
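   /* E.g. for prim_id = 5, the generated indices are 15, 16, 17 for
    * PIPE_PRIM_TRIANGLES (5*3 + 0..2) and 5, 6, 7 for PIPE_PRIM_TRIANGLE_STRIP
    * (5 + 0..2; odd strip primitives get their orientation fixed up below).
    */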
   /* Fetch indices. */
   if (key->opt.cs_indexed) {
      for (unsigned i = 0; i < 3; i++) {
         index[i] = LLVMBuildAdd(builder, index[i], ac_get_arg(&ctx->ac, param_start_in_index), "");
         index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0,
                                                1, 0, true, false, false);
         index[i] = ac_to_integer(&ctx->ac, index[i]);
      }
   }

   LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
                                         LLVMConstInt(ctx->ac.i32, 63, 0), "");

   /* Every other triangle in a strip has a reversed vertex order, so we
    * need to swap vertices of odd primitives to get the correct primitive
    * orientation when converting triangle strips to triangles. Primitive
    * restart complicates it, because a strip can start anywhere.
    */
   LLVMValueRef prim_restart_accepted = ctx->ac.i1true;
   LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);

   if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
      /* Without primitive restart, odd primitives have reversed orientation.
       * Only primitive restart can flip it with respect to the first vertex
       * of the draw call.
       */
      /* prim_is_odd = current_is_odd % 2. */
      LLVMValueRef prim_is_odd = LLVMBuildXor(
         builder, ctx->ac.i1false, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), "");

      /* Convert triangle strip indices to triangle indices. */
      ac_build_triangle_strip_indices_to_triangle(
         &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0),
         index);
   }

   /* Execute the vertex shader for each vertex to get vertex positions. */
   LLVMValueRef pos[3][4];
   for (unsigned i = 0; i < vertices_per_prim; i++) {
      vs_params[param_vertex_id] = index[i];
      vs_params[param_instance_id] = instance_id;

      LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
      for (unsigned chan = 0; chan < 4; chan++)
         pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
   }

   /* Divide XYZ by W. */
   for (unsigned i = 0; i < vertices_per_prim; i++) {
      for (unsigned chan = 0; chan < 3; chan++)
         pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
   }

   /* Load the viewport state. */
   LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
                                             LLVMConstInt(ctx->ac.i32, 2, 0));
   vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
   LLVMValueRef vp_scale[2], vp_translate[2];
   vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
   vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
   vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
   vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);

   /* Do culling. */
   struct ac_cull_options options = {};
   options.cull_front = key->opt.cs_cull_front;
   options.cull_back = key->opt.cs_cull_back;
   options.cull_view_xy = true;
   options.cull_small_prims = true;
   options.cull_zero_area = true;
   options.cull_w = true;

   LLVMValueRef params[] = {
      instance_id,
      vertex_counter,
      output_indexbuf,
      (void*)index,
      ac_get_arg(&ctx->ac, param_start_out_index),
   };

   ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate,
                    ac_get_arg(&ctx->ac, param_smallprim_precision), &options,
                    si_build_primitive_accepted, params);
   LLVMBuildRetVoid(builder);
}

static void si_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted,
                                        void *userdata)
{
   struct si_shader_context *ctx = container_of(ac, struct si_shader_context, ac);
   struct si_shader_key *key = &ctx->shader->key;
   LLVMBuilderRef builder = ctx->ac.builder;
   unsigned vertices_per_prim = 3;
   LLVMValueRef *params = (LLVMValueRef *)userdata;
   LLVMValueRef instance_id = params[0];
   LLVMValueRef vertex_counter = params[1];
   LLVMValueRef output_indexbuf = params[2];
   LLVMValueRef *index = (LLVMValueRef *)params[3];
   LLVMValueRef start_out_index = params[4];

   LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);

   ac_build_ifcc(&ctx->ac, accepted, 16607);

   /* Count the number of active threads by doing bitcount(accepted). */
   LLVMValueRef num_prims_accepted = ac_build_bit_count(&ctx->ac, accepted_threadmask);
   num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, "");

   /* Get the number of bits set before the index of this thread. */
   LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
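   /* Stream-compaction sketch: if the wave's accepted mask is e.g. 0b1011,
    * num_prims_accepted = 3 and mbcnt gives each accepted thread the number
    * of accepted threads below it (thread 0 -> 0, thread 1 -> 1, thread 3 -> 2),
    * so accepted primitives land in consecutive output slots.
    */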
   LLVMValueRef start;

   /* Execute atomic_add on the vertex count. */
   struct si_thread0_section section;
   si_enter_thread0_section(ctx, &section, prim_index, num_prims_accepted);
   {
      LLVMValueRef num_indices = LLVMBuildMul(
         builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
      vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
      start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
                                 LLVMAtomicOrderingMonotonic, false);
   }
   si_exit_thread0_section(&section, &start);

   /* Convert it into the primitive index. */
   start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");

   /* Now we need to store the indices of accepted primitives into
    * the output index buffer.
    */

   /* We have lowered instancing. Pack the instance ID into vertex ID. */
   if (key->opt.cs_instancing) {
      instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");

      for (unsigned i = 0; i < vertices_per_prim; i++)
         index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
   }
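   /* E.g. vertex 7 of instance 3 is written out as (3 << 16) | 7; the real VS
    * bound to the gfx draw unpacks it again (hence the 16-bit index and
    * 2^16 instance limits in the header comment).
    */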
   /* Write indices for accepted primitives. */
   LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
   vindex = LLVMBuildAdd(builder, vindex, start_out_index, "");
   LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);

   if (!ac_has_vec3_support(ctx->ac.chip_class, true))
      vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);

   ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0,
                                ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
   ac_build_endif(&ctx->ac, 16607);
}

/* Return false if the shader isn't ready. */
static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
                                             const struct pipe_draw_info *info)
{
   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
   struct si_shader_key key;

   memset(&key, 0, sizeof(key));
   si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, &key, &key.part.vs.prolog);
   assert(!key.part.vs.prolog.instance_divisor_is_fetched);

   key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
   key.opt.vs_as_prim_discard_cs = 1;
   key.opt.cs_prim_type = info->mode;
   key.opt.cs_indexed = info->index_size != 0;
   key.opt.cs_instancing = info->instance_count > 1;
   key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;

   if (rs->rasterizer_discard) {
      /* Just for performance testing and analysis of trivial bottlenecks.
       * This should result in a very short compute shader. */
      key.opt.cs_cull_front = 1;
      key.opt.cs_cull_back = 1;
   } else {
      key.opt.cs_cull_front = sctx->viewport0_y_inverted ? rs->cull_back : rs->cull_front;
      key.opt.cs_cull_back = sctx->viewport0_y_inverted ? rs->cull_front : rs->cull_back;
   }

   sctx->cs_prim_discard_state.cso = sctx->shader.vs.cso;
   sctx->cs_prim_discard_state.current = NULL;

   if (!sctx->compiler.passes)
      si_init_compiler(sctx->screen, &sctx->compiler);

   struct si_compiler_ctx_state compiler_state;
   compiler_state.compiler = &sctx->compiler;
   compiler_state.debug = sctx->debug;
   compiler_state.is_debug_context = sctx->is_debug;

   return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state,
                                    &key, -1, true) == 0 &&
          /* Disallow compute shaders using the scratch buffer. */
          sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
}

static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx)
{
   if (sctx->index_ring)
      return true;

   if (!sctx->prim_discard_compute_cs.priv) {
      struct radeon_winsys *ws = sctx->ws;

      if (!ws->cs_add_parallel_compute_ib(&sctx->prim_discard_compute_cs,
                                          &sctx->gfx_cs, false))
         return false;
   }

   if (!sctx->index_ring) {
      sctx->index_ring = si_aligned_buffer_create(
         sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
         PIPE_USAGE_DEFAULT,
         sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size);
      if (!sctx->index_ring)
         return false;
   }
   return true;
}

static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size)
{
   return sctx->index_ring_offset +
             align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
          sctx->index_ring_size_per_ib;
}

#define COMPUTE_PREAMBLE_SIZE (8 + 39 + 11 + 7)
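/* COMPUTE_PREAMBLE_SIZE adds up the dword counts annotated in
 * si_dispatch_prim_discard_cs_and_draw below:
 *   8 (ACQUIRE_MEM / surface sync) + 39 (si_emit_initial_compute_regs) +
 *   11 (SH register setup) + 7 (si_cp_wait_mem mid-IB fence wait)
 */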
enum si_prim_discard_outcome
si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
                                      unsigned drawid_offset,
                                      const struct pipe_draw_start_count_bias *draws,
                                      unsigned num_draws, unsigned total_count)
{
   /* If the compute shader compilation isn't finished, this returns false. */
   if (!si_shader_select_prim_discard_cs(sctx, info))
      return SI_PRIM_DISCARD_DISABLED;

   if (!si_initialize_prim_discard_cmdbuf(sctx))
      return SI_PRIM_DISCARD_DISABLED;

   struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs;
   unsigned prim = info->mode;
   unsigned instance_count = info->instance_count;

   unsigned num_prims_per_instance;
   if (prim == PIPE_PRIM_TRIANGLES)
      num_prims_per_instance = total_count / 3;
   else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
      num_prims_per_instance = total_count - 2; /* approximation ignoring multi draws */
   else
      unreachable("shouldn't get here");

   unsigned num_prims = num_prims_per_instance * instance_count;
   unsigned out_indexbuf_size = num_prims * 12;
   bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);

   /* Split draws at the draw call level if the ring is full. This makes
    * better use of the ring space.
    *
    * If instancing is enabled and there is not enough ring buffer space, compute-based
    * primitive discard is disabled.
    */
   if (ring_full && num_prims > PRIMS_PER_BATCH && instance_count == 1) {
      unsigned vert_count_per_subdraw = 0;

      if (prim == PIPE_PRIM_TRIANGLES)
         vert_count_per_subdraw = PRIMS_PER_BATCH * 3;
      else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
         vert_count_per_subdraw = PRIMS_PER_BATCH;

      /* Split multi draws first. */
      if (num_draws > 1) {
         unsigned count = 0;
         unsigned first_draw = 0;
         unsigned num_draws_split = 0;

         for (unsigned i = 0; i < num_draws; i++) {
            if (count && count + draws[i].count > vert_count_per_subdraw) {
               /* Submit previous draws. */
               sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + first_draw, num_draws_split);
               count = 0;
               first_draw = i;
               num_draws_split = 0;
            }

            if (draws[i].count > vert_count_per_subdraw) {
               /* Submit just 1 draw. It will be split. */
               sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + i, 1);
               assert(count == 0);
               assert(first_draw == i);
               assert(num_draws_split == 0);
               first_draw = i + 1;
               continue;
            }

            count += draws[i].count;
            num_draws_split++;
         }

         if (count) {
            /* Submit the remaining draws. */
            assert(num_draws_split > 0);
            sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + first_draw, num_draws_split);
         }
         return SI_PRIM_DISCARD_MULTI_DRAW_SPLIT;
      }

      /* Split single draws if splitting multi draws isn't enough. */
      struct pipe_draw_info split_draw = *info;
      struct pipe_draw_start_count_bias split_draw_range = draws[0];
      unsigned base_start = split_draw_range.start;
      unsigned count = draws[0].count;

      if (prim == PIPE_PRIM_TRIANGLES) {
         assert(vert_count_per_subdraw < count);

         for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
            split_draw_range.start = base_start + start;
            split_draw_range.count = MIN2(count - start, vert_count_per_subdraw);

            sctx->b.draw_vbo(&sctx->b, &split_draw, drawid_offset, NULL, &split_draw_range, 1);
         }
      } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
         /* No primitive pair can be split, because strips reverse orientation
          * for odd primitives. */
         STATIC_ASSERT(PRIMS_PER_BATCH % 2 == 0);

         for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
            split_draw_range.start = base_start + start;
            split_draw_range.count = MIN2(count - start, vert_count_per_subdraw + 2);

            sctx->b.draw_vbo(&sctx->b, &split_draw, drawid_offset, NULL, &split_draw_range, 1);
         }
      }

      return SI_PRIM_DISCARD_DRAW_SPLIT;
   }
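   /* An illustrative (scaled-down) example of the strip split above: with
    * count = 10 and vert_count_per_subdraw = 4, the subdraws cover vertices
    * [0..5] and [4..9]. Each subdraw past the first re-reads 2 vertices, so
    * all count - 2 = 8 strip triangles are drawn exactly once.
    */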
   /* Just quit if the draw call doesn't fit into the ring and can't be split. */
   if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
      if (SI_PRIM_DISCARD_DEBUG)
         puts("PD failed: draw call too big, can't be split");
      return SI_PRIM_DISCARD_DISABLED;
   }

   /* Compute how many CS dwords we need to reserve. */
   unsigned need_compute_dw = COMPUTE_PREAMBLE_SIZE +
                              11 /* shader */ +
                              30; /* leave some space at the end */
   unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0);

   for (unsigned i = 0; i < num_draws; i++) {
      unsigned num_subdraws = DIV_ROUND_UP(draws[i].count, PRIMS_PER_BATCH);

      need_compute_dw += 8 * num_subdraws +       /* signal REWIND */
                         14 /* user SGPRs */ +
                         4 * (num_subdraws - 1) + /* user SGPRs after the first subdraw */
                         11 * num_subdraws;
      need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
   }

   if (ring_full ||
       !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
      /* If the current IB is empty but the size is too small, add a NOP
       * packet to force a flush and get a bigger IB.
       */
      if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
          gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
         radeon_begin(gfx_cs);
         radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
         radeon_emit(gfx_cs, 0);
         radeon_end();
      }

      si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
   }

   /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
   struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
   ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
   assert(compute_has_space);
   assert(si_check_ring_space(sctx, out_indexbuf_size));
   assert(cs->current.cdw + need_compute_dw <= cs->current.max_dw);
   return SI_PRIM_DISCARD_ENABLED;
}

void si_compute_signal_gfx(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
   unsigned writeback_L2_flags = 0;

   /* GFX8 needs to flush L2 for CP to see the updated vertex count. */
   if (sctx->chip_class == GFX8)
      writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;

   if (!sctx->compute_num_prims_in_batch)
      return;

   assert(sctx->compute_rewind_va);

   /* After the queued dispatches are done and vertex counts are written to
    * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
    * the dispatches to finish, it only adds the CS_DONE event into the event
    * queue.
    */
   si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
                     sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
                     writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT, NULL,
                     sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
                     REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
                     SI_NOT_QUERY);

   sctx->compute_rewind_va = 0;
   sctx->compute_num_prims_in_batch = 0;
}

/* Dispatch a primitive discard compute shader. */
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
                                          const struct pipe_draw_info *info,
                                          const struct pipe_draw_start_count_bias *draws,
                                          unsigned num_draws, unsigned index_size,
                                          unsigned total_count, uint64_t input_indexbuf_va,
                                          unsigned index_max_size)
{
   struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs;
   struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
   unsigned num_total_prims;
   unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format;

   if (!info->instance_count)
      return;

   switch (info->mode) {
   case PIPE_PRIM_TRIANGLES:
   case PIPE_PRIM_TRIANGLE_STRIP:
      if (info->mode == PIPE_PRIM_TRIANGLES)
         num_total_prims = total_count / 3;
      else if (total_count >= 2)
         num_total_prims = total_count - 2; /* tri strip approximation ignoring multi draws */
      else
         num_total_prims = 0;

      vertices_per_prim = 3;
      output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
      gfx10_output_indexbuf_format = V_008F0C_GFX10_FORMAT_32_32_32_UINT;
      break;
   default:
      unreachable("unsupported primitive type");
      return;
   }

   if (!num_total_prims)
      return;

   num_total_prims *= info->instance_count;

   unsigned out_indexbuf_offset;
   uint64_t output_indexbuf_size = num_total_prims * vertices_per_prim * 4;

   /* Initialize the compute IB if it's empty. */
   if (!sctx->prim_discard_compute_ib_initialized) {
      /* 1) State initialization. */
      sctx->compute_ib_last_shader = NULL;

      if (sctx->last_ib_barrier_fence) {
         assert(!sctx->last_ib_barrier_buf);
         sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence,
                                           RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
      }
      /* 2) IB initialization. */

      /* This needs to be done at the beginning of IBs due to possible
       * TTM buffer moves in the kernel.
       */
      if (sctx->chip_class >= GFX10) { /* 8 DW */
         radeon_begin(cs);
         radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
         radeon_emit(cs, 0);          /* CP_COHER_CNTL */
         radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
         radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
         radeon_emit(cs, 0);          /* CP_COHER_BASE */
         radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
         radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
         radeon_emit(cs,              /* GCR_CNTL */
                     S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) |
                        S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) |
                        S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD));
         radeon_end();
      } else {
         si_emit_surface_sync(sctx, cs,
                              S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
                                 S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
                                 S_0085F0_SH_ICACHE_ACTION_ENA(1) |
                                 S_0085F0_SH_KCACHE_ACTION_ENA(1));
      }

      si_emit_initial_compute_regs(sctx, cs); /* 39 DW */

      radeon_begin(cs); /* 11 DW */
      radeon_set_sh_reg(
         cs, R_00B860_COMPUTE_TMPRING_SIZE,
         S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */

      /* Only 1D grids are launched. */
      radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
      radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1));
      radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1));

      radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
      radeon_emit(cs, 0);
      radeon_emit(cs, 0);
      radeon_end();

      if (sctx->last_ib_barrier_buf) {
         assert(!sctx->last_ib_barrier_fence);
         radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ,
                                   RADEON_PRIO_FENCE);
         si_cp_wait_mem(sctx, cs, /* 7 DW */
                        sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset,
                        1, 1, WAIT_REG_MEM_EQUAL);
      }

      sctx->prim_discard_compute_ib_initialized = true;
      assert(cs->current.cdw <= COMPUTE_PREAMBLE_SIZE);
   }

   /* Allocate the output index buffer. */
   output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size);
   assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
   out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
   sctx->index_ring_offset += output_indexbuf_size;

   radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
                             RADEON_PRIO_SHADER_RW_BUFFER);
   uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;

   /* Prepare index buffer descriptors. */
   struct si_resource *indexbuf_desc = NULL;
   unsigned indexbuf_desc_offset;
   unsigned desc_size = 12 * 4;
   uint32_t *desc;

   u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size),
                  &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc);
   radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
                             RADEON_PRIO_DESCRIPTORS);
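   /* The 12-dword upload holds three packed views, matching the INDEX_BUFFERS
    * layout in the header comment:
    *   desc[0..3]  - input index buffer  (typed buffer view)
    *   desc[4..7]  - output index buffer (typed buffer view)
    *   desc[8..11] - viewport state: scale.xy, translate.xy
    */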
   /* Input index buffer. */
   desc[0] = input_indexbuf_va;
   desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size);
   desc[2] = index_max_size * (sctx->chip_class == GFX8 ? index_size : 1);

   if (sctx->chip_class >= GFX10) {
      desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
                S_008F0C_FORMAT(index_size == 1 ? V_008F0C_GFX10_FORMAT_8_UINT
                                : index_size == 2 ? V_008F0C_GFX10_FORMAT_16_UINT
                                                  : V_008F0C_GFX10_FORMAT_32_UINT) |
                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
                S_008F0C_RESOURCE_LEVEL(1);
   } else {
      desc[3] =
         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
         S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8
                              : index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16
                                                : V_008F0C_BUF_DATA_FORMAT_32);
   }

   /* Output index buffer. */
   desc[4] = out_indexbuf_va;
   desc[5] =
      S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4);
   desc[6] = num_total_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);

   if (sctx->chip_class >= GFX10) {
      desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
                S_008F0C_FORMAT(gfx10_output_indexbuf_format) |
                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
                S_008F0C_RESOURCE_LEVEL(1);
   } else {
      desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
                S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
                S_008F0C_DATA_FORMAT(output_indexbuf_format);
   }

   /* Viewport state. */
   struct si_small_prim_cull_info cull_info;
   si_get_small_prim_cull_info(sctx, &cull_info);

   desc[8] = fui(cull_info.scale[0]);
   desc[9] = fui(cull_info.scale[1]);
   desc[10] = fui(cull_info.translate[0]);
   desc[11] = fui(cull_info.translate[1]);

   /* Set user data SGPRs. */
   /* This can't be >= 16 if we want the fastest launch rate. */
   unsigned user_sgprs = info->instance_count > 1 ? 12 : 10;

   uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
   unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
   unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
   uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
   uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
   uint64_t vb_desc_va = sctx->vb_descriptors_buffer
                            ? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset
                            : 0;
   si_resource_reference(&indexbuf_desc, NULL);

   /* Set the compute shader. */
   struct si_shader *shader = sctx->cs_prim_discard_state.current;

   if (shader != sctx->compute_ib_last_shader) {
      radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
                                RADEON_PRIO_SHADER_BINARY);
      uint64_t shader_va = shader->bo->gpu_address;

      assert(shader->config.scratch_bytes_per_wave == 0);
      assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
      radeon_begin(cs);
      radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
      radeon_emit(cs, shader_va >> 8);
      radeon_emit(cs, S_00B834_DATA(shader_va >> 40));

      radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
      radeon_emit(
         cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
                S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) |
                S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) |
                S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) |
                S_00B848_WGP_MODE(sctx->chip_class >= GFX10));
      radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) |
                      S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
                      S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
                      S_00B84C_LDS_SIZE(shader->config.lds_size));

      radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
                        ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG,
                                                       MAX_WAVES_PER_SH, THREADGROUPS_PER_CU));
      radeon_end();
      sctx->compute_ib_last_shader = shader;
   }

   STATIC_ASSERT(PRIMS_PER_BATCH % THREADGROUP_SIZE == 0);

   struct si_fast_udiv_info32 num_prims_udiv = {};

   for (unsigned i = 0; i < num_draws; i++) {
      unsigned count = draws[i].count;
      unsigned num_prims_per_instance, num_prims;

      /* Determine the number of primitives per instance. */
      if (info->mode == PIPE_PRIM_TRIANGLES)
         num_prims_per_instance = count / 3;
      else if (count >= 2)
         num_prims_per_instance = count - 2;
      else
         num_prims_per_instance = 0;

      if (!num_prims_per_instance)
         continue;

      num_prims = num_prims_per_instance;

      if (info->instance_count > 1) {
         num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
         num_prims *= info->instance_count;
      }

      /* Limitations on how these two are packed in the user SGPR. */
      assert(num_prims_udiv.post_shift < 32);
      assert(num_prims_per_instance < 1 << 27);

      /* Big draw calls are split into smaller dispatches and draw packets. */
      for (unsigned start_prim = 0; start_prim < num_prims; start_prim += PRIMS_PER_BATCH) {
         unsigned num_subdraw_prims;

         if (start_prim + PRIMS_PER_BATCH < num_prims) {
            num_subdraw_prims = PRIMS_PER_BATCH;
         } else {
            num_subdraw_prims = num_prims - start_prim;
         }

         /* Small dispatches are executed back to back until a specific primitive
          * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
          * to start drawing the batch. This batching adds latency to the gfx IB,
          * but CS_DONE and REWIND are too slow.
          */
         if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
            si_compute_signal_gfx(sctx);

         if (sctx->compute_num_prims_in_batch == 0) {
            assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
            sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;

            radeon_begin(gfx_cs);
            radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
            radeon_emit(gfx_cs, 0);
            radeon_end();
         }

         sctx->compute_num_prims_in_batch += num_subdraw_prims;

         uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
         uint64_t index_va = out_indexbuf_va + start_prim * 12;

         /* Emit the draw packet into the gfx IB. */
         radeon_begin(gfx_cs);
         radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
         radeon_emit(gfx_cs, num_subdraw_prims * vertices_per_prim);
         radeon_emit(gfx_cs, index_va);
         radeon_emit(gfx_cs, index_va >> 32);
         radeon_emit(gfx_cs, 0);
         radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
         radeon_end();
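         /* Resulting gfx IB dwords at a batch start (a sketch derived from
          * the compute_rewind_va and count_va math above, not a packet spec):
          *   dw+0: PKT3(PKT3_REWIND)
          *   dw+1: 0  <- compute_rewind_va; CS_DONE writes REWIND_SIGNAL_BIT
          *   dw+2: PKT3(PKT3_DRAW_INDEX_2)
          *   dw+3: max index count
          *   dw+4: index_va low          dw+5: index_va high
          *   dw+6: 0  <- count_va; the CS atomically adds 3 per accepted prim
          *   dw+7: V_0287F0_DI_SRC_SEL_DMA
          */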
         radeon_begin_again(cs);

         /* Continue with the compute IB. */
         if (start_prim == 0) {
            if (i == 0) {
               /* First draw. */
               radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
               radeon_emit(cs, count_va);
               radeon_emit(cs, start_prim);
               radeon_emit(cs, draws[i].start);
               radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start);
               radeon_emit(cs, index_buffers_va);
               radeon_emit(cs, vb_desc_va);
               radeon_emit(cs, vs_const_desc_va);
               radeon_emit(cs, vs_sampler_desc_va);
               radeon_emit(cs, info->start_instance);
               /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
               radeon_emit(cs, fui(cull_info.small_prim_precision));

               if (info->instance_count > 1) {
                  radeon_emit(cs, num_prims_udiv.multiplier);
                  radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
               }
            } else {
               /* Subsequent draws. */
               radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 4);
               radeon_emit(cs, count_va);
               radeon_emit(cs, 0);
               radeon_emit(cs, draws[i].start);
               radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start);

               if (info->instance_count > 1) {
                  radeon_set_sh_reg_seq(cs, R_00B928_COMPUTE_USER_DATA_10, 2);
                  radeon_emit(cs, num_prims_udiv.multiplier);
                  radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
               }
            }
         } else {
            /* Draw split. Only update the SGPRs that changed. */
            radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
            radeon_emit(cs, count_va);
            radeon_emit(cs, start_prim);
         }

         /* Set grid dimensions. */
         unsigned start_block = start_prim / THREADGROUP_SIZE;
         unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
         unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;

         radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
         radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
                           S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
                              S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));

         radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
         radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
         radeon_emit(cs, 1);
         radeon_emit(cs, 1);
         radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
                         S_00B800_ORDER_MODE(0 /* launch in order */));
         radeon_end();

         assert(cs->current.cdw <= cs->current.max_dw);
         assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
      }
   }
}