Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/si_shader.c
4570 views
/*1* Copyright 2012 Advanced Micro Devices, Inc.2* All Rights Reserved.3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* on the rights to use, copy, modify, merge, publish, distribute, sub8* license, and/or sell copies of the Software, and to permit persons to whom9* the Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL18* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,19* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR20* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE21* USE OR OTHER DEALINGS IN THE SOFTWARE.22*/2324#include "ac_exp_param.h"25#include "ac_rtld.h"26#include "compiler/nir/nir.h"27#include "compiler/nir/nir_serialize.h"28#include "si_pipe.h"29#include "si_shader_internal.h"30#include "sid.h"31#include "tgsi/tgsi_from_mesa.h"32#include "tgsi/tgsi_strings.h"33#include "util/u_memory.h"3435static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";3637static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";3839static void si_dump_shader_key(const struct si_shader *shader, FILE *f);4041/** Whether the shader runs as a combination of multiple API shaders */42bool si_is_multi_part_shader(struct si_shader *shader)43{44if (shader->selector->screen->info.chip_class <= GFX8)45return false;4647return shader->key.as_ls || shader->key.as_es ||48shader->selector->info.stage == MESA_SHADER_TESS_CTRL ||49shader->selector->info.stage == MESA_SHADER_GEOMETRY;50}5152/** Whether the shader runs on a merged HW stage (LSHS or ESGS) */53bool si_is_merged_shader(struct si_shader *shader)54{55return shader->key.as_ngg || si_is_multi_part_shader(shader);56}5758/**59* Returns a unique index for a per-patch semantic name and index. The index60* must be less than 32, so that a 32-bit bitmask of used inputs or outputs61* can be calculated.62*/63unsigned si_shader_io_get_unique_index_patch(unsigned semantic)64{65switch (semantic) {66case VARYING_SLOT_TESS_LEVEL_OUTER:67return 0;68case VARYING_SLOT_TESS_LEVEL_INNER:69return 1;70default:71if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30)72return 2 + (semantic - VARYING_SLOT_PATCH0);7374assert(!"invalid semantic");75return 0;76}77}7879/**80* Returns a unique index for a semantic name and index. The index must be81* less than 64, so that a 64-bit bitmask of used inputs or outputs can be82* calculated.83*/84unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying)85{86switch (semantic) {87case VARYING_SLOT_POS:88return 0;89default:90/* Since some shader stages use the highest used IO index91* to determine the size to allocate for inputs/outputs92* (in LDS, tess and GS rings). GENERIC should be placed right93* after POSITION to make that size as small as possible.94*/95if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)96return 1 + (semantic - VARYING_SLOT_VAR0); /* 1..32 */9798/* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as99* legacy desktop GL varyings because they are mutually exclusive.100*/101if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)102return 33 + (semantic - VARYING_SLOT_VAR0_16BIT); /* 33..48 */103104assert(!"invalid generic index");105return 0;106107/* Legacy desktop GL varyings. */108case VARYING_SLOT_FOGC:109return 33;110case VARYING_SLOT_COL0:111return 34;112case VARYING_SLOT_COL1:113return 35;114case VARYING_SLOT_BFC0:115/* If it's a varying, COLOR and BCOLOR alias. */116if (is_varying)117return 34;118else119return 36;120case VARYING_SLOT_BFC1:121if (is_varying)122return 35;123else124return 37;125case VARYING_SLOT_TEX0:126case VARYING_SLOT_TEX1:127case VARYING_SLOT_TEX2:128case VARYING_SLOT_TEX3:129case VARYING_SLOT_TEX4:130case VARYING_SLOT_TEX5:131case VARYING_SLOT_TEX6:132case VARYING_SLOT_TEX7:133return 38 + (semantic - VARYING_SLOT_TEX0);134case VARYING_SLOT_CLIP_VERTEX:135return 46;136137/* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */138case VARYING_SLOT_CLIP_DIST0:139return 49;140case VARYING_SLOT_CLIP_DIST1:141return 50;142case VARYING_SLOT_PSIZ:143return 51;144145/* These can't be written by LS, HS, and ES. */146case VARYING_SLOT_LAYER:147return 52;148case VARYING_SLOT_VIEWPORT:149return 53;150case VARYING_SLOT_PRIMITIVE_ID:151return 54;152}153}154155static void si_dump_streamout(struct pipe_stream_output_info *so)156{157unsigned i;158159if (so->num_outputs)160fprintf(stderr, "STREAMOUT\n");161162for (i = 0; i < so->num_outputs; i++) {163unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;164fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", i, so->output[i].output_buffer,165so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,166so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "",167mask & 4 ? "z" : "", mask & 8 ? "w" : "");168}169}170171static void declare_streamout_params(struct si_shader_context *ctx,172struct pipe_stream_output_info *so)173{174if (ctx->screen->use_ngg_streamout) {175if (ctx->stage == MESA_SHADER_TESS_EVAL)176ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);177return;178}179180/* Streamout SGPRs. */181if (so->num_outputs) {182ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_config);183ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_write_index);184} else if (ctx->stage == MESA_SHADER_TESS_EVAL) {185ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);186}187188/* A streamout buffer offset is loaded if the stride is non-zero. */189for (int i = 0; i < 4; i++) {190if (!so->stride[i])191continue;192193ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_offset[i]);194}195}196197unsigned si_get_max_workgroup_size(const struct si_shader *shader)198{199switch (shader->selector->info.stage) {200case MESA_SHADER_VERTEX:201case MESA_SHADER_TESS_EVAL:202return shader->key.as_ngg ? 128 : 0;203204case MESA_SHADER_TESS_CTRL:205/* Return this so that LLVM doesn't remove s_barrier206* instructions on chips where we use s_barrier. */207return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;208209case MESA_SHADER_GEOMETRY:210return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;211212case MESA_SHADER_COMPUTE:213break; /* see below */214215default:216return 0;217}218219/* Compile a variable block size using the maximum variable size. */220if (shader->selector->info.base.workgroup_size_variable)221return SI_MAX_VARIABLE_THREADS_PER_BLOCK;222223uint16_t *local_size = shader->selector->info.base.workgroup_size;224unsigned max_work_group_size = (uint32_t)local_size[0] *225(uint32_t)local_size[1] *226(uint32_t)local_size[2];227assert(max_work_group_size);228return max_work_group_size;229}230231static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params)232{233enum ac_arg_type const_shader_buf_type;234235if (ctx->shader->selector->info.base.num_ubos == 1 &&236ctx->shader->selector->info.base.num_ssbos == 0)237const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;238else239const_shader_buf_type = AC_ARG_CONST_DESC_PTR;240241ac_add_arg(242&ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,243assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers);244}245246static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params)247{248ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,249assign_params ? &ctx->samplers_and_images : &ctx->other_samplers_and_images);250}251252static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params)253{254declare_const_and_shader_buffers(ctx, assign_params);255declare_samplers_and_images(ctx, assign_params);256}257258static void declare_global_desc_pointers(struct si_shader_context *ctx)259{260ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->internal_bindings);261ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,262&ctx->bindless_samplers_and_images);263}264265static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)266{267ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);268if (!ctx->shader->is_gs_copy_shader) {269ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);270ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);271ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);272}273}274275static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)276{277ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->args.vertex_buffers);278279unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;280if (num_vbos_in_user_sgprs) {281unsigned user_sgprs = ctx->args.num_sgprs_used;282283if (si_is_merged_shader(ctx->shader))284user_sgprs -= 8;285assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);286287/* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */288for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)289ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */290291assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));292for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)293ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);294}295}296297static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs)298{299struct si_shader *shader = ctx->shader;300301ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);302if (shader->key.as_ls) {303ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_rel_patch_id);304if (ctx->screen->info.chip_class >= GFX10) {305ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */306ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);307} else {308ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);309ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */310}311} else if (ctx->screen->info.chip_class >= GFX10) {312ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */313ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,314&ctx->args.vs_prim_id); /* user vgpr or PrimID (legacy) */315ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);316} else {317ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);318ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_prim_id);319ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */320}321322if (!shader->is_gs_copy_shader) {323/* Vertex load indices. */324if (shader->selector->info.num_inputs) {325ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);326for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)327ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);328}329*num_prolog_vgprs += shader->selector->info.num_inputs;330}331}332333static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property)334{335ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */336ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */337ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */338339if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {340ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */341ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */342ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */343ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */344} else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {345ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */346ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */347ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */348ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */349ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */350ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */351}352}353354static void declare_tes_input_vgprs(struct si_shader_context *ctx)355{356ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_u);357ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_v);358ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_rel_patch_id);359ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);360}361362enum363{364/* Convenient merged shader definitions. */365SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,366SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,367};368369void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,370enum ac_arg_type type, struct ac_arg *arg, unsigned idx)371{372assert(args->arg_count == idx);373ac_add_arg(args, file, registers, type, arg);374}375376void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)377{378struct si_shader *shader = ctx->shader;379unsigned i, num_returns, num_return_sgprs;380unsigned num_prolog_vgprs = 0;381unsigned stage = ctx->stage;382383memset(&ctx->args, 0, sizeof(ctx->args));384385/* Set MERGED shaders. */386if (ctx->screen->info.chip_class >= GFX9) {387if (shader->key.as_ls || stage == MESA_SHADER_TESS_CTRL)388stage = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */389else if (shader->key.as_es || shader->key.as_ngg || stage == MESA_SHADER_GEOMETRY)390stage = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;391}392393switch (stage) {394case MESA_SHADER_VERTEX:395declare_global_desc_pointers(ctx);396397if (shader->selector->info.base.vs.blit_sgprs_amd) {398declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);399400/* VGPRs */401declare_vs_input_vgprs(ctx, &num_prolog_vgprs);402break;403}404405declare_per_stage_desc_pointers(ctx, true);406declare_vs_specific_input_sgprs(ctx);407if (!shader->is_gs_copy_shader)408declare_vb_descriptor_input_sgprs(ctx);409410if (shader->key.as_es) {411ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset);412} else if (shader->key.as_ls) {413/* no extra parameters */414} else {415/* The locations of the other parameters are assigned dynamically. */416declare_streamout_params(ctx, &shader->selector->so);417}418419/* VGPRs */420declare_vs_input_vgprs(ctx, &num_prolog_vgprs);421422/* Return values */423if (shader->key.opt.vs_as_prim_discard_cs) {424for (i = 0; i < 4; i++)425ac_add_return(&ctx->args, AC_ARG_VGPR);426}427break;428429case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */430declare_global_desc_pointers(ctx);431declare_per_stage_desc_pointers(ctx, true);432ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);433ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);434ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);435ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);436ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);437ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);438439/* VGPRs */440ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);441ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);442443/* param_tcs_offchip_offset and param_tcs_factor_offset are444* placed after the user SGPRs.445*/446for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)447ac_add_return(&ctx->args, AC_ARG_SGPR);448for (i = 0; i < 11; i++)449ac_add_return(&ctx->args, AC_ARG_VGPR);450break;451452case SI_SHADER_MERGED_VERTEX_TESSCTRL:453/* Merged stages have 8 system SGPRs at the beginning. */454/* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */455declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_TESS_CTRL);456ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);457ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info);458ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);459ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset);460ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */461ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */462463declare_global_desc_pointers(ctx);464declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_VERTEX);465declare_vs_specific_input_sgprs(ctx);466467ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);468ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);469ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);470if (ctx->stage == MESA_SHADER_VERTEX)471declare_vb_descriptor_input_sgprs(ctx);472473/* VGPRs (first TCS, then VS) */474ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);475ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);476477if (ctx->stage == MESA_SHADER_VERTEX) {478declare_vs_input_vgprs(ctx, &num_prolog_vgprs);479480/* LS return values are inputs to the TCS main shader part. */481for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)482ac_add_return(&ctx->args, AC_ARG_SGPR);483for (i = 0; i < 2; i++)484ac_add_return(&ctx->args, AC_ARG_VGPR);485486/* VS outputs passed via VGPRs to TCS. */487if (shader->key.opt.same_patch_vertices) {488unsigned num_outputs = util_last_bit64(shader->selector->outputs_written);489for (i = 0; i < num_outputs * 4; i++)490ac_add_return(&ctx->args, AC_ARG_VGPR);491}492} else {493/* TCS inputs are passed via VGPRs from VS. */494if (shader->key.opt.same_patch_vertices) {495unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->outputs_written);496for (i = 0; i < num_inputs * 4; i++)497ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);498}499500/* TCS return values are inputs to the TCS epilog.501*502* param_tcs_offchip_offset, param_tcs_factor_offset,503* param_tcs_offchip_layout, and internal_bindings504* should be passed to the epilog.505*/506for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)507ac_add_return(&ctx->args, AC_ARG_SGPR);508for (i = 0; i < 11; i++)509ac_add_return(&ctx->args, AC_ARG_VGPR);510}511break;512513case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:514/* Merged stages have 8 system SGPRs at the beginning. */515/* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */516declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_GEOMETRY);517518if (ctx->shader->key.as_ngg)519ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_tg_info);520else521ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset);522523ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info);524ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);525ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset);526ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,527&ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */528ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,529NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */530531declare_global_desc_pointers(ctx);532if (ctx->stage != MESA_SHADER_VERTEX || !shader->selector->info.base.vs.blit_sgprs_amd) {533declare_per_stage_desc_pointers(534ctx, (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL));535}536537if (ctx->stage == MESA_SHADER_VERTEX) {538if (shader->selector->info.base.vs.blit_sgprs_amd)539declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);540else541declare_vs_specific_input_sgprs(ctx);542} else {543ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);544545if (ctx->stage == MESA_SHADER_TESS_EVAL) {546ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);547ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);548}549}550551if (ctx->stage == MESA_SHADER_VERTEX)552declare_vb_descriptor_input_sgprs(ctx);553554/* VGPRs (first GS, then VS/TES) */555ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);556ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);557ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);558ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);559ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);560561if (ctx->stage == MESA_SHADER_VERTEX) {562declare_vs_input_vgprs(ctx, &num_prolog_vgprs);563} else if (ctx->stage == MESA_SHADER_TESS_EVAL) {564declare_tes_input_vgprs(ctx);565}566567if ((ctx->shader->key.as_es || ngg_cull_shader) &&568(ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) {569unsigned num_user_sgprs, num_vgprs;570571if (ctx->stage == MESA_SHADER_VERTEX && ngg_cull_shader) {572/* For the NGG cull shader, add 1 SGPR to hold573* the vertex buffer pointer.574*/575num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + 1;576577if (shader->selector->num_vbos_in_user_sgprs) {578assert(num_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);579num_user_sgprs =580SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;581}582} else if (ctx->stage == MESA_SHADER_TESS_EVAL && ngg_cull_shader) {583num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;584} else {585num_user_sgprs = SI_NUM_VS_STATE_RESOURCE_SGPRS;586}587588/* The NGG cull shader has to return all 9 VGPRs.589*590* The normal merged ESGS shader only has to return the 5 VGPRs591* for the GS stage.592*/593num_vgprs = ngg_cull_shader ? 9 : 5;594595/* ES return values are inputs to GS. */596for (i = 0; i < 8 + num_user_sgprs; i++)597ac_add_return(&ctx->args, AC_ARG_SGPR);598for (i = 0; i < num_vgprs; i++)599ac_add_return(&ctx->args, AC_ARG_VGPR);600}601break;602603case MESA_SHADER_TESS_EVAL:604declare_global_desc_pointers(ctx);605declare_per_stage_desc_pointers(ctx, true);606ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);607ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);608ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);609610if (shader->key.as_es) {611ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);612ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);613ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset);614} else {615declare_streamout_params(ctx, &shader->selector->so);616ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);617}618619/* VGPRs */620declare_tes_input_vgprs(ctx);621break;622623case MESA_SHADER_GEOMETRY:624declare_global_desc_pointers(ctx);625declare_per_stage_desc_pointers(ctx, true);626ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset);627ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_wave_id);628629/* VGPRs */630ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]);631ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]);632ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);633ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]);634ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[3]);635ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[4]);636ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[5]);637ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);638break;639640case MESA_SHADER_FRAGMENT:641declare_global_desc_pointers(ctx);642declare_per_stage_desc_pointers(ctx, true);643si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF);644si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask,645SI_PARAM_PRIM_MASK);646647si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,648SI_PARAM_PERSP_SAMPLE);649si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center,650SI_PARAM_PERSP_CENTER);651si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid,652SI_PARAM_PERSP_CENTROID);653si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);654si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample,655SI_PARAM_LINEAR_SAMPLE);656si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center,657SI_PARAM_LINEAR_CENTER);658si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid,659SI_PARAM_LINEAR_CENTROID);660si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);661si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0],662SI_PARAM_POS_X_FLOAT);663si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1],664SI_PARAM_POS_Y_FLOAT);665si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2],666SI_PARAM_POS_Z_FLOAT);667si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3],668SI_PARAM_POS_W_FLOAT);669shader->info.face_vgpr_index = ctx->args.num_vgprs_used;670si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face,671SI_PARAM_FRONT_FACE);672shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;673si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary,674SI_PARAM_ANCILLARY);675si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage,676SI_PARAM_SAMPLE_COVERAGE);677si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt,678SI_PARAM_POS_FIXED_PT);679680/* Color inputs from the prolog. */681if (shader->selector->info.colors_read) {682unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);683684for (i = 0; i < num_color_elements; i++)685ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);686687num_prolog_vgprs += num_color_elements;688}689690/* Outputs for the epilog. */691num_return_sgprs = SI_SGPR_ALPHA_REF + 1;692num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +693shader->selector->info.writes_z + shader->selector->info.writes_stencil +694shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */;695696num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);697698for (i = 0; i < num_return_sgprs; i++)699ac_add_return(&ctx->args, AC_ARG_SGPR);700for (; i < num_returns; i++)701ac_add_return(&ctx->args, AC_ARG_VGPR);702break;703704case MESA_SHADER_COMPUTE:705declare_global_desc_pointers(ctx);706declare_per_stage_desc_pointers(ctx, true);707if (shader->selector->info.uses_grid_size)708ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups);709if (shader->selector->info.uses_variable_block_size)710ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->block_size);711712unsigned cs_user_data_dwords =713shader->selector->info.base.cs.user_data_components_amd;714if (cs_user_data_dwords) {715ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data);716}717718/* Some descriptors can be in user SGPRs. */719/* Shader buffers in user SGPRs. */720for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {721while (ctx->args.num_sgprs_used % 4 != 0)722ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);723724ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]);725}726/* Images in user SGPRs. */727for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {728unsigned num_sgprs = shader->selector->info.base.image_buffers & (1 << i) ? 4 : 8;729730while (ctx->args.num_sgprs_used % num_sgprs != 0)731ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);732733ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]);734}735736/* Hardware SGPRs. */737for (i = 0; i < 3; i++) {738if (shader->selector->info.uses_block_id[i]) {739ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]);740}741}742if (shader->selector->info.uses_subgroup_info)743ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);744745/* Hardware VGPRs. */746if (!ctx->screen->info.has_graphics && ctx->screen->info.family >= CHIP_ALDEBARAN)747ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.local_invocation_ids);748else749ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids);750break;751default:752assert(0 && "unimplemented shader");753return;754}755756shader->info.num_input_sgprs = ctx->args.num_sgprs_used;757shader->info.num_input_vgprs = ctx->args.num_vgprs_used;758759assert(shader->info.num_input_vgprs >= num_prolog_vgprs);760shader->info.num_input_vgprs -= num_prolog_vgprs;761}762763/* For the UMR disassembler. */764#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */765#define DEBUGGER_NUM_MARKERS 5766767static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,768struct ac_rtld_binary *rtld)769{770const struct si_shader_selector *sel = shader->selector;771const char *part_elfs[5];772size_t part_sizes[5];773unsigned num_parts = 0;774775#define add_part(shader_or_part) \776if (shader_or_part) { \777part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \778part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \779num_parts++; \780}781782add_part(shader->prolog);783add_part(shader->previous_stage);784add_part(shader->prolog2);785add_part(shader);786add_part(shader->epilog);787788#undef add_part789790struct ac_rtld_symbol lds_symbols[2];791unsigned num_lds_symbols = 0;792793if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&794(sel->info.stage == MESA_SHADER_GEOMETRY || shader->key.as_ngg)) {795struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];796sym->name = "esgs_ring";797sym->size = shader->gs_info.esgs_ring_size * 4;798sym->align = 64 * 1024;799}800801if (shader->key.as_ngg && sel->info.stage == MESA_SHADER_GEOMETRY) {802struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];803sym->name = "ngg_emit";804sym->size = shader->ngg.ngg_emit_size * 4;805sym->align = 4;806}807808bool ok = ac_rtld_open(809rtld, (struct ac_rtld_open_info){.info = &screen->info,810.options =811{812.halt_at_entry = screen->options.halt_shaders,813},814.shader_type = sel->info.stage,815.wave_size = si_get_shader_wave_size(shader),816.num_parts = num_parts,817.elf_ptrs = part_elfs,818.elf_sizes = part_sizes,819.num_shared_lds_symbols = num_lds_symbols,820.shared_lds_symbols = lds_symbols});821822if (rtld->lds_size > 0) {823unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;824shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity;825}826827return ok;828}829830static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)831{832struct ac_rtld_binary rtld;833si_shader_binary_open(screen, shader, &rtld);834return rtld.exec_size;835}836837static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)838{839uint64_t *scratch_va = data;840841if (!strcmp(scratch_rsrc_dword0_symbol, name)) {842*value = (uint32_t)*scratch_va;843return true;844}845if (!strcmp(scratch_rsrc_dword1_symbol, name)) {846/* Enable scratch coalescing. */847*value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1);848return true;849}850851return false;852}853854bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,855uint64_t scratch_va)856{857struct ac_rtld_binary binary;858if (!si_shader_binary_open(sscreen, shader, &binary))859return false;860861si_resource_reference(&shader->bo, NULL);862shader->bo = si_aligned_buffer_create(863&sscreen->b,864(sscreen->info.cpdma_prefetch_writes_memory ?8650 : SI_RESOURCE_FLAG_READ_ONLY) | SI_RESOURCE_FLAG_DRIVER_INTERNAL,866PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256);867if (!shader->bo)868return false;869870/* Upload. */871struct ac_rtld_upload_info u = {};872u.binary = &binary;873u.get_external_symbol = si_get_external_symbol;874u.cb_data = &scratch_va;875u.rx_va = shader->bo->gpu_address;876u.rx_ptr = sscreen->ws->buffer_map(sscreen->ws,877shader->bo->buf, NULL,878PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);879if (!u.rx_ptr)880return false;881882int size = ac_rtld_upload(&u);883884if (sscreen->debug_flags & DBG(SQTT)) {885/* Remember the uploaded code */886shader->binary.uploaded_code_size = size;887shader->binary.uploaded_code = malloc(size);888memcpy(shader->binary.uploaded_code, u.rx_ptr, size);889}890891sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);892ac_rtld_close(&binary);893894return size >= 0;895}896897static void si_shader_dump_disassembly(struct si_screen *screen,898const struct si_shader_binary *binary,899gl_shader_stage stage, unsigned wave_size,900struct pipe_debug_callback *debug, const char *name,901FILE *file)902{903struct ac_rtld_binary rtld_binary;904905if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){906.info = &screen->info,907.shader_type = stage,908.wave_size = wave_size,909.num_parts = 1,910.elf_ptrs = &binary->elf_buffer,911.elf_sizes = &binary->elf_size}))912return;913914const char *disasm;915size_t nbytes;916917if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))918goto out;919920if (nbytes > INT_MAX)921goto out;922923if (debug && debug->debug_message) {924/* Very long debug messages are cut off, so send the925* disassembly one line at a time. This causes more926* overhead, but on the plus side it simplifies927* parsing of resulting logs.928*/929pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");930931uint64_t line = 0;932while (line < nbytes) {933int count = nbytes - line;934const char *nl = memchr(disasm + line, '\n', nbytes - line);935if (nl)936count = nl - (disasm + line);937938if (count) {939pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);940}941942line += count + 1;943}944945pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End");946}947948if (file) {949fprintf(file, "Shader %s disassembly:\n", name);950fprintf(file, "%*s", (int)nbytes, disasm);951}952953out:954ac_rtld_close(&rtld_binary);955}956957static void si_calculate_max_simd_waves(struct si_shader *shader)958{959struct si_screen *sscreen = shader->selector->screen;960struct ac_shader_config *conf = &shader->config;961unsigned num_inputs = shader->selector->info.num_inputs;962unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;963unsigned lds_per_wave = 0;964unsigned max_simd_waves;965966max_simd_waves = sscreen->info.max_wave64_per_simd;967968/* Compute LDS usage for PS. */969switch (shader->selector->info.stage) {970case MESA_SHADER_FRAGMENT:971/* The minimum usage per wave is (num_inputs * 48). The maximum972* usage is (num_inputs * 48 * 16).973* We can get anything in between and it varies between waves.974*975* The 48 bytes per input for a single primitive is equal to976* 4 bytes/component * 4 components/input * 3 points.977*978* Other stages don't know the size at compile time or don't979* allocate LDS per wave, but instead they do it per thread group.980*/981lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment);982break;983case MESA_SHADER_COMPUTE: {984unsigned max_workgroup_size = si_get_max_workgroup_size(shader);985lds_per_wave = (conf->lds_size * lds_increment) /986DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size);987}988break;989default:;990}991992/* Compute the per-SIMD wave counts. */993if (conf->num_sgprs) {994max_simd_waves =995MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);996}997998if (conf->num_vgprs) {999/* Always print wave limits as Wave64, so that we can compare1000* Wave32 and Wave64 with shader-db fairly. */1001unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;1002max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);1003}10041005unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;1006if (lds_per_wave)1007max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);10081009shader->info.max_simd_waves = max_simd_waves;1010}10111012void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,1013struct pipe_debug_callback *debug)1014{1015const struct ac_shader_config *conf = &shader->config;10161017if (screen->options.debug_disassembly)1018si_shader_dump_disassembly(screen, &shader->binary, shader->selector->info.stage,1019si_get_shader_wave_size(shader), debug, "main", NULL);10201021pipe_debug_message(debug, SHADER_INFO,1022"Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "1023"LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "1024"Spilled VGPRs: %d PrivMem VGPRs: %d",1025conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),1026conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,1027conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs);1028}10291030static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,1031bool check_debug_option)1032{1033const struct ac_shader_config *conf = &shader->config;10341035if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->info.stage)) {1036if (shader->selector->info.stage == MESA_SHADER_FRAGMENT) {1037fprintf(file,1038"*** SHADER CONFIG ***\n"1039"SPI_PS_INPUT_ADDR = 0x%04x\n"1040"SPI_PS_INPUT_ENA = 0x%04x\n",1041conf->spi_ps_input_addr, conf->spi_ps_input_ena);1042}10431044fprintf(file,1045"*** SHADER STATS ***\n"1046"SGPRS: %d\n"1047"VGPRS: %d\n"1048"Spilled SGPRs: %d\n"1049"Spilled VGPRs: %d\n"1050"Private memory VGPRs: %d\n"1051"Code Size: %d bytes\n"1052"LDS: %d blocks\n"1053"Scratch: %d bytes per wave\n"1054"Max Waves: %d\n"1055"********************\n\n\n",1056conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,1057shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),1058conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves);1059}1060}10611062const char *si_get_shader_name(const struct si_shader *shader)1063{1064switch (shader->selector->info.stage) {1065case MESA_SHADER_VERTEX:1066if (shader->key.as_es)1067return "Vertex Shader as ES";1068else if (shader->key.as_ls)1069return "Vertex Shader as LS";1070else if (shader->key.opt.vs_as_prim_discard_cs)1071return "Vertex Shader as Primitive Discard CS";1072else if (shader->key.as_ngg)1073return "Vertex Shader as ESGS";1074else1075return "Vertex Shader as VS";1076case MESA_SHADER_TESS_CTRL:1077return "Tessellation Control Shader";1078case MESA_SHADER_TESS_EVAL:1079if (shader->key.as_es)1080return "Tessellation Evaluation Shader as ES";1081else if (shader->key.as_ngg)1082return "Tessellation Evaluation Shader as ESGS";1083else1084return "Tessellation Evaluation Shader as VS";1085case MESA_SHADER_GEOMETRY:1086if (shader->is_gs_copy_shader)1087return "GS Copy Shader as VS";1088else1089return "Geometry Shader";1090case MESA_SHADER_FRAGMENT:1091return "Pixel Shader";1092case MESA_SHADER_COMPUTE:1093return "Compute Shader";1094default:1095return "Unknown Shader";1096}1097}10981099void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,1100struct pipe_debug_callback *debug, FILE *file, bool check_debug_option)1101{1102gl_shader_stage stage = shader->selector->info.stage;11031104if (!check_debug_option || si_can_dump_shader(sscreen, stage))1105si_dump_shader_key(shader, file);11061107if (!check_debug_option && shader->binary.llvm_ir_string) {1108if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {1109fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));1110fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);1111}11121113fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));1114fprintf(file, "%s\n", shader->binary.llvm_ir_string);1115}11161117if (!check_debug_option ||1118(si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) {1119unsigned wave_size = si_get_shader_wave_size(shader);11201121fprintf(file, "\n%s:\n", si_get_shader_name(shader));11221123if (shader->prolog)1124si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, wave_size, debug,1125"prolog", file);1126if (shader->previous_stage)1127si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,1128wave_size, debug, "previous stage", file);1129if (shader->prolog2)1130si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, stage, wave_size,1131debug, "prolog2", file);11321133si_shader_dump_disassembly(sscreen, &shader->binary, stage, wave_size, debug, "main",1134file);11351136if (shader->epilog)1137si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, wave_size, debug,1138"epilog", file);1139fprintf(file, "\n");1140}11411142si_shader_dump_stats(sscreen, shader, file, check_debug_option);1143}11441145static void si_dump_shader_key_vs(const struct si_shader_key *key,1146const struct si_vs_prolog_bits *prolog, const char *prefix,1147FILE *f)1148{1149fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one);1150fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix,1151prolog->instance_divisor_is_fetched);1152fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", prefix,1153prolog->unpack_instance_id_from_vertex_id);1154fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix);11551156fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);1157fprintf(f, " mono.vs.fix_fetch = {");1158for (int i = 0; i < SI_MAX_ATTRIBS; i++) {1159union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];1160if (i)1161fprintf(f, ", ");1162if (!fix.bits)1163fprintf(f, "0");1164else1165fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,1166fix.u.format);1167}1168fprintf(f, "}\n");1169}11701171static void si_dump_shader_key(const struct si_shader *shader, FILE *f)1172{1173const struct si_shader_key *key = &shader->key;1174gl_shader_stage stage = shader->selector->info.stage;11751176fprintf(f, "SHADER KEY\n");11771178switch (stage) {1179case MESA_SHADER_VERTEX:1180si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f);1181fprintf(f, " as_es = %u\n", key->as_es);1182fprintf(f, " as_ls = %u\n", key->as_ls);1183fprintf(f, " as_ngg = %u\n", key->as_ngg);1184fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);1185fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs);1186fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]);1187fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed);1188fprintf(f, " opt.cs_instancing = %u\n", key->opt.cs_instancing);1189fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first);1190fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front);1191fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back);1192break;11931194case MESA_SHADER_TESS_CTRL:1195if (shader->selector->screen->info.chip_class >= GFX9) {1196si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, "part.tcs.ls_prolog", f);1197}1198fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);1199fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n",1200key->mono.u.ff_tcs_inputs_to_copy);1201fprintf(f, " opt.prefer_mono = %u\n", key->opt.prefer_mono);1202fprintf(f, " opt.same_patch_vertices = %u\n", key->opt.same_patch_vertices);1203break;12041205case MESA_SHADER_TESS_EVAL:1206fprintf(f, " as_es = %u\n", key->as_es);1207fprintf(f, " as_ngg = %u\n", key->as_ngg);1208fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);1209break;12101211case MESA_SHADER_GEOMETRY:1212if (shader->is_gs_copy_shader)1213break;12141215if (shader->selector->screen->info.chip_class >= GFX9 &&1216key->part.gs.es->info.stage == MESA_SHADER_VERTEX) {1217si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f);1218}1219fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n",1220key->part.gs.prolog.tri_strip_adj_fix);1221fprintf(f, " as_ngg = %u\n", key->as_ngg);1222break;12231224case MESA_SHADER_COMPUTE:1225break;12261227case MESA_SHADER_FRAGMENT:1228fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);1229fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);1230fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);1231fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n",1232key->part.ps.prolog.force_persp_sample_interp);1233fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n",1234key->part.ps.prolog.force_linear_sample_interp);1235fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n",1236key->part.ps.prolog.force_persp_center_interp);1237fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n",1238key->part.ps.prolog.force_linear_center_interp);1239fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n",1240key->part.ps.prolog.bc_optimize_for_persp);1241fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n",1242key->part.ps.prolog.bc_optimize_for_linear);1243fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n",1244key->part.ps.prolog.samplemask_log_ps_iter);1245fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n",1246key->part.ps.epilog.spi_shader_col_format);1247fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);1248fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);1249fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);1250fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);1251fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);1252fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n",1253key->part.ps.epilog.poly_line_smoothing);1254fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);1255fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n",1256key->mono.u.ps.interpolate_at_sample_force_center);1257fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);1258fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);1259fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);1260break;12611262default:1263assert(0);1264}12651266if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||1267stage == MESA_SHADER_VERTEX) &&1268!key->as_es && !key->as_ls) {1269fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs);1270fprintf(f, " opt.kill_pointsize = 0x%x\n", key->opt.kill_pointsize);1271fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->opt.kill_clip_distances);1272if (stage != MESA_SHADER_GEOMETRY)1273fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);1274}12751276fprintf(f, " opt.prefer_mono = %u\n", key->opt.prefer_mono);1277fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",1278key->opt.inline_uniforms,1279key->opt.inlined_uniform_values[0],1280key->opt.inlined_uniform_values[1],1281key->opt.inlined_uniform_values[2],1282key->opt.inlined_uniform_values[3]);1283}12841285bool si_vs_needs_prolog(const struct si_shader_selector *sel,1286const struct si_vs_prolog_bits *prolog_key,1287const struct si_shader_key *key, bool ngg_cull_shader)1288{1289/* VGPR initialization fixup for Vega10 and Raven is always done in the1290* VS prolog. */1291return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix ||1292prolog_key->unpack_instance_id_from_vertex_id ||1293/* The 2nd VS prolog loads input VGPRs from LDS */1294(key->opt.ngg_culling && !ngg_cull_shader) ||1295/* The 1st VS prolog generates input VGPRs for fast launch. */1296(ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);1297}12981299/**1300* Compute the VS prolog key, which contains all the information needed to1301* build the VS prolog function, and set shader->info bits where needed.1302*1303* \param info Shader info of the vertex shader.1304* \param num_input_sgprs Number of input SGPRs for the vertex shader.1305* \param has_old_ Whether the preceding shader part is the NGG cull shader.1306* \param prolog_key Key of the VS prolog1307* \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.1308* \param key Output shader part key.1309*/1310void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,1311bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,1312struct si_shader *shader_out, union si_shader_part_key *key)1313{1314memset(key, 0, sizeof(*key));1315key->vs_prolog.states = *prolog_key;1316key->vs_prolog.num_input_sgprs = num_input_sgprs;1317key->vs_prolog.num_inputs = info->num_inputs;1318key->vs_prolog.as_ls = shader_out->key.as_ls;1319key->vs_prolog.as_es = shader_out->key.as_es;1320key->vs_prolog.as_ngg = shader_out->key.as_ngg;1321key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs;13221323if (ngg_cull_shader) {1324key->vs_prolog.gs_fast_launch_tri_list =1325!!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);1326key->vs_prolog.gs_fast_launch_tri_strip =1327!!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);1328key->vs_prolog.gs_fast_launch_index_size_packed =1329SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(shader_out->key.opt.ngg_culling);1330} else if (shader_out->key.opt.ngg_culling) {1331key->vs_prolog.load_vgprs_after_culling = 1;1332}13331334if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {1335key->vs_prolog.as_ls = 1;1336key->vs_prolog.num_merged_next_stage_vgprs = 2;1337} else if (shader_out->selector->info.stage == MESA_SHADER_GEOMETRY) {1338key->vs_prolog.as_es = 1;1339key->vs_prolog.num_merged_next_stage_vgprs = 5;1340} else if (shader_out->key.as_ngg) {1341key->vs_prolog.num_merged_next_stage_vgprs = 5;1342}13431344/* Only one of these combinations can be set. as_ngg can be set with as_es. */1345assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +1346(key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <=13471);13481349/* Enable loading the InstanceID VGPR. */1350uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);13511352if ((key->vs_prolog.states.instance_divisor_is_one |1353key->vs_prolog.states.instance_divisor_is_fetched) &1354input_mask)1355shader_out->info.uses_instanceid = true;1356}13571358struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel,1359const struct si_shader_key *key,1360bool *free_nir)1361{1362nir_shader *nir;1363*free_nir = false;13641365if (sel->nir) {1366nir = sel->nir;1367} else if (sel->nir_binary) {1368struct pipe_screen *screen = &sel->screen->b;1369const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,1370pipe_shader_type_from_mesa(sel->info.stage));13711372struct blob_reader blob_reader;1373blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);1374*free_nir = true;1375nir = nir_deserialize(NULL, options, &blob_reader);1376} else {1377return NULL;1378}13791380if (key && key->opt.inline_uniforms) {1381assert(*free_nir);13821383/* Most places use shader information from the default variant, not1384* the optimized variant. These are the things that the driver looks at1385* in optimized variants and the list of things that we need to do.1386*1387* The driver takes into account these things if they suddenly disappear1388* from the shader code:1389* - Register usage and code size decrease (obvious)1390* - Eliminated PS system values are disabled by LLVM1391* (FragCoord, FrontFace, barycentrics)1392* - VS/TES/GS outputs feeding PS are eliminated if outputs are undef.1393* (thanks to an LLVM pass in Mesa - TODO: move it to NIR)1394* The storage for eliminated outputs is also not allocated.1395* - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)1396* - TCS output stores are eliminated1397*1398* TODO: These are things the driver ignores in the final shader code1399* and relies on the default shader info.1400* - Other system values are not eliminated1401* - PS.NUM_INTERP = bitcount64(inputs_read), renumber inputs1402* to remove holes1403* - uses_discard - if it changed to false1404* - writes_memory - if it changed to false1405* - VS->TCS, VS->GS, TES->GS output stores for the former stage are not1406* eliminated1407* - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)1408* GS outputs are eliminated except for the temporary LDS.1409* Clip distances, gl_PointSize, and PS outputs are eliminated based1410* on current states, so we don't care about the shader code.1411*1412* TODO: Merged shaders don't inline uniforms for the first stage.1413* VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.1414* (key == NULL for the first stage here)1415*1416* TODO: Compute shaders don't support inlinable uniforms, because they1417* don't have shader variants.1418*1419* TODO: The driver uses a linear search to find a shader variant. This1420* can be really slow if we get too many variants due to uniform inlining.1421*/1422NIR_PASS_V(nir, nir_inline_uniforms,1423nir->info.num_inlinable_uniforms,1424key->opt.inlined_uniform_values,1425nir->info.inlinable_uniform_dw_offsets);14261427si_nir_opts(sel->screen, nir, true);1428si_nir_late_opts(nir);14291430/* This must be done again. */1431NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in |1432nir_var_shader_out);1433}14341435return nir;1436}14371438bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,1439struct si_shader *shader, struct pipe_debug_callback *debug)1440{1441struct si_shader_selector *sel = shader->selector;1442bool free_nir;1443struct nir_shader *nir = si_get_nir_shader(sel, &shader->key, &free_nir);14441445/* Dump NIR before doing NIR->LLVM conversion in case the1446* conversion fails. */1447if (si_can_dump_shader(sscreen, sel->info.stage) &&1448!(sscreen->debug_flags & DBG(NO_NIR))) {1449nir_print_shader(nir, stderr);1450si_dump_streamout(&sel->so);1451}14521453memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,1454sizeof(shader->info.vs_output_param_offset));14551456shader->info.uses_instanceid = sel->info.uses_instanceid;14571458/* TODO: ACO could compile non-monolithic shaders here (starting1459* with PS and NGG VS), but monolithic shaders should be compiled1460* by LLVM due to more complicated compilation.1461*/1462if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))1463return false;14641465/* Validate SGPR and VGPR usage for compute to detect compiler bugs. */1466if (sel->info.stage == MESA_SHADER_COMPUTE) {1467unsigned wave_size = sscreen->compute_wave_size;1468unsigned max_vgprs =1469sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1);1470unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;1471unsigned max_sgprs_per_wave = 128;1472unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */1473unsigned threads_per_tg = si_get_max_workgroup_size(shader);1474unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);1475unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);14761477max_vgprs = max_vgprs / waves_per_simd;1478max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);14791480if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {1481fprintf(stderr,1482"LLVM failed to compile a shader correctly: "1483"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",1484shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);14851486/* Just terminate the process, because dependent1487* shaders can hang due to bad input data, but use1488* the env var to allow shader-db to work.1489*/1490if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))1491abort();1492}1493}14941495/* Add the scratch offset to input SGPRs. */1496if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))1497shader->info.num_input_sgprs += 1; /* scratch byte offset */14981499/* Calculate the number of fragment input VGPRs. */1500if (sel->info.stage == MESA_SHADER_FRAGMENT) {1501shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(1502&shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index);1503}15041505si_calculate_max_simd_waves(shader);1506si_shader_dump_stats_for_shader_db(sscreen, shader, debug);1507return true;1508}15091510/**1511* Create, compile and return a shader part (prolog or epilog).1512*1513* \param sscreen screen1514* \param list list of shader parts of the same category1515* \param type shader type1516* \param key shader part key1517* \param prolog whether the part being requested is a prolog1518* \param tm LLVM target machine1519* \param debug debug callback1520* \param build the callback responsible for building the main function1521* \return non-NULL on success1522*/1523static struct si_shader_part *1524si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,1525gl_shader_stage stage, bool prolog, union si_shader_part_key *key,1526struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug,1527void (*build)(struct si_shader_context *, union si_shader_part_key *),1528const char *name)1529{1530struct si_shader_part *result;15311532simple_mtx_lock(&sscreen->shader_parts_mutex);15331534/* Find existing. */1535for (result = *list; result; result = result->next) {1536if (memcmp(&result->key, key, sizeof(*key)) == 0) {1537simple_mtx_unlock(&sscreen->shader_parts_mutex);1538return result;1539}1540}15411542/* Compile a new one. */1543result = CALLOC_STRUCT(si_shader_part);1544result->key = *key;15451546struct si_shader_selector sel = {};1547sel.screen = sscreen;15481549struct si_shader shader = {};1550shader.selector = &sel;15511552switch (stage) {1553case MESA_SHADER_VERTEX:1554shader.key.as_ls = key->vs_prolog.as_ls;1555shader.key.as_es = key->vs_prolog.as_es;1556shader.key.as_ngg = key->vs_prolog.as_ngg;1557shader.key.opt.ngg_culling =1558(key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |1559(key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) |1560SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed);1561shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;1562break;1563case MESA_SHADER_TESS_CTRL:1564assert(!prolog);1565shader.key.part.tcs.epilog = key->tcs_epilog.states;1566break;1567case MESA_SHADER_GEOMETRY:1568assert(prolog);1569shader.key.as_ngg = key->gs_prolog.as_ngg;1570break;1571case MESA_SHADER_FRAGMENT:1572if (prolog)1573shader.key.part.ps.prolog = key->ps_prolog.states;1574else1575shader.key.part.ps.epilog = key->ps_epilog.states;1576break;1577default:1578unreachable("bad shader part");1579}15801581struct si_shader_context ctx;1582si_llvm_context_init(&ctx, sscreen, compiler,1583si_get_wave_size(sscreen, stage,1584shader.key.as_ngg, shader.key.as_es,1585shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,1586shader.key.opt.vs_as_prim_discard_cs));1587ctx.shader = &shader;1588ctx.stage = stage;15891590build(&ctx, key);15911592/* Compile. */1593si_llvm_optimize_module(&ctx);15941595if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug,1596ctx.stage, name, false)) {1597FREE(result);1598result = NULL;1599goto out;1600}16011602result->next = *list;1603*list = result;16041605out:1606si_llvm_dispose(&ctx);1607simple_mtx_unlock(&sscreen->shader_parts_mutex);1608return result;1609}16101611static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,1612struct si_shader *shader, struct pipe_debug_callback *debug,1613struct si_shader *main_part, const struct si_vs_prolog_bits *key)1614{1615struct si_shader_selector *vs = main_part->selector;16161617if (!si_vs_needs_prolog(vs, key, &shader->key, false))1618return true;16191620/* Get the prolog. */1621union si_shader_part_key prolog_key;1622si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,1623&prolog_key);16241625shader->prolog =1626si_get_shader_part(sscreen, &sscreen->vs_prologs, MESA_SHADER_VERTEX, true, &prolog_key,1627compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog");1628return shader->prolog != NULL;1629}16301631/**1632* Select and compile (or reuse) vertex shader parts (prolog & epilog).1633*/1634static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,1635struct si_shader *shader, struct pipe_debug_callback *debug)1636{1637return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog);1638}16391640/**1641* Select and compile (or reuse) TCS parts (epilog).1642*/1643static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,1644struct si_shader *shader, struct pipe_debug_callback *debug)1645{1646if (sscreen->info.chip_class >= GFX9) {1647struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls;16481649if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,1650&shader->key.part.tcs.ls_prolog))1651return false;16521653shader->previous_stage = ls_main_part;1654}16551656/* Get the epilog. */1657union si_shader_part_key epilog_key;1658memset(&epilog_key, 0, sizeof(epilog_key));1659epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;16601661shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false,1662&epilog_key, compiler, debug, si_llvm_build_tcs_epilog,1663"Tessellation Control Shader Epilog");1664return shader->epilog != NULL;1665}16661667/**1668* Select and compile (or reuse) GS parts (prolog).1669*/1670static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,1671struct si_shader *shader, struct pipe_debug_callback *debug)1672{1673if (sscreen->info.chip_class >= GFX9) {1674struct si_shader *es_main_part;16751676if (shader->key.as_ngg)1677es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;1678else1679es_main_part = shader->key.part.gs.es->main_shader_part_es;16801681if (shader->key.part.gs.es->info.stage == MESA_SHADER_VERTEX &&1682!si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,1683&shader->key.part.gs.vs_prolog))1684return false;16851686shader->previous_stage = es_main_part;1687}16881689if (!shader->key.part.gs.prolog.tri_strip_adj_fix)1690return true;16911692union si_shader_part_key prolog_key;1693memset(&prolog_key, 0, sizeof(prolog_key));1694prolog_key.gs_prolog.states = shader->key.part.gs.prolog;1695prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;16961697shader->prolog2 =1698si_get_shader_part(sscreen, &sscreen->gs_prologs, MESA_SHADER_GEOMETRY, true, &prolog_key,1699compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog");1700return shader->prolog2 != NULL;1701}17021703/**1704* Compute the PS prolog key, which contains all the information needed to1705* build the PS prolog function, and set related bits in shader->config.1706*/1707void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,1708bool separate_prolog)1709{1710struct si_shader_info *info = &shader->selector->info;17111712memset(key, 0, sizeof(*key));1713key->ps_prolog.states = shader->key.part.ps.prolog;1714key->ps_prolog.colors_read = info->colors_read;1715key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;1716key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;1717key->ps_prolog.wqm =1718info->base.fs.needs_quad_helper_invocations &&1719(key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||1720key->ps_prolog.states.force_linear_sample_interp ||1721key->ps_prolog.states.force_persp_center_interp ||1722key->ps_prolog.states.force_linear_center_interp ||1723key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);1724key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;17251726if (info->colors_read) {1727ubyte *color = shader->selector->color_attr_index;17281729if (shader->key.part.ps.prolog.color_two_side) {1730/* BCOLORs are stored after the last input. */1731key->ps_prolog.num_interp_inputs = info->num_inputs;1732key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;1733if (separate_prolog)1734shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);1735}17361737for (unsigned i = 0; i < 2; i++) {1738unsigned interp = info->color_interpolate[i];1739unsigned location = info->color_interpolate_loc[i];17401741if (!(info->colors_read & (0xf << i * 4)))1742continue;17431744key->ps_prolog.color_attr_index[i] = color[i];17451746if (shader->key.part.ps.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)1747interp = INTERP_MODE_FLAT;17481749switch (interp) {1750case INTERP_MODE_FLAT:1751key->ps_prolog.color_interp_vgpr_index[i] = -1;1752break;1753case INTERP_MODE_SMOOTH:1754case INTERP_MODE_COLOR:1755/* Force the interpolation location for colors here. */1756if (shader->key.part.ps.prolog.force_persp_sample_interp)1757location = TGSI_INTERPOLATE_LOC_SAMPLE;1758if (shader->key.part.ps.prolog.force_persp_center_interp)1759location = TGSI_INTERPOLATE_LOC_CENTER;17601761switch (location) {1762case TGSI_INTERPOLATE_LOC_SAMPLE:1763key->ps_prolog.color_interp_vgpr_index[i] = 0;1764if (separate_prolog) {1765shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);1766}1767break;1768case TGSI_INTERPOLATE_LOC_CENTER:1769key->ps_prolog.color_interp_vgpr_index[i] = 2;1770if (separate_prolog) {1771shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);1772}1773break;1774case TGSI_INTERPOLATE_LOC_CENTROID:1775key->ps_prolog.color_interp_vgpr_index[i] = 4;1776if (separate_prolog) {1777shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);1778}1779break;1780default:1781assert(0);1782}1783break;1784case INTERP_MODE_NOPERSPECTIVE:1785/* Force the interpolation location for colors here. */1786if (shader->key.part.ps.prolog.force_linear_sample_interp)1787location = TGSI_INTERPOLATE_LOC_SAMPLE;1788if (shader->key.part.ps.prolog.force_linear_center_interp)1789location = TGSI_INTERPOLATE_LOC_CENTER;17901791/* The VGPR assignment for non-monolithic shaders1792* works because InitialPSInputAddr is set on the1793* main shader and PERSP_PULL_MODEL is never used.1794*/1795switch (location) {1796case TGSI_INTERPOLATE_LOC_SAMPLE:1797key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9;1798if (separate_prolog) {1799shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);1800}1801break;1802case TGSI_INTERPOLATE_LOC_CENTER:1803key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11;1804if (separate_prolog) {1805shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);1806}1807break;1808case TGSI_INTERPOLATE_LOC_CENTROID:1809key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13;1810if (separate_prolog) {1811shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);1812}1813break;1814default:1815assert(0);1816}1817break;1818default:1819assert(0);1820}1821}1822}1823}18241825/**1826* Check whether a PS prolog is required based on the key.1827*/1828bool si_need_ps_prolog(const union si_shader_part_key *key)1829{1830return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||1831key->ps_prolog.states.force_linear_sample_interp ||1832key->ps_prolog.states.force_persp_center_interp ||1833key->ps_prolog.states.force_linear_center_interp ||1834key->ps_prolog.states.bc_optimize_for_persp ||1835key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||1836key->ps_prolog.states.samplemask_log_ps_iter;1837}18381839/**1840* Compute the PS epilog key, which contains all the information needed to1841* build the PS epilog function.1842*/1843void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)1844{1845struct si_shader_info *info = &shader->selector->info;1846memset(key, 0, sizeof(*key));1847key->ps_epilog.colors_written = info->colors_written;1848key->ps_epilog.color_types = info->output_color_types;1849key->ps_epilog.writes_z = info->writes_z;1850key->ps_epilog.writes_stencil = info->writes_stencil;1851key->ps_epilog.writes_samplemask = info->writes_samplemask;1852key->ps_epilog.states = shader->key.part.ps.epilog;1853}18541855/**1856* Select and compile (or reuse) pixel shader parts (prolog & epilog).1857*/1858static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,1859struct si_shader *shader, struct pipe_debug_callback *debug)1860{1861union si_shader_part_key prolog_key;1862union si_shader_part_key epilog_key;18631864/* Get the prolog. */1865si_get_ps_prolog_key(shader, &prolog_key, true);18661867/* The prolog is a no-op if these aren't set. */1868if (si_need_ps_prolog(&prolog_key)) {1869shader->prolog =1870si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,1871compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog");1872if (!shader->prolog)1873return false;1874}18751876/* Get the epilog. */1877si_get_ps_epilog_key(shader, &epilog_key);18781879shader->epilog =1880si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,1881compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog");1882if (!shader->epilog)1883return false;18841885/* Enable POS_FIXED_PT if polygon stippling is enabled. */1886if (shader->key.part.ps.prolog.poly_stipple) {1887shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);1888assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));1889}18901891/* Set up the enable bits for per-sample shading if needed. */1892if (shader->key.part.ps.prolog.force_persp_sample_interp &&1893(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||1894G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {1895shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;1896shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;1897shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);1898}1899if (shader->key.part.ps.prolog.force_linear_sample_interp &&1900(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||1901G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {1902shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;1903shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;1904shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);1905}1906if (shader->key.part.ps.prolog.force_persp_center_interp &&1907(G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||1908G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {1909shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;1910shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;1911shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);1912}1913if (shader->key.part.ps.prolog.force_linear_center_interp &&1914(G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||1915G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {1916shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;1917shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;1918shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);1919}19201921/* POW_W_FLOAT requires that one of the perspective weights is enabled. */1922if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&1923!(shader->config.spi_ps_input_ena & 0xf)) {1924shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);1925assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));1926}19271928/* At least one pair of interpolation weights must be enabled. */1929if (!(shader->config.spi_ps_input_ena & 0x7f)) {1930shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);1931assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));1932}19331934/* Samplemask fixup requires the sample ID. */1935if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {1936shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);1937assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));1938}19391940/* The sample mask input is always enabled, because the API shader always1941* passes it through to the epilog. Disable it here if it's unused.1942*/1943if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask)1944shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;19451946return true;1947}19481949void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)1950{1951/* If tessellation is all offchip and on-chip GS isn't used, this1952* workaround is not needed.1953*/1954return;19551956/* SPI barrier management bug:1957* Make sure we have at least 4k of LDS in use to avoid the bug.1958* It applies to workgroup sizes of more than one wavefront.1959*/1960if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)1961*lds_size = MAX2(*lds_size, 8);1962}19631964void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)1965{1966unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */19671968shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);19691970if (shader->selector->info.stage == MESA_SHADER_COMPUTE &&1971si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {1972si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);1973}1974}19751976bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,1977struct si_shader *shader, struct pipe_debug_callback *debug)1978{1979struct si_shader_selector *sel = shader->selector;1980struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);19811982/* LS, ES, VS are compiled on demand if the main part hasn't been1983* compiled for that stage.1984*1985* GS are compiled on demand if the main part hasn't been compiled1986* for the chosen NGG-ness.1987*1988* Vertex shaders are compiled on demand when a vertex fetch1989* workaround must be applied.1990*/1991if (shader->is_monolithic) {1992/* Monolithic shader (compiled as a whole, has many variants,1993* may take a long time to compile).1994*/1995if (!si_compile_shader(sscreen, compiler, shader, debug))1996return false;1997} else {1998/* The shader consists of several parts:1999*2000* - the middle part is the user shader, it has 1 variant only2001* and it was compiled during the creation of the shader2002* selector2003* - the prolog part is inserted at the beginning2004* - the epilog part is inserted at the end2005*2006* The prolog and epilog have many (but simple) variants.2007*2008* Starting with gfx9, geometry and tessellation control2009* shaders also contain the prolog and user shader parts of2010* the previous shader stage.2011*/20122013if (!mainp)2014return false;20152016/* Copy the compiled shader data over. */2017shader->is_binary_shared = true;2018shader->binary = mainp->binary;2019shader->config = mainp->config;2020shader->info.num_input_sgprs = mainp->info.num_input_sgprs;2021shader->info.num_input_vgprs = mainp->info.num_input_vgprs;2022shader->info.face_vgpr_index = mainp->info.face_vgpr_index;2023shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;2024memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset,2025sizeof(mainp->info.vs_output_param_offset));2026shader->info.uses_instanceid = mainp->info.uses_instanceid;2027shader->info.nr_pos_exports = mainp->info.nr_pos_exports;2028shader->info.nr_param_exports = mainp->info.nr_param_exports;20292030/* Select prologs and/or epilogs. */2031switch (sel->info.stage) {2032case MESA_SHADER_VERTEX:2033if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))2034return false;2035break;2036case MESA_SHADER_TESS_CTRL:2037if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))2038return false;2039break;2040case MESA_SHADER_TESS_EVAL:2041break;2042case MESA_SHADER_GEOMETRY:2043if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))2044return false;2045break;2046case MESA_SHADER_FRAGMENT:2047if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))2048return false;20492050/* Make sure we have at least as many VGPRs as there2051* are allocated inputs.2052*/2053shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);2054break;2055default:;2056}20572058/* Update SGPR and VGPR counts. */2059if (shader->prolog) {2060shader->config.num_sgprs =2061MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);2062shader->config.num_vgprs =2063MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);2064}2065if (shader->previous_stage) {2066shader->config.num_sgprs =2067MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);2068shader->config.num_vgprs =2069MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);2070shader->config.spilled_sgprs =2071MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);2072shader->config.spilled_vgprs =2073MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);2074shader->info.private_mem_vgprs =2075MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);2076shader->config.scratch_bytes_per_wave =2077MAX2(shader->config.scratch_bytes_per_wave,2078shader->previous_stage->config.scratch_bytes_per_wave);2079shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;2080}2081if (shader->prolog2) {2082shader->config.num_sgprs =2083MAX2(shader->config.num_sgprs, shader->prolog2->config.num_sgprs);2084shader->config.num_vgprs =2085MAX2(shader->config.num_vgprs, shader->prolog2->config.num_vgprs);2086}2087if (shader->epilog) {2088shader->config.num_sgprs =2089MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);2090shader->config.num_vgprs =2091MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);2092}2093si_calculate_max_simd_waves(shader);2094}20952096if (shader->key.as_ngg) {2097assert(!shader->key.as_es && !shader->key.as_ls);2098if (!gfx10_ngg_calculate_subgroup_info(shader)) {2099fprintf(stderr, "Failed to compute subgroup info\n");2100return false;2101}2102} else if (sscreen->info.chip_class >= GFX9 && sel->info.stage == MESA_SHADER_GEOMETRY) {2103gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);2104}21052106shader->uses_vs_state_provoking_vertex =2107sscreen->use_ngg &&2108/* Used to convert triangle strips from GS to triangles. */2109((sel->info.stage == MESA_SHADER_GEOMETRY &&2110util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||2111(sel->info.stage == MESA_SHADER_VERTEX &&2112/* Used to export PrimitiveID from the correct vertex. */2113(shader->key.mono.u.vs_export_prim_id ||2114/* Used to generate triangle strip vertex IDs for all threads. */2115shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP)));21162117shader->uses_vs_state_outprim = sscreen->use_ngg &&2118/* Only used by streamout in vertex shaders. */2119sel->info.stage == MESA_SHADER_VERTEX &&2120sel->so.num_outputs;21212122if (sel->info.stage == MESA_SHADER_VERTEX) {2123shader->uses_base_instance = sel->info.uses_base_instance ||2124shader->key.part.vs.prolog.instance_divisor_is_one ||2125shader->key.part.vs.prolog.instance_divisor_is_fetched;2126} else if (sel->info.stage == MESA_SHADER_TESS_CTRL) {2127shader->uses_base_instance = shader->previous_stage_sel &&2128(shader->previous_stage_sel->info.uses_base_instance ||2129shader->key.part.tcs.ls_prolog.instance_divisor_is_one ||2130shader->key.part.tcs.ls_prolog.instance_divisor_is_fetched);2131} else if (sel->info.stage == MESA_SHADER_GEOMETRY) {2132shader->uses_base_instance = shader->previous_stage_sel &&2133(shader->previous_stage_sel->info.uses_base_instance ||2134shader->key.part.gs.vs_prolog.instance_divisor_is_one ||2135shader->key.part.gs.vs_prolog.instance_divisor_is_fetched);2136}21372138si_fix_resource_usage(sscreen, shader);2139si_shader_dump(sscreen, shader, debug, stderr, true);21402141/* Upload. */2142if (!si_shader_binary_upload(sscreen, shader, 0)) {2143fprintf(stderr, "LLVM failed to upload shader\n");2144return false;2145}21462147return true;2148}21492150void si_shader_binary_clean(struct si_shader_binary *binary)2151{2152free((void *)binary->elf_buffer);2153binary->elf_buffer = NULL;21542155free(binary->llvm_ir_string);2156binary->llvm_ir_string = NULL;21572158free(binary->uploaded_code);2159binary->uploaded_code = NULL;2160binary->uploaded_code_size = 0;2161}21622163void si_shader_destroy(struct si_shader *shader)2164{2165if (shader->scratch_bo)2166si_resource_reference(&shader->scratch_bo, NULL);21672168si_resource_reference(&shader->bo, NULL);21692170if (!shader->is_binary_shared)2171si_shader_binary_clean(&shader->binary);21722173free(shader->shader_log);2174}217521762177