Path: blob/21.2-virgl/src/amd/compiler/aco_instruction_selection_setup.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_instruction_selection.h"

#include "common/ac_exp_param.h"
#include "common/sid.h"
#include "vulkan/radv_descriptor_set.h"

#include "nir_control_flow.h"

#include <vector>

namespace aco {

namespace {

unsigned
get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp)
{
   switch (interp) {
   case INTERP_MODE_SMOOTH:
   case INTERP_MODE_NONE:
      if (intrin == nir_intrinsic_load_barycentric_pixel ||
          intrin == nir_intrinsic_load_barycentric_at_sample ||
          intrin == nir_intrinsic_load_barycentric_at_offset)
         return S_0286CC_PERSP_CENTER_ENA(1);
      else if (intrin == nir_intrinsic_load_barycentric_centroid)
         return S_0286CC_PERSP_CENTROID_ENA(1);
      else if (intrin == nir_intrinsic_load_barycentric_sample)
         return S_0286CC_PERSP_SAMPLE_ENA(1);
      break;
   case INTERP_MODE_NOPERSPECTIVE:
      if (intrin == nir_intrinsic_load_barycentric_pixel)
         return S_0286CC_LINEAR_CENTER_ENA(1);
      else if (intrin == nir_intrinsic_load_barycentric_centroid)
         return S_0286CC_LINEAR_CENTROID_ENA(1);
      else if (intrin == nir_intrinsic_load_barycentric_sample)
         return S_0286CC_LINEAR_SAMPLE_ENA(1);
      break;
   default: break;
   }
   return 0;
}

bool
is_loop_header_block(nir_block* block)
{
   return block->cf_node.parent->type == nir_cf_node_loop &&
          block == nir_loop_first_block(nir_cf_node_as_loop(block->cf_node.parent));
}

/* similar to nir_block_is_unreachable(), but does not require dominance information */
bool
is_block_reachable(nir_function_impl* impl, nir_block* known_reachable, nir_block* block)
{
   if (block == nir_start_block(impl) || block == known_reachable)
      return true;

   /* skip loop back-edges */
   if (is_loop_header_block(block)) {
      nir_loop* loop = nir_cf_node_as_loop(block->cf_node.parent);
      nir_block* preheader = nir_block_cf_tree_prev(nir_loop_first_block(loop));
      return is_block_reachable(impl, known_reachable, preheader);
   }

   set_foreach (block->predecessors, entry) {
      if (is_block_reachable(impl, known_reachable, (nir_block*)entry->key))
         return true;
   }

   return false;
}

/* Check whether the given SSA def is only used by cross-lane instructions. */
bool
only_used_by_cross_lane_instrs(nir_ssa_def* ssa, bool follow_phis = true)
{
   nir_foreach_use (src, ssa) {
      switch (src->parent_instr->type) {
      case nir_instr_type_alu: {
         nir_alu_instr* alu = nir_instr_as_alu(src->parent_instr);
         if (alu->op != nir_op_unpack_64_2x32_split_x && alu->op != nir_op_unpack_64_2x32_split_y)
            return false;
         if (!only_used_by_cross_lane_instrs(&alu->dest.dest.ssa, follow_phis))
            return false;

         continue;
      }
      case nir_instr_type_intrinsic: {
         nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(src->parent_instr);
         if (intrin->intrinsic != nir_intrinsic_read_invocation &&
             intrin->intrinsic != nir_intrinsic_read_first_invocation &&
             intrin->intrinsic != nir_intrinsic_lane_permute_16_amd)
            return false;

         continue;
      }
      case nir_instr_type_phi: {
         /* Don't follow more than one phi, to avoid infinite loops. */
         if (!follow_phis)
            return false;

         nir_phi_instr* phi = nir_instr_as_phi(src->parent_instr);
         if (!only_used_by_cross_lane_instrs(&phi->dest.ssa, false))
            return false;

         continue;
      }
      default: return false;
      }
   }

   return true;
}
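
/* Illustrative example (not from the original source): for a def %v used as
 *    %r = read_invocation %v, %lane
 * this returns true, because every use of %v is a cross-lane intrinsic (or an
 * unpack feeding one). The follow_phis flag limits the recursion to one level
 * of phis so that cyclic phi webs cannot recurse forever.
 */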

/* If one side of a divergent IF ends in a branch and the other doesn't, we
 * might have to emit the contents of the side without the branch at the merge
 * block instead. This is so that we can use any SGPR live-out of the side
 * without the branch without creating a linear phi in the invert or merge block. */
bool
sanitize_if(nir_function_impl* impl, nir_if* nif)
{
   // TODO: skip this if the condition is uniform and there are no divergent breaks/continues?

   nir_block* then_block = nir_if_last_then_block(nif);
   nir_block* else_block = nir_if_last_else_block(nif);
   bool then_jump = nir_block_ends_in_jump(then_block) ||
                    !is_block_reachable(impl, nir_if_first_then_block(nif), then_block);
   bool else_jump = nir_block_ends_in_jump(else_block) ||
                    !is_block_reachable(impl, nir_if_first_else_block(nif), else_block);
   if (then_jump == else_jump)
      return false;

   /* If the continue-from block is empty then return as there is nothing to
    * move.
    */
   if (nir_cf_list_is_empty_block(else_jump ? &nif->then_list : &nif->else_list))
      return false;

   /* Even though this if statement has a jump on one side, we may still have
    * phis afterwards. Single-source phis can be produced by loop unrolling
    * or dead control-flow passes and are perfectly legal. Run a quick phi
    * removal on the block after the if to clean up any such phis.
    */
   nir_opt_remove_phis_block(nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));

   /* Finally, move the continue-from branch after the if-statement. */
   nir_block* last_continue_from_blk = else_jump ? then_block : else_block;
   nir_block* first_continue_from_blk =
      else_jump ? nir_if_first_then_block(nif) : nir_if_first_else_block(nif);

   nir_cf_list tmp;
   nir_cf_extract(&tmp, nir_before_block(first_continue_from_blk),
                  nir_after_block(last_continue_from_blk));
   nir_cf_reinsert(&tmp, nir_after_cf_node(&nif->cf_node));

   return true;
}
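
/* Illustrative sketch (not from the original source) of what sanitize_if does:
 *
 *    if (cond) {                    if (cond) {
 *       break;                         break;
 *    } else {            ==>        } else {
 *       x = ...;                    }
 *    }                              x = ...;
 *    use(x);                        use(x);
 *
 * The non-jumping side is moved past the if, so an SGPR like x can be used
 * afterwards without a linear phi in the invert or merge block.
 */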

bool
sanitize_cf_list(nir_function_impl* impl, struct exec_list* cf_list)
{
   bool progress = false;
   foreach_list_typed (nir_cf_node, cf_node, node, cf_list) {
      switch (cf_node->type) {
      case nir_cf_node_block: break;
      case nir_cf_node_if: {
         nir_if* nif = nir_cf_node_as_if(cf_node);
         progress |= sanitize_cf_list(impl, &nif->then_list);
         progress |= sanitize_cf_list(impl, &nif->else_list);
         progress |= sanitize_if(impl, nif);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop* loop = nir_cf_node_as_loop(cf_node);
         progress |= sanitize_cf_list(impl, &loop->body);
         break;
      }
      case nir_cf_node_function: unreachable("Invalid cf type");
      }
   }

   return progress;
}

void
apply_nuw_to_ssa(isel_context* ctx, nir_ssa_def* ssa)
{
   nir_ssa_scalar scalar;
   scalar.def = ssa;
   scalar.comp = 0;

   if (!nir_ssa_scalar_is_alu(scalar) || nir_ssa_scalar_alu_op(scalar) != nir_op_iadd)
      return;

   nir_alu_instr* add = nir_instr_as_alu(ssa->parent_instr);

   if (add->no_unsigned_wrap)
      return;

   nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
   nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);

   if (nir_ssa_scalar_is_const(src0)) {
      nir_ssa_scalar tmp = src0;
      src0 = src1;
      src1 = tmp;
   }

   uint32_t src1_ub = nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, src1, &ctx->ub_config);
   add->no_unsigned_wrap =
      !nir_addition_might_overflow(ctx->shader, ctx->range_ht, src0, src1_ub, &ctx->ub_config);
}

void
apply_nuw_to_offsets(isel_context* ctx, nir_function_impl* impl)
{
   nir_foreach_block (block, impl) {
      nir_foreach_instr (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;
         nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(instr);

         switch (intrin->intrinsic) {
         case nir_intrinsic_load_constant:
         case nir_intrinsic_load_uniform:
         case nir_intrinsic_load_push_constant:
            if (!nir_src_is_divergent(intrin->src[0]))
               apply_nuw_to_ssa(ctx, intrin->src[0].ssa);
            break;
         case nir_intrinsic_load_ubo:
         case nir_intrinsic_load_ssbo:
            if (!nir_src_is_divergent(intrin->src[1]))
               apply_nuw_to_ssa(ctx, intrin->src[1].ssa);
            break;
         case nir_intrinsic_store_ssbo:
            if (!nir_src_is_divergent(intrin->src[2]))
               apply_nuw_to_ssa(ctx, intrin->src[2].ssa);
            break;
         default: break;
         }
      }
   }
}
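
/* Why no_unsigned_wrap matters here (illustrative, not from the original
 * source): for a uniform address such as
 *    %off = iadd %base, 16
 * proving the add cannot wrap around 2^32 lets instruction selection fold the
 * constant into the memory instruction's immediate offset instead of emitting
 * a separate scalar add.
 */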

RegClass
get_reg_class(isel_context* ctx, RegType type, unsigned components, unsigned bitsize)
{
   if (bitsize == 1)
      return RegClass(RegType::sgpr, ctx->program->lane_mask.size() * components);
   else
      return RegClass::get(type, components * bitsize / 8u);
}

void
setup_vs_output_info(isel_context* ctx, nir_shader* nir, bool export_prim_id,
                     bool export_clip_dists, radv_vs_output_info* outinfo)
{
   memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
          sizeof(outinfo->vs_output_param_offset));

   outinfo->param_exports = 0;
   int pos_written = 0x1;
   bool writes_primitive_shading_rate =
      outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates;
   if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer ||
       writes_primitive_shading_rate)
      pos_written |= 1 << 1;

   uint64_t mask = nir->info.outputs_written;
   while (mask) {
      int idx = u_bit_scan64(&mask);
      if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER ||
          idx == VARYING_SLOT_PRIMITIVE_ID || idx == VARYING_SLOT_VIEWPORT ||
          ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) &&
           export_clip_dists)) {
         if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
            outinfo->vs_output_param_offset[idx] = outinfo->param_exports++;
      }
   }
   if (outinfo->writes_layer &&
       outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
      /* when ctx->options->key.has_multiview_view_index = true, the layer
       * variable isn't declared in NIR and it's isel's job to get the layer */
      outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
   }

   if (export_prim_id) {
      assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
      outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
   }

   ctx->export_clip_dists = export_clip_dists;
   ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
   ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);

   assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);

   if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
      pos_written |= 1 << 2;
   if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
      pos_written |= 1 << 3;

   outinfo->pos_exports = util_bitcount(pos_written);

   /* GFX10+ early rasterization:
    * When there are no param exports in an NGG (or legacy VS) shader,
    * RADV sets NO_PC_EXPORT=1, which means the HW will start clipping and rasterization
    * as soon as it encounters a DONE pos export. When this happens, PS waves can launch
    * before the NGG (or VS) waves finish.
    */
   ctx->program->early_rast = ctx->program->chip_class >= GFX10 && outinfo->param_exports == 0;
}
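
/* Note (assumption, not stated in the original source): each set bit of
 * pos_written corresponds to one position export. Bit 0 is the position
 * itself, bit 1 the "misc" vector (point size / layer / viewport index /
 * shading rate), and bits 2-3 the two clip/cull-distance vec4s, so
 * pos_exports is simply the popcount of the enabled exports.
 */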

void
setup_vs_variables(isel_context* ctx, nir_shader* nir)
{
   if (ctx->stage == vertex_vs || ctx->stage == vertex_ngg) {
      radv_vs_output_info* outinfo = &ctx->program->info->vs.outinfo;
      setup_vs_output_info(ctx, nir, outinfo->export_prim_id,
                           ctx->options->key.vs_common_out.export_clip_dists, outinfo);

      /* TODO: NGG streamout */
      if (ctx->stage.hw == HWStage::NGG)
         assert(!ctx->args->shader_info->so.num_outputs);
   }

   if (ctx->stage == vertex_ngg) {
      ctx->program->config->lds_size =
         DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
      assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <
             (32 * 1024));
   }
}

void
setup_gs_variables(isel_context* ctx, nir_shader* nir)
{
   if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
      ctx->program->config->lds_size =
         ctx->program->info->gs_ring_info.lds_size; /* Already in units of the alloc granularity */
   } else if (ctx->stage == vertex_geometry_ngg || ctx->stage == tess_eval_geometry_ngg) {
      radv_vs_output_info* outinfo = &ctx->program->info->vs.outinfo;
      setup_vs_output_info(ctx, nir, false, ctx->options->key.vs_common_out.export_clip_dists,
                           outinfo);

      ctx->program->config->lds_size =
         DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
   }

   if (ctx->stage.has(SWStage::VS))
      ctx->program->info->gs.es_type = MESA_SHADER_VERTEX;
   else if (ctx->stage.has(SWStage::TES))
      ctx->program->info->gs.es_type = MESA_SHADER_TESS_EVAL;
}

void
setup_tcs_info(isel_context* ctx, nir_shader* nir, nir_shader* vs)
{
   ctx->tcs_in_out_eq = ctx->args->shader_info->vs.tcs_in_out_eq;
   ctx->tcs_temp_only_inputs = ctx->args->shader_info->vs.tcs_temp_only_input_mask;
   ctx->tcs_num_patches = ctx->args->shader_info->num_tess_patches;
   ctx->program->config->lds_size = ctx->args->shader_info->tcs.num_lds_blocks;
}

void
setup_tes_variables(isel_context* ctx, nir_shader* nir)
{
   ctx->tcs_num_patches = ctx->args->shader_info->num_tess_patches;

   if (ctx->stage == tess_eval_vs || ctx->stage == tess_eval_ngg) {
      radv_vs_output_info* outinfo = &ctx->program->info->tes.outinfo;
      setup_vs_output_info(ctx, nir, outinfo->export_prim_id,
                           ctx->options->key.vs_common_out.export_clip_dists, outinfo);

      /* TODO: NGG streamout */
      if (ctx->stage.hw == HWStage::NGG)
         assert(!ctx->args->shader_info->so.num_outputs);
   }

   if (ctx->stage == tess_eval_ngg) {
      ctx->program->config->lds_size =
         DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
      assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <
             (32 * 1024));
   }
}

void
setup_variables(isel_context* ctx, nir_shader* nir)
{
   switch (nir->info.stage) {
   case MESA_SHADER_FRAGMENT: {
      break;
   }
   case MESA_SHADER_COMPUTE: {
      ctx->program->config->lds_size =
         DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
      break;
   }
   case MESA_SHADER_VERTEX: {
      setup_vs_variables(ctx, nir);
      break;
   }
   case MESA_SHADER_GEOMETRY: {
      setup_gs_variables(ctx, nir);
      break;
   }
   case MESA_SHADER_TESS_CTRL: {
      break;
   }
   case MESA_SHADER_TESS_EVAL: {
      setup_tes_variables(ctx, nir);
      break;
   }
   default: unreachable("Unhandled shader stage.");
   }

   /* Make sure we fit the available LDS space. */
   assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <=
          ctx->program->dev.lds_limit);
}
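
/* Worked example (illustrative; assumes an LDS encoding granule of 512 bytes,
 * which is what dev.lds_encoding_granule typically is on the chips this
 * backend targets): shared_size = 5000 bytes gives
 * lds_size = DIV_ROUND_UP(5000, 512) = 10 granules, i.e. 5120 bytes of LDS.
 */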

void
setup_nir(isel_context* ctx, nir_shader* nir)
{
   /* the variable setup has to be done before lower_io / CSE */
   setup_variables(ctx, nir);

   nir_convert_to_lcssa(nir, true, false);
   nir_lower_phis_to_scalar(nir, true);

   nir_function_impl* func = nir_shader_get_entrypoint(nir);
   nir_index_ssa_defs(func);
}

} /* end namespace */

void
init_context(isel_context* ctx, nir_shader* shader)
{
   nir_function_impl* impl = nir_shader_get_entrypoint(shader);
   ctx->shader = shader;

   /* Init NIR range analysis. */
   ctx->range_ht = _mesa_pointer_hash_table_create(NULL);
   ctx->ub_config.min_subgroup_size = 64;
   ctx->ub_config.max_subgroup_size = 64;
   if (ctx->shader->info.stage == MESA_SHADER_COMPUTE && ctx->options->key.cs.subgroup_size) {
      ctx->ub_config.min_subgroup_size = ctx->options->key.cs.subgroup_size;
      ctx->ub_config.max_subgroup_size = ctx->options->key.cs.subgroup_size;
   }
   ctx->ub_config.max_workgroup_invocations = 2048;
   ctx->ub_config.max_workgroup_count[0] = 65535;
   ctx->ub_config.max_workgroup_count[1] = 65535;
   ctx->ub_config.max_workgroup_count[2] = 65535;
   ctx->ub_config.max_workgroup_size[0] = 2048;
   ctx->ub_config.max_workgroup_size[1] = 2048;
   ctx->ub_config.max_workgroup_size[2] = 2048;
   for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
      unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[i];
      unsigned dfmt = attrib_format & 0xf;
      unsigned nfmt = (attrib_format >> 4) & 0x7;

      uint32_t max = UINT32_MAX;
      if (nfmt == V_008F0C_BUF_NUM_FORMAT_UNORM) {
         max = 0x3f800000u;
      } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED) {
         bool uscaled = nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED;
         switch (dfmt) {
         case V_008F0C_BUF_DATA_FORMAT_8:
         case V_008F0C_BUF_DATA_FORMAT_8_8:
         case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: max = uscaled ? 0x437f0000u : UINT8_MAX; break;
         case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
         case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: max = uscaled ? 0x447fc000u : 1023; break;
         case V_008F0C_BUF_DATA_FORMAT_10_11_11:
         case V_008F0C_BUF_DATA_FORMAT_11_11_10: max = uscaled ? 0x44ffe000u : 2047; break;
         case V_008F0C_BUF_DATA_FORMAT_16:
         case V_008F0C_BUF_DATA_FORMAT_16_16:
         case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: max = uscaled ? 0x477fff00u : UINT16_MAX; break;
         case V_008F0C_BUF_DATA_FORMAT_32:
         case V_008F0C_BUF_DATA_FORMAT_32_32:
         case V_008F0C_BUF_DATA_FORMAT_32_32_32:
         case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: max = uscaled ? 0x4f800000u : UINT32_MAX; break;
         }
      }
      ctx->ub_config.vertex_attrib_max[i] = max;
   }
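
   /* The uscaled bounds above are IEEE-754 single-precision bit patterns
    * (decoding added for clarity, not in the original source): 0x3f800000 is
    * 1.0f, 0x437f0000 is 255.0f, 0x447fc000 is 1023.0f, 0x44ffe000 is
    * 2047.0f, 0x477fff00 is 65535.0f and 0x4f800000 is 2^32 as a float.
    */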

   nir_divergence_analysis(shader);
   nir_opt_uniform_atomics(shader);

   apply_nuw_to_offsets(ctx, impl);

   /* sanitize control flow */
   sanitize_cf_list(impl, &impl->body);
   nir_metadata_preserve(impl, nir_metadata_none);

   /* we'll need these for isel */
   nir_metadata_require(impl, nir_metadata_block_index);

   if (!ctx->stage.has(SWStage::GSCopy) && ctx->options->dump_preoptir) {
      fprintf(stderr, "NIR shader before instruction selection:\n");
      nir_print_shader(shader, stderr);
   }

   ctx->first_temp_id = ctx->program->peekAllocationId();
   ctx->program->allocateRange(impl->ssa_alloc);
   RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id;

   unsigned spi_ps_inputs = 0;

   std::unique_ptr<unsigned[]> nir_to_aco{new unsigned[impl->num_blocks]()};

   /* TODO: make this recursive to improve compile times */
   bool done = false;
   while (!done) {
      done = true;
      nir_foreach_block (block, impl) {
         nir_foreach_instr (instr, block) {
            switch (instr->type) {
            case nir_instr_type_alu: {
               nir_alu_instr* alu_instr = nir_instr_as_alu(instr);
               RegType type =
                  nir_dest_is_divergent(alu_instr->dest.dest) ? RegType::vgpr : RegType::sgpr;
               switch (alu_instr->op) {
               /* These opcodes only exist as VALU instructions, so their
                * results always have to live in a VGPR. */
               case nir_op_fmul:
               case nir_op_fadd:
               case nir_op_fsub:
               case nir_op_fmax:
               case nir_op_fmin:
               case nir_op_fneg:
               case nir_op_fabs:
               case nir_op_fsat:
               case nir_op_fsign:
               case nir_op_frcp:
               case nir_op_frsq:
               case nir_op_fsqrt:
               case nir_op_fexp2:
               case nir_op_flog2:
               case nir_op_ffract:
               case nir_op_ffloor:
               case nir_op_fceil:
               case nir_op_ftrunc:
               case nir_op_fround_even:
               case nir_op_fsin:
               case nir_op_fcos:
               case nir_op_f2f16:
               case nir_op_f2f16_rtz:
               case nir_op_f2f16_rtne:
               case nir_op_f2f32:
               case nir_op_f2f64:
               case nir_op_u2f16:
               case nir_op_u2f32:
               case nir_op_u2f64:
               case nir_op_i2f16:
               case nir_op_i2f32:
               case nir_op_i2f64:
               case nir_op_pack_half_2x16_split:
               case nir_op_unpack_half_2x16_split_x:
               case nir_op_unpack_half_2x16_split_y:
               case nir_op_fddx:
               case nir_op_fddy:
               case nir_op_fddx_fine:
               case nir_op_fddy_fine:
               case nir_op_fddx_coarse:
               case nir_op_fddy_coarse:
               case nir_op_fquantize2f16:
               case nir_op_ldexp:
               case nir_op_frexp_sig:
               case nir_op_frexp_exp:
               case nir_op_cube_face_index_amd:
               case nir_op_cube_face_coord_amd:
               case nir_op_sad_u8x4: type = RegType::vgpr; break;
               case nir_op_f2i16:
               case nir_op_f2u16:
               case nir_op_f2i32:
               case nir_op_f2u32:
               case nir_op_f2i64:
               case nir_op_f2u64:
               case nir_op_b2i8:
               case nir_op_b2i16:
               case nir_op_b2i32:
               case nir_op_b2i64:
               case nir_op_b2b32:
               case nir_op_b2f16:
               case nir_op_b2f32:
               case nir_op_mov: break;
               case nir_op_iadd:
               case nir_op_isub:
               case nir_op_imul:
               case nir_op_imin:
               case nir_op_imax:
               case nir_op_umin:
               case nir_op_umax:
               case nir_op_ishl:
               case nir_op_ishr:
               case nir_op_ushr:
                  /* packed 16-bit instructions have to be VGPR */
                  type = alu_instr->dest.dest.ssa.num_components == 2 ? RegType::vgpr : type;
                  FALLTHROUGH;
               default:
                  for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) {
                     if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                  }
                  break;
               }

               RegClass rc = get_reg_class(ctx, type, alu_instr->dest.dest.ssa.num_components,
                                           alu_instr->dest.dest.ssa.bit_size);
               regclasses[alu_instr->dest.dest.ssa.index] = rc;
               break;
            }
            case nir_instr_type_load_const: {
               unsigned num_components = nir_instr_as_load_const(instr)->def.num_components;
               unsigned bit_size = nir_instr_as_load_const(instr)->def.bit_size;
               RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
               regclasses[nir_instr_as_load_const(instr)->def.index] = rc;
               break;
            }
            case nir_instr_type_intrinsic: {
               nir_intrinsic_instr* intrinsic = nir_instr_as_intrinsic(instr);
               if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest)
                  break;
               RegType type = RegType::sgpr;
               switch (intrinsic->intrinsic) {
               case nir_intrinsic_load_push_constant:
               case nir_intrinsic_load_workgroup_id:
               case nir_intrinsic_load_num_workgroups:
               case nir_intrinsic_load_subgroup_id:
               case nir_intrinsic_load_num_subgroups:
               case nir_intrinsic_load_first_vertex:
               case nir_intrinsic_load_base_instance:
               case nir_intrinsic_vote_all:
               case nir_intrinsic_vote_any:
               case nir_intrinsic_read_first_invocation:
               case nir_intrinsic_read_invocation:
               case nir_intrinsic_first_invocation:
               case nir_intrinsic_ballot:
               case nir_intrinsic_load_ring_tess_factors_amd:
               case nir_intrinsic_load_ring_tess_factors_offset_amd:
               case nir_intrinsic_load_ring_tess_offchip_amd:
               case nir_intrinsic_load_ring_tess_offchip_offset_amd:
               case nir_intrinsic_load_ring_esgs_amd:
               case nir_intrinsic_load_ring_es2gs_offset_amd:
               case nir_intrinsic_image_deref_samples:
               case nir_intrinsic_has_input_vertex_amd:
               case nir_intrinsic_has_input_primitive_amd:
               case nir_intrinsic_load_workgroup_num_input_vertices_amd:
               case nir_intrinsic_load_workgroup_num_input_primitives_amd:
               case nir_intrinsic_load_shader_query_enabled_amd:
               case nir_intrinsic_load_cull_front_face_enabled_amd:
               case nir_intrinsic_load_cull_back_face_enabled_amd:
               case nir_intrinsic_load_cull_ccw_amd:
               case nir_intrinsic_load_cull_small_primitives_enabled_amd:
               case nir_intrinsic_load_cull_any_enabled_amd:
               case nir_intrinsic_load_viewport_x_scale:
               case nir_intrinsic_load_viewport_y_scale:
               case nir_intrinsic_load_viewport_x_offset:
               case nir_intrinsic_load_viewport_y_offset: type = RegType::sgpr; break;
               case nir_intrinsic_load_sample_id:
               case nir_intrinsic_load_sample_mask_in:
               case nir_intrinsic_load_input:
               case nir_intrinsic_load_output:
               case nir_intrinsic_load_input_vertex:
               case nir_intrinsic_load_per_vertex_input:
               case nir_intrinsic_load_per_vertex_output:
               case nir_intrinsic_load_vertex_id:
               case nir_intrinsic_load_vertex_id_zero_base:
               case nir_intrinsic_load_barycentric_sample:
               case nir_intrinsic_load_barycentric_pixel:
               case nir_intrinsic_load_barycentric_model:
               case nir_intrinsic_load_barycentric_centroid:
               case nir_intrinsic_load_barycentric_at_sample:
               case nir_intrinsic_load_barycentric_at_offset:
               case nir_intrinsic_load_interpolated_input:
               case nir_intrinsic_load_frag_coord:
               case nir_intrinsic_load_frag_shading_rate:
               case nir_intrinsic_load_sample_pos:
               case nir_intrinsic_load_layer_id:
               case nir_intrinsic_load_local_invocation_id:
               case nir_intrinsic_load_local_invocation_index:
               case nir_intrinsic_load_subgroup_invocation:
               case nir_intrinsic_load_tess_coord:
               case nir_intrinsic_write_invocation_amd:
               case nir_intrinsic_mbcnt_amd:
               case nir_intrinsic_byte_permute_amd:
               case nir_intrinsic_lane_permute_16_amd:
               case nir_intrinsic_load_instance_id:
               case nir_intrinsic_ssbo_atomic_add:
               case nir_intrinsic_ssbo_atomic_imin:
               case nir_intrinsic_ssbo_atomic_umin:
               case nir_intrinsic_ssbo_atomic_imax:
               case nir_intrinsic_ssbo_atomic_umax:
               case nir_intrinsic_ssbo_atomic_and:
               case nir_intrinsic_ssbo_atomic_or:
               case nir_intrinsic_ssbo_atomic_xor:
               case nir_intrinsic_ssbo_atomic_exchange:
               case nir_intrinsic_ssbo_atomic_comp_swap:
               case nir_intrinsic_global_atomic_add:
               case nir_intrinsic_global_atomic_imin:
               case nir_intrinsic_global_atomic_umin:
               case nir_intrinsic_global_atomic_imax:
               case nir_intrinsic_global_atomic_umax:
               case nir_intrinsic_global_atomic_and:
               case nir_intrinsic_global_atomic_or:
               case nir_intrinsic_global_atomic_xor:
               case nir_intrinsic_global_atomic_exchange:
               case nir_intrinsic_global_atomic_comp_swap:
               case nir_intrinsic_image_deref_atomic_add:
               case nir_intrinsic_image_deref_atomic_umin:
               case nir_intrinsic_image_deref_atomic_imin:
               case nir_intrinsic_image_deref_atomic_umax:
               case nir_intrinsic_image_deref_atomic_imax:
               case nir_intrinsic_image_deref_atomic_and:
               case nir_intrinsic_image_deref_atomic_or:
               case nir_intrinsic_image_deref_atomic_xor:
               case nir_intrinsic_image_deref_atomic_exchange:
               case nir_intrinsic_image_deref_atomic_comp_swap:
               case nir_intrinsic_image_deref_size:
               case nir_intrinsic_shared_atomic_add:
               case nir_intrinsic_shared_atomic_imin:
               case nir_intrinsic_shared_atomic_umin:
               case nir_intrinsic_shared_atomic_imax:
               case nir_intrinsic_shared_atomic_umax:
               case nir_intrinsic_shared_atomic_and:
               case nir_intrinsic_shared_atomic_or:
               case nir_intrinsic_shared_atomic_xor:
               case nir_intrinsic_shared_atomic_exchange:
               case nir_intrinsic_shared_atomic_comp_swap:
               case nir_intrinsic_shared_atomic_fadd:
               case nir_intrinsic_load_scratch:
               case nir_intrinsic_load_invocation_id:
               case nir_intrinsic_load_primitive_id:
               case nir_intrinsic_load_buffer_amd:
               case nir_intrinsic_load_tess_rel_patch_id_amd:
               case nir_intrinsic_load_gs_vertex_offset_amd:
               case nir_intrinsic_load_initial_edgeflag_amd:
               case nir_intrinsic_load_packed_passthrough_primitive_amd:
               case nir_intrinsic_gds_atomic_add_amd:
               case nir_intrinsic_load_sbt_amd:
               case nir_intrinsic_bvh64_intersect_ray_amd:
               case nir_intrinsic_load_cull_small_prim_precision_amd: type = RegType::vgpr; break;
               case nir_intrinsic_load_shared:
                  /* When the result of these loads is only used by cross-lane instructions,
                   * it is beneficial to use a VGPR destination, because that allows the
                   * s_waitcnt to be placed further down, which decreases latency.
                   */
                  if (only_used_by_cross_lane_instrs(&intrinsic->dest.ssa)) {
                     type = RegType::vgpr;
                     break;
                  }
                  FALLTHROUGH;
               case nir_intrinsic_shuffle:
               case nir_intrinsic_quad_broadcast:
               case nir_intrinsic_quad_swap_horizontal:
               case nir_intrinsic_quad_swap_vertical:
               case nir_intrinsic_quad_swap_diagonal:
               case nir_intrinsic_quad_swizzle_amd:
               case nir_intrinsic_masked_swizzle_amd:
               case nir_intrinsic_inclusive_scan:
               case nir_intrinsic_exclusive_scan:
               case nir_intrinsic_reduce:
               case nir_intrinsic_load_ubo:
               case nir_intrinsic_load_ssbo:
               case nir_intrinsic_load_global:
               case nir_intrinsic_vulkan_resource_index:
               case nir_intrinsic_get_ssbo_size:
                  type = nir_dest_is_divergent(intrinsic->dest) ? RegType::vgpr : RegType::sgpr;
                  break;
               case nir_intrinsic_load_view_index:
                  type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr;
                  break;
               default:
                  for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
                       i++) {
                     if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                  }
                  break;
               }
               RegClass rc = get_reg_class(ctx, type, intrinsic->dest.ssa.num_components,
                                           intrinsic->dest.ssa.bit_size);
               regclasses[intrinsic->dest.ssa.index] = rc;

               switch (intrinsic->intrinsic) {
               case nir_intrinsic_load_barycentric_sample:
               case nir_intrinsic_load_barycentric_pixel:
               case nir_intrinsic_load_barycentric_centroid:
               case nir_intrinsic_load_barycentric_at_sample:
               case nir_intrinsic_load_barycentric_at_offset: {
                  glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic);
                  spi_ps_inputs |= get_interp_input(intrinsic->intrinsic, mode);
                  break;
               }
               case nir_intrinsic_load_barycentric_model:
                  spi_ps_inputs |= S_0286CC_PERSP_PULL_MODEL_ENA(1);
                  break;
               case nir_intrinsic_load_front_face:
                  spi_ps_inputs |= S_0286CC_FRONT_FACE_ENA(1);
                  break;
               case nir_intrinsic_load_frag_coord:
               case nir_intrinsic_load_sample_pos: {
                  uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa);
                  for (unsigned i = 0; i < 4; i++) {
                     if (mask & (1 << i))
                        spi_ps_inputs |= S_0286CC_POS_X_FLOAT_ENA(1) << i;
                  }

                  if (ctx->options->adjust_frag_coord_z &&
                      intrinsic->intrinsic == nir_intrinsic_load_frag_coord &&
                      G_0286CC_POS_Z_FLOAT_ENA(spi_ps_inputs)) {
                     /* Enable ancillary for adjusting gl_FragCoord.z for
                      * VRS due to a hw bug on some GFX10.3 chips.
                      */
                     spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1);
                  }
                  break;
               }
               case nir_intrinsic_load_sample_id:
               case nir_intrinsic_load_frag_shading_rate:
                  spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1);
                  break;
               case nir_intrinsic_load_sample_mask_in:
                  spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1);
                  spi_ps_inputs |= S_0286CC_SAMPLE_COVERAGE_ENA(1);
                  break;
               default: break;
               }
               break;
            }
            case nir_instr_type_tex: {
               nir_tex_instr* tex = nir_instr_as_tex(instr);
               RegType type = nir_dest_is_divergent(tex->dest) ? RegType::vgpr : RegType::sgpr;

               if (tex->op == nir_texop_texture_samples) {
                  assert(!tex->dest.ssa.divergent);
               }

               RegClass rc =
                  get_reg_class(ctx, type, tex->dest.ssa.num_components, tex->dest.ssa.bit_size);
               regclasses[tex->dest.ssa.index] = rc;
               break;
            }
            case nir_instr_type_parallel_copy: {
               nir_foreach_parallel_copy_entry (entry, nir_instr_as_parallel_copy(instr)) {
                  regclasses[entry->dest.ssa.index] = regclasses[entry->src.ssa->index];
               }
               break;
            }
            case nir_instr_type_ssa_undef: {
               unsigned num_components = nir_instr_as_ssa_undef(instr)->def.num_components;
               unsigned bit_size = nir_instr_as_ssa_undef(instr)->def.bit_size;
               RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
               regclasses[nir_instr_as_ssa_undef(instr)->def.index] = rc;
               break;
            }
            case nir_instr_type_phi: {
               nir_phi_instr* phi = nir_instr_as_phi(instr);
               RegType type = RegType::sgpr;
               unsigned num_components = phi->dest.ssa.num_components;
               assert((phi->dest.ssa.bit_size != 1 || num_components == 1) &&
                      "Multiple components not supported on boolean phis.");

               if (nir_dest_is_divergent(phi->dest)) {
                  type = RegType::vgpr;
               } else {
                  nir_foreach_phi_src (src, phi) {
                     if (regclasses[src->src.ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                  }
               }

               RegClass rc = get_reg_class(ctx, type, num_components, phi->dest.ssa.bit_size);
               if (rc != regclasses[phi->dest.ssa.index])
                  done = false;
               regclasses[phi->dest.ssa.index] = rc;
               break;
            }
            default: break;
            }
         }
      }
   }
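
   /* Note on the loop above (explanation added, not in the original source):
    * a phi can be visited before some of its sources. If a source is later
    * assigned a VGPR class, the phi's class changes on the next pass, so the
    * while (!done) loop reruns until the register classes reach a fixed point.
    */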

   if (G_0286CC_POS_W_FLOAT_ENA(spi_ps_inputs)) {
      /* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */
      spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1);
   }

   if (!(spi_ps_inputs & 0x7F)) {
      /* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */
      spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1);
   }

   ctx->program->config->spi_ps_input_ena = spi_ps_inputs;
   ctx->program->config->spi_ps_input_addr = spi_ps_inputs;

   ctx->cf_info.nir_to_aco.reset(nir_to_aco.release());

   /* align and copy constant data */
   while (ctx->program->constant_data.size() % 4u)
      ctx->program->constant_data.push_back(0);
   ctx->constant_data_offset = ctx->program->constant_data.size();
   ctx->program->constant_data.insert(ctx->program->constant_data.end(),
                                      (uint8_t*)shader->constant_data,
                                      (uint8_t*)shader->constant_data + shader->constant_data_size);
}

void
cleanup_context(isel_context* ctx)
{
   _mesa_hash_table_destroy(ctx->range_ht, NULL);
}

isel_context
setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                   ac_shader_config* config, struct radv_shader_args* args, bool is_gs_copy_shader)
{
   SWStage sw_stage = SWStage::None;
   for (unsigned i = 0; i < shader_count; i++) {
      switch (shaders[i]->info.stage) {
      case MESA_SHADER_VERTEX: sw_stage = sw_stage | SWStage::VS; break;
      case MESA_SHADER_TESS_CTRL: sw_stage = sw_stage | SWStage::TCS; break;
      case MESA_SHADER_TESS_EVAL: sw_stage = sw_stage | SWStage::TES; break;
      case MESA_SHADER_GEOMETRY:
         sw_stage = sw_stage | (is_gs_copy_shader ? SWStage::GSCopy : SWStage::GS);
         break;
      case MESA_SHADER_FRAGMENT: sw_stage = sw_stage | SWStage::FS; break;
      case MESA_SHADER_COMPUTE: sw_stage = sw_stage | SWStage::CS; break;
      default: unreachable("Shader stage not implemented");
      }
   }
   bool gfx9_plus = args->options->chip_class >= GFX9;
   bool ngg = args->shader_info->is_ngg && args->options->chip_class >= GFX10;
   HWStage hw_stage{};
   if (sw_stage == SWStage::VS && args->shader_info->vs.as_es && !ngg)
      hw_stage = HWStage::ES;
   else if (sw_stage == SWStage::VS && !args->shader_info->vs.as_ls && !ngg)
      hw_stage = HWStage::VS;
   else if (sw_stage == SWStage::VS && ngg)
      hw_stage = HWStage::NGG; /* GFX10/NGG: VS without GS uses the HW GS stage */
   else if (sw_stage == SWStage::GS)
      hw_stage = HWStage::GS;
   else if (sw_stage == SWStage::FS)
      hw_stage = HWStage::FS;
   else if (sw_stage == SWStage::CS)
      hw_stage = HWStage::CS;
   else if (sw_stage == SWStage::GSCopy)
      hw_stage = HWStage::VS;
   else if (sw_stage == SWStage::VS_GS && gfx9_plus && !ngg)
      hw_stage = HWStage::GS; /* GFX6-9: VS+GS merged into a GS (and GFX10/legacy) */
   else if (sw_stage == SWStage::VS_GS && ngg)
      hw_stage = HWStage::NGG; /* GFX10+: VS+GS merged into an NGG GS */
   else if (sw_stage == SWStage::VS && args->shader_info->vs.as_ls)
      hw_stage = HWStage::LS; /* GFX6-8: VS is a Local Shader, when tessellation is used */
   else if (sw_stage == SWStage::TCS)
      hw_stage = HWStage::HS; /* GFX6-8: TCS is a Hull Shader */
   else if (sw_stage == SWStage::VS_TCS)
      hw_stage = HWStage::HS; /* GFX9-10: VS+TCS merged into a Hull Shader */
   else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && !ngg)
      hw_stage = HWStage::VS; /* GFX6-9: TES without GS uses the HW VS stage (and GFX10/legacy) */
   else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && ngg)
      hw_stage = HWStage::NGG; /* GFX10/NGG: TES without GS */
   else if (sw_stage == SWStage::TES && args->shader_info->tes.as_es && !ngg)
      hw_stage = HWStage::ES; /* GFX6-8: TES is an Export Shader */
   else if (sw_stage == SWStage::TES_GS && gfx9_plus && !ngg)
      hw_stage = HWStage::GS; /* GFX9: TES+GS merged into a GS (and GFX10/legacy) */
   else if (sw_stage == SWStage::TES_GS && ngg)
      hw_stage = HWStage::NGG; /* GFX10+: TES+GS merged into an NGG GS */
   else
      unreachable("Shader stage not implemented");

   init_program(program, Stage{hw_stage, sw_stage}, args->shader_info, args->options->chip_class,
                args->options->family, args->options->wgp_mode, config);

   isel_context ctx = {};
   ctx.program = program;
   ctx.args = args;
   ctx.options = args->options;
   ctx.stage = program->stage;
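
   /* Background for the branches below (assumption, not stated in the
    * original source): program->workgroup_size is the maximum number of
    * invocations that share LDS and barriers; for example, later passes can
    * omit s_barrier when the whole workgroup fits into a single wave.
    */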
   /* TODO: Check if we need to adjust min_waves for unknown workgroup sizes. */
   if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS) {
      /* PS and legacy VS have separate waves, no workgroups */
      program->workgroup_size = program->wave_size;
   } else if (program->stage == compute_cs) {
      /* CS sets the workgroup size explicitly */
      program->workgroup_size = shaders[0]->info.workgroup_size[0] *
                                shaders[0]->info.workgroup_size[1] *
                                shaders[0]->info.workgroup_size[2];
   } else if (program->stage.hw == HWStage::ES || program->stage == geometry_gs) {
      /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8
       * (not implemented in Mesa) */
      program->workgroup_size = program->wave_size;
   } else if (program->stage.hw == HWStage::GS) {
      /* If on-chip GS (LDS rings) are enabled on GFX9 or later, merged GS operates in workgroups */
      assert(program->chip_class >= GFX9);
      uint32_t es_verts_per_subgrp =
         G_028A44_ES_VERTS_PER_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl);
      uint32_t gs_instr_prims_in_subgrp =
         G_028A44_GS_INST_PRIMS_IN_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl);
      uint32_t workgroup_size = MAX2(es_verts_per_subgrp, gs_instr_prims_in_subgrp);
      program->workgroup_size = MAX2(MIN2(workgroup_size, 256), 1);
   } else if (program->stage == vertex_ls) {
      /* Unmerged LS operates in workgroups */
      program->workgroup_size = UINT_MAX; /* TODO: probably tcs_num_patches * tcs_vertices_in, but
                                             those are not plumbed to ACO for LS */
   } else if (program->stage == tess_control_hs) {
      /* Unmerged HS operates in workgroups, size is determined by the output vertices */
      setup_tcs_info(&ctx, shaders[0], NULL);
      program->workgroup_size = ctx.tcs_num_patches * shaders[0]->info.tess.tcs_vertices_out;
   } else if (program->stage == vertex_tess_control_hs) {
      /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS
       * invocations */
      setup_tcs_info(&ctx, shaders[1], shaders[0]);
      program->workgroup_size =
         ctx.tcs_num_patches *
         MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices);
   } else if (program->stage.hw == HWStage::NGG) {
      gfx10_ngg_info& ngg_info = args->shader_info->ngg_info;
      unsigned num_gs_invocations =
         (program->stage.has(SWStage::GS)) ? MAX2(shaders[1]->info.gs.invocations, 1) : 1;

      /* Max ES (SW VS/TES) threads */
      uint32_t max_esverts = ngg_info.hw_max_esverts;
      /* Max GS input primitives = max GS threads */
      uint32_t max_gs_input_prims = ngg_info.max_gsprims * num_gs_invocations;
      /* Maximum output vertices -- each thread can export only 1 vertex */
      uint32_t max_out_vtx = ngg_info.max_out_verts;
      /* Maximum output primitives -- each thread can export only 1 or 0 primitive */
      uint32_t max_out_prm = ngg_info.max_gsprims * num_gs_invocations * ngg_info.prim_amp_factor;

      program->workgroup_size = MAX4(max_esverts, max_gs_input_prims, max_out_vtx, max_out_prm);
   } else {
      unreachable("Unsupported shader stage.");
   }
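
   /* Worked example for the NGG case above (illustrative values, not from
    * the original source): with hw_max_esverts = 128, max_gsprims = 128, one
    * GS invocation, max_out_verts = 256 and prim_amp_factor = 2, this yields
    * MAX4(128, 128, 256, 256) = 256 invocations per workgroup.
    */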

   calc_min_waves(program);

   unsigned scratch_size = 0;
   if (program->stage == gs_copy_vs) {
      assert(shader_count == 1);
      setup_vs_output_info(&ctx, shaders[0], false, true, &args->shader_info->vs.outinfo);
   } else {
      for (unsigned i = 0; i < shader_count; i++) {
         nir_shader* nir = shaders[i];
         setup_nir(&ctx, nir);
      }

      for (unsigned i = 0; i < shader_count; i++)
         scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
   }

   ctx.program->config->scratch_bytes_per_wave = align(scratch_size * ctx.program->wave_size, 1024);

   ctx.block = ctx.program->create_and_insert_block();
   ctx.block->kind = block_kind_top_level;

   return ctx;
}

} // namespace aco