Path: blob/21.2-virgl/src/freedreno/ir3/ir3_nir.c
/*
 * Copyright (C) 2015 Rob Clark <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include "util/debug.h"
#include "util/u_math.h"

#include "ir3_compiler.h"
#include "ir3_nir.h"
#include "ir3_shader.h"

static const nir_shader_compiler_options options = {
   .lower_fpow = true,
   .lower_scmp = true,
   .lower_flrp16 = true,
   .lower_flrp32 = true,
   .lower_flrp64 = true,
   .lower_ffract = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_isign = true,
   .lower_ldexp = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_mul_high = true,
   .lower_mul_2x32_64 = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .vertex_id_zero_based = true,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_helper_invocation = true,
   .lower_bitfield_insert_to_shifts = true,
   .lower_bitfield_extract_to_shifts = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_unorm_2x16 = true,
   .lower_unpack_half_2x16 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_unpack_snorm_2x16 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_unorm_2x16 = true,
   .lower_pack_split = true,
   .use_interpolated_input_intrinsics = true,
   .lower_rotate = true,
   .lower_to_scalar = true,
   .has_imul24 = true,
   .has_fsub = true,
   .has_isub = true,
   .lower_wpos_pntc = true,
   .lower_cs_local_index_from_id = true,

   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
    * but that should be harmless for GL since 64b is not
    * supported there.
    */
   .lower_int64_options = (nir_lower_int64_options)~0,
   .lower_uniforms_to_ubo = true,
   .use_scoped_barrier = true,
};

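/* Relative to the table above, the a6xx options additionally enable
 * vectorize_io and lower_device_index_to_zero, raise max_unroll_iterations
 * to 32, and leave vertex_id_zero_based off; ir3_get_compiler_options()
 * below picks this table for gpu_id >= 600.
 */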
/* we don't want to lower vertex_id to _zero_based on newer gpus: */
static const nir_shader_compiler_options options_a6xx = {
   .lower_fpow = true,
   .lower_scmp = true,
   .lower_flrp16 = true,
   .lower_flrp32 = true,
   .lower_flrp64 = true,
   .lower_ffract = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_isign = true,
   .lower_ldexp = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_mul_high = true,
   .lower_mul_2x32_64 = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .vertex_id_zero_based = false,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_helper_invocation = true,
   .lower_bitfield_insert_to_shifts = true,
   .lower_bitfield_extract_to_shifts = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_unorm_2x16 = true,
   .lower_unpack_half_2x16 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_unpack_snorm_2x16 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_unorm_2x16 = true,
   .lower_pack_split = true,
   .use_interpolated_input_intrinsics = true,
   .lower_rotate = true,
   .vectorize_io = true,
   .lower_to_scalar = true,
   .has_imul24 = true,
   .has_fsub = true,
   .has_isub = true,
   .max_unroll_iterations = 32,
   .lower_wpos_pntc = true,
   .lower_cs_local_index_from_id = true,

   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
    * but that should be harmless for GL since 64b is not
    * supported there.
    */
   .lower_int64_options = (nir_lower_int64_options)~0,
   .lower_uniforms_to_ubo = true,
   .lower_device_index_to_zero = true,
   .use_scoped_barrier = true,
};

const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler *compiler)
{
   if (compiler->gpu_id >= 600)
      return &options_a6xx;
   return &options;
}

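/* Callback for nir_opt_load_store_vectorize() used in the opt loop below.
 * Roughly: only vectorize 32-bit UBO loads, and only when the combined load
 * can never straddle a vec4 slot.  E.g. with align_mul=8 and align_offset=4,
 * a vec2 (8 bytes) could start as late as byte 16 - 8 + 4 = 12 of a 16-byte
 * slot and would spill past it (12 + 8 > 16), so it is rejected; with
 * align_mul=16 and align_offset=0 a full vec4 still fits exactly.
 */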
static bool
ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                             unsigned bit_size, unsigned num_components,
                             nir_intrinsic_instr *low,
                             nir_intrinsic_instr *high, void *data)
{
   assert(bit_size >= 8);
   if (bit_size != 32)
      return false;
   unsigned byte_size = bit_size / 8;

   int size = num_components * byte_size;

   /* Don't care about alignment past vec4. */
   assert(util_is_power_of_two_nonzero(align_mul));
   align_mul = MIN2(align_mul, 16);
   align_offset &= 15;

   /* Our offset alignment should always be at least 4 bytes */
   if (align_mul < 4)
      return false;

   unsigned worst_start_offset = 16 - align_mul + align_offset;
   if (worst_start_offset + size > 16)
      return false;

   return true;
}

#define OPT(nir, pass, ...)                                                    \
   ({                                                                          \
      bool this_progress = false;                                              \
      NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);                       \
      this_progress;                                                           \
   })

#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)

void
ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s)
{
   bool progress;
   unsigned lower_flrp = (s->options->lower_flrp16 ? 16 : 0) |
                         (s->options->lower_flrp32 ? 32 : 0) |
                         (s->options->lower_flrp64 ? 64 : 0);

   do {
      progress = false;

      OPT_V(s, nir_lower_vars_to_ssa);
      progress |= OPT(s, nir_opt_copy_prop_vars);
      progress |= OPT(s, nir_opt_dead_write_vars);
      progress |= OPT(s, nir_lower_alu_to_scalar, NULL, NULL);
      progress |= OPT(s, nir_lower_phis_to_scalar, false);

      progress |= OPT(s, nir_copy_prop);
      progress |= OPT(s, nir_opt_dce);
      progress |= OPT(s, nir_opt_cse);
      static int gcm = -1;
      if (gcm == -1)
         gcm = env_var_as_unsigned("GCM", 0);
      if (gcm == 1)
         progress |= OPT(s, nir_opt_gcm, true);
      else if (gcm == 2)
         progress |= OPT(s, nir_opt_gcm, false);
      progress |= OPT(s, nir_opt_peephole_select, 16, true, true);
      progress |= OPT(s, nir_opt_intrinsics);
      /* NOTE: GS lowering inserts an output var with varying slot that
       * is larger than VARYING_SLOT_MAX (ie. GS_VERTEX_FLAGS_IR3),
       * which triggers asserts in nir_shader_gather_info().  To work
       * around that skip lowering phi precision for GS.
       *
       * Calling nir_shader_gather_info() late also seems to cause
       * problems for tess lowering, for now since we only enable
       * fp16/int16 for frag and compute, skip phi precision lowering
       * for other stages.
       */
      if ((s->info.stage == MESA_SHADER_FRAGMENT) ||
          (s->info.stage == MESA_SHADER_COMPUTE)) {
         progress |= OPT(s, nir_opt_phi_precision);
      }
      progress |= OPT(s, nir_opt_algebraic);
      progress |= OPT(s, nir_lower_alu);
      progress |= OPT(s, nir_lower_pack);
      progress |= OPT(s, nir_opt_constant_folding);

      nir_load_store_vectorize_options vectorize_opts = {
         .modes = nir_var_mem_ubo,
         .callback = ir3_nir_should_vectorize_mem,
         .robust_modes = compiler->robust_ubo_access ? nir_var_mem_ubo : 0,
      };
      progress |= OPT(s, nir_opt_load_store_vectorize, &vectorize_opts);

      if (lower_flrp != 0) {
         if (OPT(s, nir_lower_flrp, lower_flrp, false /* always_precise */)) {
            OPT(s, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         lower_flrp = 0;
      }

      progress |= OPT(s, nir_opt_dead_cf);
      if (OPT(s, nir_opt_trivial_continues)) {
         progress |= true;
         /* If nir_opt_trivial_continues makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
         OPT(s, nir_copy_prop);
         OPT(s, nir_opt_dce);
      }
      progress |= OPT(s, nir_opt_if, false);
      progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
      progress |= OPT(s, nir_opt_remove_phis);
      progress |= OPT(s, nir_opt_undef);
   } while (progress);
}

static bool
should_split_wrmask(const nir_instr *instr, const void *data)
{
   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   switch (intr->intrinsic) {
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_store_shared:
   case nir_intrinsic_store_global:
   case nir_intrinsic_store_scratch:
      return true;
   default:
      return false;
   }
}

void
ir3_nir_lower_io_to_temporaries(nir_shader *s)
{
   /* Outputs consumed by the VPC, VS inputs, and FS outputs are all handled
    * by the hardware pre-loading registers at the beginning and then reading
    * them at the end, so we can't access them indirectly except through
    * normal register-indirect accesses, and therefore ir3 doesn't support
    * indirect accesses on those. Other i/o is lowered in ir3_nir_lower_tess,
    * and indirects work just fine for those. GS outputs may be consumed by
    * VPC, but have their own lowering in ir3_nir_lower_gs() which does
    * something similar to nir_lower_io_to_temporaries so we shouldn't need
    * to lower them.
    *
    * Note: this might be a little inefficient for VS or TES outputs when
    * the next stage isn't an FS, but it probably doesn't make sense to
    * depend on the next stage before variant creation.
    *
    * TODO: for gallium, mesa/st also does some redundant lowering, including
    * running this pass for GS inputs/outputs which we don't want but not
    * including TES outputs or FS inputs which we do need.  We should probably
    * stop doing that once we're sure all drivers are doing their own
    * indirect i/o lowering.
    */
   bool lower_input = s->info.stage == MESA_SHADER_VERTEX ||
                      s->info.stage == MESA_SHADER_FRAGMENT;
   bool lower_output = s->info.stage != MESA_SHADER_TESS_CTRL &&
                       s->info.stage != MESA_SHADER_GEOMETRY;
   if (lower_input || lower_output) {
      NIR_PASS_V(s, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(s),
                 lower_output, lower_input);

      /* nir_lower_io_to_temporaries() creates global variables and copy
       * instructions which need to be cleaned up.
       */
      NIR_PASS_V(s, nir_split_var_copies);
      NIR_PASS_V(s, nir_lower_var_copies);
      NIR_PASS_V(s, nir_lower_global_vars_to_local);
   }

   /* Regardless of the above, we need to lower indirect references to
    * compact variables such as clip/cull distances because due to how
    * TCS<->TES IO works we cannot handle indirect accesses that "straddle"
    * vec4 components. nir_lower_indirect_derefs has a special case for
    * compact variables, so it will actually lower them even though we pass
    * in 0 modes.
    *
    * Using temporaries would be slightly better but
    * nir_lower_io_to_temporaries currently doesn't support TCS i/o.
    */
   NIR_PASS_V(s, nir_lower_indirect_derefs, 0, UINT32_MAX);
}

void
ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s)
{
   struct nir_lower_tex_options tex_options = {
      .lower_rect = 0,
      .lower_tg4_offsets = true,
   };

   if (compiler->gpu_id >= 400) {
      /* a4xx seems to have *no* sam.p */
      tex_options.lower_txp = ~0; /* lower all txp */
   } else {
      /* a3xx just needs to avoid sam.p for 3d tex */
      tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
   }

   if (ir3_shader_debug & IR3_DBG_DISASM) {
      mesa_logi("----------------------");
      nir_log_shaderi(s);
      mesa_logi("----------------------");
   }

   if (s->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(s, ir3_nir_lower_gs);

   NIR_PASS_V(s, nir_lower_amul, ir3_glsl_type_size);

   OPT_V(s, nir_lower_regs_to_ssa);
   OPT_V(s, nir_lower_wrmasks, should_split_wrmask, s);

   OPT_V(s, nir_lower_tex, &tex_options);
   OPT_V(s, nir_lower_load_const_to_scalar);
   if (compiler->gpu_id < 500)
      OPT_V(s, ir3_nir_lower_tg4_to_tex);

   ir3_optimize_loop(compiler, s);

   /* do idiv lowering after first opt loop to get a chance to propagate
    * constants for divide by immed power-of-two:
    */
   nir_lower_idiv_options idiv_options = {
      .imprecise_32bit_lowering = true,
      .allow_fp16 = true,
   };
   const bool idiv_progress = OPT(s, nir_lower_idiv, &idiv_options);

   if (idiv_progress)
      ir3_optimize_loop(compiler, s);

   OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);

   if (ir3_shader_debug & IR3_DBG_DISASM) {
      mesa_logi("----------------------");
      nir_log_shaderi(s);
      mesa_logi("----------------------");
   }

   /* st_program.c's parameter list optimization requires that future nir
    * variants don't reallocate the uniform storage, so we have to remove
    * uniforms that occupy storage.  But we don't want to remove samplers,
    * because they're needed for YUV variant lowering.
    */
   nir_foreach_uniform_variable_safe (var, s) {
      if (var->data.mode == nir_var_uniform &&
          (glsl_type_get_image_count(var->type) ||
           glsl_type_get_sampler_count(var->type)))
         continue;

      exec_node_remove(&var->node);
   }
   nir_validate_shader(s, "after uniform var removal");

   nir_sweep(s);
}

static bool
lower_subgroup_id_filter(const nir_instr *instr, const void *unused)
{
   (void)unused;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   return intr->intrinsic == nir_intrinsic_load_subgroup_invocation ||
          intr->intrinsic == nir_intrinsic_load_subgroup_id ||
          intr->intrinsic == nir_intrinsic_load_num_subgroups;
}

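/* Lowerings for the subgroup intrinsics matched above, expressed in terms of
 * local_invocation_index and the ir3 subgroup_id_shift driver param.  Since
 * the subgroup size is a power of two (1 << shift), this is just masking and
 * shifting, e.g. with a subgroup size of 64 (shift 6), invocation index 70
 * gives subgroup_invocation = 70 & 63 = 6 and subgroup_id = 70 >> 6 = 1;
 * num_subgroups is effectively DIV_ROUND_UP(workgroup size, subgroup size).
 */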
static nir_ssa_def *
lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused)
{
   (void)instr;
   (void)unused;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
      return nir_iand(
         b, nir_load_local_invocation_index(b),
         nir_isub(b, nir_load_subgroup_size(b), nir_imm_int(b, 1)));
   } else if (intr->intrinsic == nir_intrinsic_load_subgroup_id) {
      return nir_ishr(b, nir_load_local_invocation_index(b),
                      nir_load_subgroup_id_shift_ir3(b));
   } else {
      assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
      /* If the workgroup size is constant,
       * nir_lower_compute_system_values() will replace local_size with a
       * constant so this can mostly be constant folded away.
       */
      nir_ssa_def *local_size = nir_load_workgroup_size(b);
      nir_ssa_def *size =
         nir_imul24(b, nir_channel(b, local_size, 0),
                    nir_imul24(b, nir_channel(b, local_size, 1),
                               nir_channel(b, local_size, 2)));
      nir_ssa_def *one = nir_imm_int(b, 1);
      return nir_iadd(b, one,
                      nir_ishr(b, nir_isub(b, size, one),
                               nir_load_subgroup_id_shift_ir3(b)));
   }
}

static bool
ir3_nir_lower_subgroup_id_cs(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader, lower_subgroup_id_filter,
                                        lower_subgroup_id, NULL);
}

/**
 * Late passes that need to be done after pscreen->finalize_nir()
 */
void
ir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s)
{
   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
              ir3_glsl_type_size, (nir_lower_io_options)0);

   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      /* NOTE: lower load_barycentric_at_sample first, since it
       * produces load_barycentric_at_offset:
       */
      NIR_PASS_V(s, ir3_nir_lower_load_barycentric_at_sample);
      NIR_PASS_V(s, ir3_nir_lower_load_barycentric_at_offset);
      NIR_PASS_V(s, ir3_nir_move_varying_inputs);
      NIR_PASS_V(s, nir_lower_fb_read);
   }

   if (compiler->gpu_id >= 600 && s->info.stage == MESA_SHADER_FRAGMENT &&
       !(ir3_shader_debug & IR3_DBG_NOFP16)) {
      NIR_PASS_V(s, nir_lower_mediump_io, nir_var_shader_out, 0, false);
   }

   if (s->info.stage == MESA_SHADER_COMPUTE) {
      bool progress = false;
      NIR_PASS(progress, s, nir_lower_subgroups,
               &(nir_lower_subgroups_options){
                  .subgroup_size = 128,
                  .ballot_bit_size = 32,
                  .ballot_components = 4,
                  .lower_to_scalar = true,
                  .lower_vote_eq = true,
                  .lower_subgroup_masks = true,
                  .lower_read_invocation_to_cond = true,
               });

      progress = false;
      NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs);

      /* ir3_nir_lower_subgroup_id_cs creates extra compute intrinsics which
       * we need to lower again.
       */
      if (progress)
         NIR_PASS_V(s, nir_lower_compute_system_values, NULL);
   }

   /* we cannot ensure that ir3_finalize_nir() is only called once, so
    * we also need to do trig workarounds here:
    */
   OPT_V(s, ir3_nir_apply_trig_workarounds);

   ir3_optimize_loop(compiler, s);
}

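/* When the shader key says the gl_Layer / gl_ViewportIndex values reaching
 * the FS are known to be zero (layer_zero / view_zero), rewrite loads of
 * those inputs to a constant 0 so the varyings don't need to be read at all.
 */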
static bool
ir3_nir_lower_view_layer_id(nir_shader *nir, bool layer_zero, bool view_zero)
{
   unsigned layer_id_loc = ~0, view_id_loc = ~0;
   nir_foreach_shader_in_variable (var, nir) {
      if (var->data.location == VARYING_SLOT_LAYER)
         layer_id_loc = var->data.driver_location;
      if (var->data.location == VARYING_SLOT_VIEWPORT)
         view_id_loc = var->data.driver_location;
   }

   assert(!layer_zero || layer_id_loc != ~0);
   assert(!view_zero || view_id_loc != ~0);

   bool progress = false;
   nir_builder b;

   nir_foreach_function (func, nir) {
      nir_builder_init(&b, func->impl);

      nir_foreach_block (block, func->impl) {
         nir_foreach_instr_safe (instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

            if (intrin->intrinsic != nir_intrinsic_load_input)
               continue;

            unsigned base = nir_intrinsic_base(intrin);
            if (base != layer_id_loc && base != view_id_loc)
               continue;

            b.cursor = nir_before_instr(&intrin->instr);
            nir_ssa_def *zero = nir_imm_int(&b, 0);
            nir_ssa_def_rewrite_uses(&intrin->dest.ssa, zero);
            nir_instr_remove(&intrin->instr);
            progress = true;
         }
      }

      if (progress) {
         nir_metadata_preserve(
            func->impl, nir_metadata_block_index | nir_metadata_dominance);
      } else {
         nir_metadata_preserve(func->impl, nir_metadata_all);
      }
   }

   return progress;
}

void
ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
{
   if (ir3_shader_debug & IR3_DBG_DISASM) {
      mesa_logi("----------------------");
      nir_log_shaderi(s);
      mesa_logi("----------------------");
   }

   bool progress = false;

   if (so->key.has_gs || so->key.tessellation) {
      switch (so->shader->type) {
      case MESA_SHADER_VERTEX:
         NIR_PASS_V(s, ir3_nir_lower_to_explicit_output, so,
                    so->key.tessellation);
         progress = true;
         break;
      case MESA_SHADER_TESS_CTRL:
         NIR_PASS_V(s, ir3_nir_lower_tess_ctrl, so, so->key.tessellation);
         NIR_PASS_V(s, ir3_nir_lower_to_explicit_input, so);
         progress = true;
         break;
      case MESA_SHADER_TESS_EVAL:
         NIR_PASS_V(s, ir3_nir_lower_tess_eval, so, so->key.tessellation);
         if (so->key.has_gs)
            NIR_PASS_V(s, ir3_nir_lower_to_explicit_output, so,
                       so->key.tessellation);
         progress = true;
         break;
      case MESA_SHADER_GEOMETRY:
         NIR_PASS_V(s, ir3_nir_lower_to_explicit_input, so);
         progress = true;
         break;
      default:
         break;
      }
   }

   if (s->info.stage == MESA_SHADER_VERTEX) {
      if (so->key.ucp_enables)
         progress |=
            OPT(s, nir_lower_clip_vs, so->key.ucp_enables, false, false, NULL);
   } else if (s->info.stage == MESA_SHADER_FRAGMENT) {
      bool layer_zero =
         so->key.layer_zero && (s->info.inputs_read & VARYING_BIT_LAYER);
      bool view_zero =
         so->key.view_zero && (s->info.inputs_read & VARYING_BIT_VIEWPORT);

      if (so->key.ucp_enables && !so->shader->compiler->has_clip_cull)
         progress |= OPT(s, nir_lower_clip_fs, so->key.ucp_enables, false);
      if (layer_zero || view_zero)
         progress |= OPT(s, ir3_nir_lower_view_layer_id, layer_zero, view_zero);
   }

   /* Move large constant variables to the constants attached to the NIR
    * shader, which we will upload in the immediates range.  This generates
    * amuls, so we need to clean those up after.
    *
    * Passing no size_align, we would get packed values, which if we end up
    * having to load with LDC would result in extra reads to unpack from
    * straddling loads. Align everything to vec4 to avoid that, though we
    * could theoretically do better.
    */
   OPT_V(s, nir_opt_large_constants, glsl_get_vec4_size_align_bytes,
         32 /* bytes */);
   OPT_V(s, ir3_nir_lower_load_constant, so);

   if (!so->binning_pass)
      OPT_V(s, ir3_nir_analyze_ubo_ranges, so);

   progress |= OPT(s, ir3_nir_lower_ubo_loads, so);

   /* Lower large temporaries to scratch, which in Qualcomm terms is private
    * memory, to avoid excess register pressure. This should happen after
    * nir_opt_large_constants, because loading from a UBO is much, much less
    * expensive.
    */
   if (so->shader->compiler->has_pvtmem) {
      progress |= OPT(s, nir_lower_vars_to_scratch, nir_var_function_temp,
                      16 * 16 /* bytes */, glsl_get_natural_size_align_bytes);
   }

   /* Lower scratch writemasks */
   progress |= OPT(s, nir_lower_wrmasks, should_split_wrmask, s);

   OPT_V(s, nir_lower_amul, ir3_glsl_type_size);

   /* UBO offset lowering has to come after we've decided what will
    * be left as load_ubo
    */
   if (so->shader->compiler->gpu_id >= 600)
      progress |= OPT(s, nir_lower_ubo_vec4);

   OPT_V(s, ir3_nir_lower_io_offsets, so->shader->compiler->gpu_id);

   if (progress)
      ir3_optimize_loop(so->shader->compiler, s);

   /* Fixup indirect load_uniform's which end up with a const base offset
    * which is too large to encode.  Do this late(ish) so we actually
    * can differentiate indirect vs non-indirect.
    */
   if (OPT(s, ir3_nir_fixup_load_uniform))
      ir3_optimize_loop(so->shader->compiler, s);

   /* Do late algebraic optimization to turn add(a, neg(b)) back into
    * subs, then the mandatory cleanup after algebraic.  Note that it may
    * produce fnegs, and if so then we need to keep running to squash
    * fneg(fneg(a)).
    */
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = OPT(s, nir_opt_algebraic_late);
      OPT_V(s, nir_opt_constant_folding);
      OPT_V(s, nir_copy_prop);
      OPT_V(s, nir_opt_dce);
      OPT_V(s, nir_opt_cse);
   }

   OPT_V(s, nir_opt_sink, nir_move_const_undef);

   if (ir3_shader_debug & IR3_DBG_DISASM) {
      mesa_logi("----------------------");
      nir_log_shaderi(s);
      mesa_logi("----------------------");
   }

   nir_sweep(s);

   /* Binning pass variants re-use the const_state of the corresponding
    * draw pass shader, so that same const emit can be re-used for both
    * passes:
    */
   if (!so->binning_pass)
      ir3_setup_const_state(s, so, ir3_const_state(so));
}

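/* Scan for intrinsics that need driver-supplied constants (SSBO sizes,
 * image dimensions, and driver params like base vertex, user clip planes or
 * workgroup counts) and record how much const space each will need, for
 * ir3_setup_const_state() below to lay out.
 */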
static void
ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_const_state *layout)
{
   nir_foreach_function (function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block (block, function->impl) {
         nir_foreach_instr (instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
            unsigned idx;

            switch (intr->intrinsic) {
            case nir_intrinsic_get_ssbo_size:
               if (ir3_bindless_resource(intr->src[0]))
                  break;
               idx = nir_src_as_uint(intr->src[0]);
               if (layout->ssbo_size.mask & (1 << idx))
                  break;
               layout->ssbo_size.mask |= (1 << idx);
               layout->ssbo_size.off[idx] = layout->ssbo_size.count;
               layout->ssbo_size.count += 1; /* one const per */
               break;
            case nir_intrinsic_image_atomic_add:
            case nir_intrinsic_image_atomic_imin:
            case nir_intrinsic_image_atomic_umin:
            case nir_intrinsic_image_atomic_imax:
            case nir_intrinsic_image_atomic_umax:
            case nir_intrinsic_image_atomic_and:
            case nir_intrinsic_image_atomic_or:
            case nir_intrinsic_image_atomic_xor:
            case nir_intrinsic_image_atomic_exchange:
            case nir_intrinsic_image_atomic_comp_swap:
            case nir_intrinsic_image_store:
            case nir_intrinsic_image_size:
               idx = nir_src_as_uint(intr->src[0]);
               if (layout->image_dims.mask & (1 << idx))
                  break;
               layout->image_dims.mask |= (1 << idx);
               layout->image_dims.off[idx] = layout->image_dims.count;
               layout->image_dims.count += 3; /* three const per */
               break;
            case nir_intrinsic_load_base_vertex:
            case nir_intrinsic_load_first_vertex:
               layout->num_driver_params =
                  MAX2(layout->num_driver_params, IR3_DP_VTXID_BASE + 1);
               break;
            case nir_intrinsic_load_base_instance:
               layout->num_driver_params =
                  MAX2(layout->num_driver_params, IR3_DP_INSTID_BASE + 1);
               break;
            case nir_intrinsic_load_user_clip_plane:
               idx = nir_intrinsic_ucp_id(intr);
               layout->num_driver_params = MAX2(layout->num_driver_params,
                                                IR3_DP_UCP0_X + (idx + 1) * 4);
               break;
            case nir_intrinsic_load_num_workgroups:
               layout->num_driver_params =
                  MAX2(layout->num_driver_params, IR3_DP_NUM_WORK_GROUPS_Z + 1);
               break;
            case nir_intrinsic_load_workgroup_size:
               layout->num_driver_params = MAX2(layout->num_driver_params,
                                                IR3_DP_LOCAL_GROUP_SIZE_Z + 1);
               break;
            case nir_intrinsic_load_base_workgroup_id:
               layout->num_driver_params =
                  MAX2(layout->num_driver_params, IR3_DP_BASE_GROUP_Z + 1);
               break;
            case nir_intrinsic_load_subgroup_size:
               layout->num_driver_params =
                  MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_SIZE + 1);
               break;
            case nir_intrinsic_load_subgroup_id_shift_ir3:
               layout->num_driver_params =
                  MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_ID_SHIFT + 1);
               break;
            default:
               break;
            }
         }
      }
   }
}

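/* With everything present, the resulting const layout (in vec4 units) ends
 * up roughly as:
 *
 *    [ UBO ranges | UBO ptrs | SSBO sizes | image dims | driver params |
 *      TFBO ptrs (a3xx/a4xx VS) | primitive param/map | immediates ]
 *
 * with each section vec4-aligned and unused sections skipped.
 */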
/* Sets up the variant-dependent constant state for the ir3_shader.  Note
 * that it is also used from ir3_nir_analyze_ubo_ranges() to figure out the
 * maximum number of driver params that would eventually be used, to leave
 * space for this function to allocate the driver params.
 */
void
ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
                      struct ir3_const_state *const_state)
{
   struct ir3_compiler *compiler = v->shader->compiler;

   memset(&const_state->offsets, ~0, sizeof(const_state->offsets));

   ir3_nir_scan_driver_consts(nir, const_state);

   if ((compiler->gpu_id < 500) && (v->shader->stream_output.num_outputs > 0)) {
      const_state->num_driver_params =
         MAX2(const_state->num_driver_params, IR3_DP_VTXCNT_MAX + 1);
   }

   const_state->num_ubos = nir->info.num_ubos;

   /* num_driver_params is scalar, align to vec4: */
   const_state->num_driver_params = align(const_state->num_driver_params, 4);

   debug_assert((const_state->ubo_state.size % 16) == 0);
   unsigned constoff = const_state->ubo_state.size / 16;
   unsigned ptrsz = ir3_pointer_size(compiler);

   if (const_state->num_ubos > 0) {
      const_state->offsets.ubo = constoff;
      constoff += align(const_state->num_ubos * ptrsz, 4) / 4;
   }

   if (const_state->ssbo_size.count > 0) {
      unsigned cnt = const_state->ssbo_size.count;
      const_state->offsets.ssbo_sizes = constoff;
      constoff += align(cnt, 4) / 4;
   }

   if (const_state->image_dims.count > 0) {
      unsigned cnt = const_state->image_dims.count;
      const_state->offsets.image_dims = constoff;
      constoff += align(cnt, 4) / 4;
   }

   if (const_state->num_driver_params > 0) {
      /* offset cannot be 0 for vs params loaded by CP_DRAW_INDIRECT_MULTI */
      if (v->type == MESA_SHADER_VERTEX && compiler->gpu_id >= 600)
         constoff = MAX2(constoff, 1);
      const_state->offsets.driver_param = constoff;
   }
   constoff += const_state->num_driver_params / 4;

   if ((v->type == MESA_SHADER_VERTEX) && (compiler->gpu_id < 500) &&
       v->shader->stream_output.num_outputs > 0) {
      const_state->offsets.tfbo = constoff;
      constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
   }

   switch (v->type) {
   case MESA_SHADER_VERTEX:
      const_state->offsets.primitive_param = constoff;
      constoff += 1;
      break;
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_TESS_EVAL:
      constoff = align(constoff - 1, 4) + 3;
      const_state->offsets.primitive_param = constoff;
      const_state->offsets.primitive_map = constoff + 5;
      constoff += 5 + DIV_ROUND_UP(v->input_size, 4);
      break;
   case MESA_SHADER_GEOMETRY:
      const_state->offsets.primitive_param = constoff;
      const_state->offsets.primitive_map = constoff + 1;
      constoff += 1 + DIV_ROUND_UP(v->input_size, 4);
      break;
   default:
      break;
   }

   const_state->offsets.immediate = constoff;

   assert(constoff <= ir3_max_const(v));
}