Path: blob/21.2-virgl/src/compiler/nir/nir_lower_amul.c
/*
 * Copyright © 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "nir.h"
#include "nir_vla.h"

/* Lowering for amul instructions, for drivers that support imul24.
 * This pass will analyze indirect derefs, and convert corresponding
 * amul instructions to either imul or imul24, depending on the
 * required range.
 *
 * 1) Analyze the uniform variables and build a table of UBOs and SSBOs
 *    that are either too large, or might be too large (unknown size),
 *    for imul24.
 *
 * 2) Loop thru all the intrinsics, finding dereferences of large
 *    variables, and recursively replacing all amul instructions
 *    used with imul.
 *
 * 3) Finally loop again thru all instructions, replacing any remaining
 *    amul with imul24.  At this point any remaining amul instructions
 *    are not involved in calculating an offset into a large variable,
 *    thanks to the 2nd step, so they can be safely replaced with imul24.
 *
 * Using two passes over all the instructions lets us handle the case
 * where, due to CSE, an amul is used to calculate an offset into both
 * a large and a small variable.
 */

typedef struct {
   nir_shader *shader;

   int (*type_size)(const struct glsl_type *, bool);

   /* Tables of UBOs and SSBOs mapping driver_location/base to whether
    * they are too large to use imul24:
    */
   bool *large_ubos;
   bool *large_ssbos;

   /* For cases where we cannot determine the UBO/SSBO index, track if *any*
    * UBO/SSBO is too large for imul24:
    */
   bool has_large_ubo;
   bool has_large_ssbo;

   unsigned max_slot;
} lower_state;

/* Lower 'amul's in the offset src of large variables to 'imul': */
static bool
lower_large_src(nir_src *src, void *s)
{
   lower_state *state = s;

   assert(src->is_ssa);

   nir_instr *parent = src->ssa->parent_instr;

   /* No need to visit instructions we've already visited; this also
    * avoids infinite recursion when phis are involved:
    */
   if (parent->pass_flags)
      return false;

   bool progress = nir_foreach_src(parent, lower_large_src, state);

   if (parent->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(parent);
      if (alu->op == nir_op_amul) {
         alu->op = nir_op_imul;
         progress = true;
      }
   }

   parent->pass_flags = 1;

   return progress;
}
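
/* For illustration, a hypothetical NIR snippet (the opcodes/intrinsics are
 * real, the SSA numbering is made up): an offset into a large SSBO computed
 * through an amul,
 *
 *    ssa_3 = amul ssa_1, ssa_2
 *    ssa_4 = iadd ssa_3, ssa_0
 *    ssa_5 = intrinsic load_ssbo (ssa_6, ssa_4)
 *
 * lower_large_src() gets called on the offset source (ssa_4 here) and walks
 * backwards through each parent instruction's sources via nir_foreach_src(),
 * so the amul defining ssa_3 is rewritten in place to:
 *
 *    ssa_3 = imul ssa_1, ssa_2
 *
 * The pass_flags marking in lower_large_src() above is what keeps the walk
 * from revisiting instructions, and from recursing forever through phi
 * cycles in loops.
 */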

static bool
large_ubo(lower_state *state, nir_src src)
{
   if (!nir_src_is_const(src))
      return state->has_large_ubo;
   unsigned idx = nir_src_as_uint(src);
   assert(idx < state->shader->info.num_ubos);
   return state->large_ubos[idx];
}

static bool
large_ssbo(lower_state *state, nir_src src)
{
   if (!nir_src_is_const(src))
      return state->has_large_ssbo;
   unsigned idx = nir_src_as_uint(src);
   assert(idx < state->shader->info.num_ssbos);
   return state->large_ssbos[idx];
}

static bool
lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_load_ubo:
      /* src[] = { buffer_index, offset }. */
      if (large_ubo(state, intr->src[0]))
         return lower_large_src(&intr->src[1], state);
      return false;

   case nir_intrinsic_load_ssbo:
      /* src[] = { buffer_index, offset }. */
      if (large_ssbo(state, intr->src[0]))
         return lower_large_src(&intr->src[1], state);
      return false;

   case nir_intrinsic_store_ssbo:
      /* src[] = { value, block_index, offset }. */
      if (large_ssbo(state, intr->src[1]))
         return lower_large_src(&intr->src[2], state);
      return false;

   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_ssbo_atomic_fmin:
   case nir_intrinsic_ssbo_atomic_fmax:
   case nir_intrinsic_ssbo_atomic_fcomp_swap:
      /* 0: SSBO index
       * 1: offset
       */
      if (large_ssbo(state, intr->src[0]))
         return lower_large_src(&intr->src[1], state);
      return false;

   case nir_intrinsic_global_atomic_add:
   case nir_intrinsic_global_atomic_imin:
   case nir_intrinsic_global_atomic_umin:
   case nir_intrinsic_global_atomic_imax:
   case nir_intrinsic_global_atomic_umax:
   case nir_intrinsic_global_atomic_and:
   case nir_intrinsic_global_atomic_or:
   case nir_intrinsic_global_atomic_xor:
   case nir_intrinsic_global_atomic_exchange:
   case nir_intrinsic_global_atomic_comp_swap:
   case nir_intrinsic_global_atomic_fadd:
   case nir_intrinsic_global_atomic_fmin:
   case nir_intrinsic_global_atomic_fmax:
   case nir_intrinsic_global_atomic_fcomp_swap:
      /* Just assume that 24b is not sufficient: */
      return lower_large_src(&intr->src[0], state);

   /* These should all be small enough to unconditionally use imul24: */
   case nir_intrinsic_shared_atomic_add:
   case nir_intrinsic_shared_atomic_imin:
   case nir_intrinsic_shared_atomic_umin:
   case nir_intrinsic_shared_atomic_imax:
   case nir_intrinsic_shared_atomic_umax:
   case nir_intrinsic_shared_atomic_and:
   case nir_intrinsic_shared_atomic_or:
   case nir_intrinsic_shared_atomic_xor:
   case nir_intrinsic_shared_atomic_exchange:
   case nir_intrinsic_shared_atomic_comp_swap:
   case nir_intrinsic_shared_atomic_fadd:
   case nir_intrinsic_shared_atomic_fmin:
   case nir_intrinsic_shared_atomic_fmax:
   case nir_intrinsic_shared_atomic_fcomp_swap:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_output:
   case nir_intrinsic_store_output:
   default:
      return false;
   }
}

static bool
lower_instr(lower_state *state, nir_instr *instr)
{
   bool progress = false;

   if (instr->type == nir_instr_type_intrinsic) {
      progress |= lower_intrinsic(state, nir_instr_as_intrinsic(instr));
   }

   return progress;
}

static bool
is_large(lower_state *state, nir_variable *var)
{
   const struct glsl_type *type = glsl_without_array(var->type);
   unsigned size = state->type_size(type, false);

   /* If the size is not known (i.e. VLA) then assume the worst: */
   if (!size)
      return true;

   return size >= (1 << 23);
}
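
/* A note on the 1 << 23 threshold above (reasoning assumed, not stated in
 * the original): imul24 operates on signed 24-bit values, so an offset it
 * produces is only trustworthy below 2^23.  If every buffer the offset can
 * index is smaller than 8 MiB (1 << 23 bytes), any in-bounds byte offset
 * fits.  For example, a 1 MiB SSBO of vec4s spans 65536 slots * 16 bytes =
 * 2^20 addressable bytes, comfortably within range.
 */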

bool
nir_lower_amul(nir_shader *shader,
               int (*type_size)(const struct glsl_type *, bool))
{
   assert(shader->options->has_imul24);
   assert(type_size);

   NIR_VLA_FILL(bool, large_ubos, shader->info.num_ubos, 0);
   NIR_VLA_FILL(bool, large_ssbos, shader->info.num_ssbos, 0);

   lower_state state = {
      .shader = shader,
      .type_size = type_size,
      .large_ubos = large_ubos,
      .large_ssbos = large_ssbos,
   };

   /* Figure out which UBOs or SSBOs are large enough to be
    * disqualified from imul24:
    */
   nir_foreach_variable_in_shader (var, shader) {
      if (var->data.mode == nir_var_mem_ubo) {
         if (is_large(&state, var)) {
            state.has_large_ubo = true;
            unsigned size = MAX2(1, glsl_array_size(var->type));
            for (unsigned i = 0; i < size; i++)
               state.large_ubos[var->data.binding + i] = true;
         }
      } else if (var->data.mode == nir_var_mem_ssbo) {
         if (is_large(&state, var)) {
            state.has_large_ssbo = true;
            unsigned size = MAX2(1, glsl_array_size(var->type));
            for (unsigned i = 0; i < size; i++)
               state.large_ssbos[var->data.binding + i] = true;
         }
      }
   }

   /* clear pass flags: */
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;
      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            instr->pass_flags = 0;
         }
      }
   }

   bool progress = false;
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;

      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            progress |= lower_instr(&state, instr);
         }
      }
   }

   /* At this point, all 'amul's used in calculating an offset into
    * a large variable have been replaced with 'imul'.  So remaining
    * 'amul's can be replaced with 'imul24':
    */
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;

      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_alu)
               continue;

            nir_alu_instr *alu = nir_instr_as_alu(instr);
            if (alu->op != nir_op_amul)
               continue;

            alu->op = nir_op_imul24;
            progress |= true;
         }
      }

      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   }

   return progress;
}
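
/* Usage sketch (hypothetical caller, not part of this file): a backend whose
 * compiler options advertise has_imul24 would typically run this late in its
 * NIR pipeline, passing the same type_size callback it already uses for i/o
 * lowering.  'my_glsl_type_size' below is an assumed placeholder name:
 *
 *    bool progress = false;
 *    if (nir->options->has_imul24)
 *       NIR_PASS(progress, nir, nir_lower_amul, my_glsl_type_size);
 */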