Path: blob/21.2-virgl/src/panfrost/bifrost/bifrost_compile.c
/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <[email protected]>
 */

#include "main/mtypes.h"
#include "compiler/glsl/glsl_to_nir.h"
#include "compiler/nir_types.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_debug.h"

#include "disassemble.h"
#include "bifrost_compile.h"
#include "compiler.h"
#include "bi_quirks.h"
#include "bi_builder.h"
#include "bifrost_nir.h"

static const struct debug_named_value bifrost_debug_options[] = {
        {"msgs",     BIFROST_DBG_MSGS,     "Print debug messages"},
        {"shaders",  BIFROST_DBG_SHADERS,  "Dump shaders in NIR and MIR"},
        {"shaderdb", BIFROST_DBG_SHADERDB, "Print statistics"},
        {"verbose",  BIFROST_DBG_VERBOSE,  "Disassemble verbosely"},
        {"internal", BIFROST_DBG_INTERNAL, "Dump even internal shaders"},
        {"nosched",  BIFROST_DBG_NOSCHED,  "Force trivial bundling"},
        {"inorder",  BIFROST_DBG_INORDER,  "Force in-order bundling"},
        DEBUG_NAMED_VALUE_END
};

DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0)
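
/* Usage sketch: DEBUG_GET_ONCE_FLAGS_OPTION above generates a getter that
 * parses the comma-separated BIFROST_MESA_DEBUG environment variable against
 * bifrost_debug_options (assuming Mesa's standard u_debug flag parsing), so
 * for example
 *
 *    BIFROST_MESA_DEBUG=msgs,shaders <some GLES app>
 *
 * should print compiler debug messages and dump shaders in NIR and MIR form. */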
/* How many bytes are prefetched by the Bifrost shader core. From the final
 * clause of the shader, this range must be valid instructions or zero. */
#define BIFROST_SHADER_PREFETCH 128

int bifrost_debug = 0;

#define DBG(fmt, ...) \
                do { if (bifrost_debug & BIFROST_DBG_MSGS) \
                        fprintf(stderr, "%s:%d: "fmt, \
                                __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)

static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);

static void
bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
{
        bi_instr *branch = bi_jump(b, bi_zero());

        switch (instr->type) {
        case nir_jump_break:
                branch->branch_target = b->shader->break_block;
                break;
        case nir_jump_continue:
                branch->branch_target = b->shader->continue_block;
                break;
        default:
                unreachable("Unhandled jump type");
        }

        pan_block_add_successor(&b->shader->current_block->base, &branch->branch_target->base);
        b->shader->current_block->base.unconditional_jumps = true;
}

static bi_index
bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
{
        switch (intr->intrinsic) {
        case nir_intrinsic_load_barycentric_centroid:
        case nir_intrinsic_load_barycentric_sample:
                return bi_register(61);

        /* Need to put the sample ID in the top 16-bits */
        case nir_intrinsic_load_barycentric_at_sample:
                return bi_mkvec_v2i16(b, bi_half(bi_dontcare(), false),
                                bi_half(bi_src_index(&intr->src[0]), false));

        /* Interpret as 8:8 signed fixed point positions in pixels along X and
         * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
         * is the center of the pixel so we first fixup and then convert. For
         * fp16 input:
         *
         * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
         * f2i16((256 * (x, y)) + (128, 128)) =
         * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
         *
         * For fp32 input, that lacks enough precision for MSAA 16x, but the
         * idea is the same. FIXME: still doesn't pass
         */
        case nir_intrinsic_load_barycentric_at_offset: {
                bi_index offset = bi_src_index(&intr->src[0]);
                bi_index f16 = bi_null();
                unsigned sz = nir_src_bit_size(intr->src[0]);

                if (sz == 16) {
                        f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0),
                                        bi_imm_f16(128.0), BI_ROUND_NONE);
                } else {
                        assert(sz == 32);
                        bi_index f[2];
                        for (unsigned i = 0; i < 2; ++i) {
                                f[i] = bi_fadd_rscale_f32(b,
                                                bi_word(offset, i),
                                                bi_imm_f32(0.5), bi_imm_u32(8),
                                                BI_ROUND_NONE, BI_SPECIAL_NONE);
                        }

                        f16 = bi_v2f32_to_v2f16(b, f[0], f[1], BI_ROUND_NONE);
                }

                return bi_v2f16_to_v2s16(b, f16, BI_ROUND_RTZ);
        }

        case nir_intrinsic_load_barycentric_pixel:
        default:
                return bi_dontcare();
        }
}

static enum bi_sample
bi_interp_for_intrinsic(nir_intrinsic_op op)
{
        switch (op) {
        case nir_intrinsic_load_barycentric_centroid:
                return BI_SAMPLE_CENTROID;
        case nir_intrinsic_load_barycentric_sample:
        case nir_intrinsic_load_barycentric_at_sample:
                return BI_SAMPLE_SAMPLE;
        case nir_intrinsic_load_barycentric_at_offset:
                return BI_SAMPLE_EXPLICIT;
        case nir_intrinsic_load_barycentric_pixel:
        default:
                return BI_SAMPLE_CENTER;
        }
}

/* auto, 64-bit omitted */
static enum bi_register_format
bi_reg_fmt_for_nir(nir_alu_type T)
{
        switch (T) {
        case nir_type_float16: return BI_REGISTER_FORMAT_F16;
        case nir_type_float32: return BI_REGISTER_FORMAT_F32;
        case nir_type_int16:   return BI_REGISTER_FORMAT_S16;
        case nir_type_uint16:  return BI_REGISTER_FORMAT_U16;
        case nir_type_int32:   return BI_REGISTER_FORMAT_S32;
        case nir_type_uint32:  return BI_REGISTER_FORMAT_U32;
        default: unreachable("Invalid type for register format");
        }
}

/* Checks if the _IMM variant of an intrinsic can be used, returning in imm the
 * immediate to be used (which applies even if _IMM can't be used) */

static bool
bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max)
{
        nir_src *offset = nir_get_io_offset_src(instr);

        if (!nir_src_is_const(*offset))
                return false;

        *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
        return (*immediate) < max;
}

static void
bi_make_vec_to(bi_builder *b, bi_index final_dst,
                bi_index *src,
                unsigned *channel,
                unsigned count,
                unsigned bitsize);

/* Bifrost's load instructions lack a component offset despite operating in
 * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
 * but they may be unavoidable with separate shaders in use. To solve this, we
 * lower to a larger load and an explicit copy of the desired components. */

static void
bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
{
        unsigned component = nir_intrinsic_component(instr);

        if (component == 0)
                return;

        bi_index srcs[] = { tmp, tmp, tmp, tmp };
        unsigned channels[] = { component, component + 1, component + 2 };

        bi_make_vec_to(b,
                        bi_dest_index(&instr->dest),
                        srcs, channels, instr->num_components,
                        nir_dest_bit_size(instr->dest));
}

static void
bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
{
        nir_alu_type T = nir_intrinsic_dest_type(instr);
        enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
        nir_src *offset = nir_get_io_offset_src(instr);
        unsigned component = nir_intrinsic_component(instr);
        enum bi_vecsize vecsize = (instr->num_components + component - 1);
        unsigned imm_index = 0;
        unsigned base = nir_intrinsic_base(instr);
        bool constant = nir_src_is_const(*offset);
        bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
        bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);

        if (immediate) {
                bi_ld_attr_imm_to(b, dest, bi_register(61), bi_register(62),
                                regfmt, vecsize, imm_index);
        } else {
                bi_index idx = bi_src_index(&instr->src[0]);

                if (constant)
                        idx = bi_imm_u32(imm_index);
                else if (base != 0)
                        idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);

                bi_ld_attr_to(b, dest, bi_register(61), bi_register(62),
                                idx, regfmt, vecsize);
        }

        bi_copy_component(b, instr, dest);
}

static void
bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
{
        enum bi_sample sample = BI_SAMPLE_CENTER;
        enum bi_update update = BI_UPDATE_STORE;
        enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
        bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
        bi_index src0 = bi_null();

        unsigned component = nir_intrinsic_component(instr);
        enum bi_vecsize vecsize = (instr->num_components + component - 1);
        bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);

        unsigned sz = nir_dest_bit_size(instr->dest);

        if (smooth) {
                nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
                assert(parent);

                sample = bi_interp_for_intrinsic(parent->intrinsic);
                src0 = bi_varying_src0_for_barycentric(b, parent);

                assert(sz == 16 || sz == 32);
                regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16
                        : BI_REGISTER_FORMAT_F32;
        } else {
                assert(sz == 32);
                regfmt = BI_REGISTER_FORMAT_U32;
        }

        nir_src *offset = nir_get_io_offset_src(instr);
        unsigned imm_index = 0;
        bool immediate = bi_is_intr_immediate(instr, &imm_index, 20);

        if (immediate && smooth) {
                bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
                                vecsize, imm_index);
        } else if (immediate && !smooth) {
                bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt,
                                vecsize, imm_index);
        } else {
                bi_index idx = bi_src_index(offset);
                unsigned base = nir_intrinsic_base(instr);

                if (base != 0)
                        idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);

                if (smooth) {
                        bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
                                        update, vecsize);
                } else {
                        bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE,
                                        regfmt, vecsize);
                }
        }

        bi_copy_component(b, instr, dest);
}

static void
bi_make_vec16_to(bi_builder *b, bi_index dst, bi_index *src,
                unsigned *channel, unsigned count)
{
        for (unsigned i = 0; i < count; i += 2) {
                bool next = (i + 1) < count;

                unsigned chan = channel ? channel[i] : 0;
                unsigned nextc = next && channel ? channel[i + 1] : 0;

                bi_index w0 = bi_word(src[i], chan >> 1);
                bi_index w1 = next ? bi_word(src[i + 1], nextc >> 1) : bi_zero();

                bi_index h0 = bi_half(w0, chan & 1);
                bi_index h1 = bi_half(w1, nextc & 1);

                bi_index to = bi_word(dst, i >> 1);

                if (bi_is_word_equiv(w0, w1) && (chan & 1) == 0 && ((nextc & 1) == 1))
                        bi_mov_i32_to(b, to, w0);
                else if (bi_is_word_equiv(w0, w1))
                        bi_swz_v2i16_to(b, to, bi_swz_16(w0, chan & 1, nextc & 1));
                else
                        bi_mkvec_v2i16_to(b, to, h0, h1);
        }
}

static void
bi_make_vec_to(bi_builder *b, bi_index final_dst,
                bi_index *src,
                unsigned *channel,
                unsigned count,
                unsigned bitsize)
{
        /* If we read our own output, we need a temporary move to allow for
         * swapping. TODO: Could do a bit better for pairwise swaps of 16-bit
         * vectors */
        bool reads_self = false;

        for (unsigned i = 0; i < count; ++i)
                reads_self |= bi_is_equiv(final_dst, src[i]);

        /* SSA can't read itself */
        assert(!reads_self || final_dst.reg);

        bi_index dst = reads_self ? bi_temp(b->shader) : final_dst;

        if (bitsize == 32) {
                for (unsigned i = 0; i < count; ++i) {
                        bi_mov_i32_to(b, bi_word(dst, i),
                                        bi_word(src[i], channel ? channel[i] : 0));
                }
        } else if (bitsize == 16) {
                bi_make_vec16_to(b, dst, src, channel, count);
        } else if (bitsize == 8 && count == 1) {
                bi_swz_v4i8_to(b, dst, bi_byte(
                                bi_word(src[0], channel[0] >> 2),
                                channel[0] & 3));
        } else {
                unreachable("8-bit mkvec not yet supported");
        }

        /* Emit an explicit copy if needed */
        if (!bi_is_equiv(dst, final_dst)) {
                unsigned shift = (bitsize == 8) ? 2 : (bitsize == 16) ?
1 : 0;372unsigned vec = (1 << shift);373374for (unsigned i = 0; i < count; i += vec) {375bi_mov_i32_to(b, bi_word(final_dst, i >> shift),376bi_word(dst, i >> shift));377}378}379}380381static bi_instr *382bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval,383unsigned nr_components, unsigned offset)384{385unsigned sysval_ubo =386MAX2(b->shader->inputs->sysval_ubo, b->shader->nir->info.num_ubos);387unsigned uniform =388pan_lookup_sysval(b->shader->sysval_to_id,389&b->shader->info->sysvals,390sysval);391unsigned idx = (uniform * 16) + offset;392393return bi_load_to(b, nr_components * 32, dest,394bi_imm_u32(idx),395bi_imm_u32(sysval_ubo), BI_SEG_UBO);396}397398static void399bi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr,400unsigned nr_components, unsigned offset)401{402bi_load_sysval_to(b, bi_dest_index(&intr->dest),403panfrost_sysval_for_instr(&intr->instr, NULL),404nr_components, offset);405}406407static bi_index408bi_load_sysval(bi_builder *b, int sysval,409unsigned nr_components, unsigned offset)410{411bi_index tmp = bi_temp(b->shader);412bi_load_sysval_to(b, tmp, sysval, nr_components, offset);413return tmp;414}415416static void417bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)418{419ASSERTED nir_io_semantics sem = nir_intrinsic_io_semantics(instr);420421/* Source color is passed through r0-r3, or r4-r7 for the second422* source when dual-source blending. TODO: Precolour instead */423bi_index srcs[] = {424bi_register(0), bi_register(1), bi_register(2), bi_register(3)425};426bi_index srcs2[] = {427bi_register(4), bi_register(5), bi_register(6), bi_register(7)428};429430bool second_source = (sem.location == VARYING_SLOT_VAR0);431432bi_make_vec_to(b, bi_dest_index(&instr->dest),433second_source ? srcs2 : srcs,434NULL, 4, 32);435}436437static void438bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, unsigned rt)439{440/* Reads 2 or 4 staging registers to cover the input */441unsigned sr_count = (nir_alu_type_get_type_size(T) <= 16) ? 2 : 4;442443if (b->shader->inputs->is_blend) {444uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc;445446/* Blend descriptor comes from the compile inputs */447/* Put the result in r0 */448bi_blend_to(b, bi_register(0), rgba,449bi_register(60),450bi_imm_u32(blend_desc & 0xffffffff),451bi_imm_u32(blend_desc >> 32), sr_count);452} else {453/* Blend descriptor comes from the FAU RAM. By convention, the454* return address is stored in r48 and will be used by the455* blend shader to jump back to the fragment shader after */456bi_blend_to(b, bi_register(48), rgba,457bi_register(60),458bi_fau(BIR_FAU_BLEND_0 + rt, false),459bi_fau(BIR_FAU_BLEND_0 + rt, true), sr_count);460}461462assert(rt < 8);463b->shader->info->bifrost.blend[rt].type = T;464}465466/* Blend shaders do not need to run ATEST since they are dependent on a467* fragment shader that runs it. Blit shaders may not need to run ATEST, since468* ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and469* there are no writes to the coverage mask. 
The latter two are satisfied for470* all blit shaders, so we just care about early-z, which blit shaders force471* iff they do not write depth or stencil */472473static bool474bi_skip_atest(bi_context *ctx, bool emit_zs)475{476return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;477}478479static void480bi_emit_atest(bi_builder *b, bi_index alpha)481{482bi_index coverage = bi_register(60);483bi_instr *atest = bi_atest_to(b, coverage, coverage, alpha);484b->shader->emitted_atest = true;485486/* Pseudo-source to encode in the tuple */487atest->src[2] = bi_fau(BIR_FAU_ATEST_PARAM, false);488}489490static void491bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)492{493bool combined = instr->intrinsic ==494nir_intrinsic_store_combined_output_pan;495496unsigned writeout = combined ? nir_intrinsic_component(instr) :497PAN_WRITEOUT_C;498499bool emit_blend = writeout & (PAN_WRITEOUT_C);500bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);501502const nir_variable *var =503nir_find_variable_with_driver_location(b->shader->nir,504nir_var_shader_out, nir_intrinsic_base(instr));505assert(var);506507unsigned loc = var->data.location;508bi_index src0 = bi_src_index(&instr->src[0]);509510/* By ISA convention, the coverage mask is stored in R60. The store511* itself will be handled by a subsequent ATEST instruction */512if (loc == FRAG_RESULT_SAMPLE_MASK) {513bi_index orig = bi_register(60);514bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0);515bi_index new = bi_lshift_and_i32(b, orig, src0, bi_imm_u8(0));516bi_mux_i32_to(b, orig, orig, new, msaa, BI_MUX_INT_ZERO);517return;518}519520521/* Dual-source blending is implemented by putting the color in522* registers r4-r7. */523if (var->data.index) {524unsigned count = nir_src_num_components(instr->src[0]);525526for (unsigned i = 0; i < count; ++i)527bi_mov_i32_to(b, bi_register(4 + i), bi_word(src0, i));528529b->shader->info->bifrost.blend_src1_type =530nir_intrinsic_src_type(instr);531532return;533}534535/* Emit ATEST if we have to, note ATEST requires a floating-point alpha536* value, but render target #0 might not be floating point. However the537* alpha value is only used for alpha-to-coverage, a stage which is538* skipped for pure integer framebuffers, so the issue is moot. */539540if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {541nir_alu_type T = nir_intrinsic_src_type(instr);542543bi_index rgba = bi_src_index(&instr->src[0]);544bi_index alpha =545(T == nir_type_float16) ? bi_half(bi_word(rgba, 1), true) :546(T == nir_type_float32) ? 
bi_word(rgba, 3) :547bi_dontcare();548549/* Don't read out-of-bounds */550if (nir_src_num_components(instr->src[0]) < 4)551alpha = bi_imm_f32(1.0);552553bi_emit_atest(b, alpha);554}555556if (emit_zs) {557bi_index z = { 0 }, s = { 0 };558559if (writeout & PAN_WRITEOUT_Z)560z = bi_src_index(&instr->src[2]);561562if (writeout & PAN_WRITEOUT_S)563s = bi_src_index(&instr->src[3]);564565bi_zs_emit_to(b, bi_register(60), z, s, bi_register(60),566writeout & PAN_WRITEOUT_S,567writeout & PAN_WRITEOUT_Z);568}569570if (emit_blend) {571assert(loc >= FRAG_RESULT_DATA0);572573unsigned rt = (loc - FRAG_RESULT_DATA0);574bi_index color = bi_src_index(&instr->src[0]);575576/* Explicit copy since BLEND inputs are precoloured to R0-R3,577* TODO: maybe schedule around this or implement in RA as a578* spill */579bool has_mrt = false;580581nir_foreach_shader_out_variable(var, b->shader->nir)582has_mrt |= (var->data.location > FRAG_RESULT_DATA0);583584if (has_mrt) {585bi_index srcs[4] = { color, color, color, color };586unsigned channels[4] = { 0, 1, 2, 3 };587color = bi_temp(b->shader);588bi_make_vec_to(b, color, srcs, channels,589nir_src_num_components(instr->src[0]),590nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));591}592593bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), rt);594}595596if (b->shader->inputs->is_blend) {597/* Jump back to the fragment shader, return address is stored598* in r48 (see above).599*/600bi_jump(b, bi_register(48));601}602}603604static void605bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)606{607/* In principle we can do better for 16-bit. At the moment we require608* 32-bit to permit the use of .auto, in order to force .u32 for flat609* varyings, to handle internal TGSI shaders that set flat in the VS610* but smooth in the FS */611612ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);613assert(nir_alu_type_get_type_size(T) == 32);614enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;615616unsigned imm_index = 0;617bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);618619bi_index address;620if (immediate) {621address = bi_lea_attr_imm(b,622bi_register(61), bi_register(62),623regfmt, imm_index);624} else {625bi_index idx =626bi_iadd_u32(b,627bi_src_index(nir_get_io_offset_src(instr)),628bi_imm_u32(nir_intrinsic_base(instr)),629false);630address = bi_lea_attr(b,631bi_register(61), bi_register(62),632idx, regfmt);633}634635/* Only look at the total components needed. In effect, we fill in all636* the intermediate "holes" in the write mask, since we can't mask off637* stores. Since nir_lower_io_to_temporaries ensures each varying is638* written at most once, anything that's masked out is undefined, so it639* doesn't matter what we write there. So we may as well do the640* simplest thing possible. */641unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));642assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));643644bi_st_cvt(b, bi_src_index(&instr->src[0]), address,645bi_word(address, 1), bi_word(address, 2),646regfmt, nr - 1);647}648649static void650bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)651{652nir_src *offset = nir_get_io_offset_src(instr);653654bool offset_is_const = nir_src_is_const(*offset);655bi_index dyn_offset = bi_src_index(offset);656uint32_t const_offset = offset_is_const ? 
nir_src_as_uint(*offset) : 0;657bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);658659bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest),660bi_dest_index(&instr->dest), offset_is_const ?661bi_imm_u32(const_offset) : dyn_offset,662kernel_input ? bi_zero() : bi_src_index(&instr->src[0]),663BI_SEG_UBO);664}665666static bi_index667bi_addr_high(nir_src *src)668{669return (nir_src_bit_size(*src) == 64) ?670bi_word(bi_src_index(src), 1) : bi_zero();671}672673static void674bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)675{676bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest),677bi_dest_index(&instr->dest),678bi_src_index(&instr->src[0]), bi_addr_high(&instr->src[0]),679seg);680}681682static void683bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)684{685/* Require contiguous masks, gauranteed by nir_lower_wrmasks */686assert(nir_intrinsic_write_mask(instr) ==687BITFIELD_MASK(instr->num_components));688689bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),690bi_src_index(&instr->src[0]),691bi_src_index(&instr->src[1]), bi_addr_high(&instr->src[1]),692seg);693}694695/* Exchanges the staging register with memory */696697static void698bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg)699{700assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);701702unsigned sz = nir_src_bit_size(*arg);703assert(sz == 32 || sz == 64);704705bi_index data = bi_src_index(arg);706707bi_index data_words[] = {708bi_word(data, 0),709bi_word(data, 1),710};711712bi_index inout = bi_temp_reg(b->shader);713bi_make_vec_to(b, inout, data_words, NULL, sz / 32, 32);714715bi_axchg_to(b, sz, inout, inout,716bi_word(addr, 0),717(seg == BI_SEG_NONE) ? bi_word(addr, 1) : bi_zero(),718seg);719720bi_index inout_words[] = {721bi_word(inout, 0),722bi_word(inout, 1),723};724725bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);726}727728/* Exchanges the second staging register with memory if comparison with first729* staging register passes */730731static void732bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg)733{734assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);735736/* hardware is swapped from NIR */737bi_index src0 = bi_src_index(arg_2);738bi_index src1 = bi_src_index(arg_1);739740unsigned sz = nir_src_bit_size(*arg_1);741assert(sz == 32 || sz == 64);742743bi_index data_words[] = {744bi_word(src0, 0),745sz == 32 ? bi_word(src1, 0) : bi_word(src0, 1),746747/* 64-bit */748bi_word(src1, 0),749bi_word(src1, 1),750};751752bi_index inout = bi_temp_reg(b->shader);753bi_make_vec_to(b, inout, data_words, NULL, 2 * (sz / 32), 32);754755bi_acmpxchg_to(b, sz, inout, inout,756bi_word(addr, 0),757(seg == BI_SEG_NONE) ? 
bi_word(addr, 1) : bi_zero(),758seg);759760bi_index inout_words[] = {761bi_word(inout, 0),762bi_word(inout, 1),763};764765bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);766}767768/* Extracts an atomic opcode */769770static enum bi_atom_opc771bi_atom_opc_for_nir(nir_intrinsic_op op)772{773switch (op) {774case nir_intrinsic_global_atomic_add:775case nir_intrinsic_shared_atomic_add:776case nir_intrinsic_image_atomic_add:777return BI_ATOM_OPC_AADD;778779case nir_intrinsic_global_atomic_imin:780case nir_intrinsic_shared_atomic_imin:781case nir_intrinsic_image_atomic_imin:782return BI_ATOM_OPC_ASMIN;783784case nir_intrinsic_global_atomic_umin:785case nir_intrinsic_shared_atomic_umin:786case nir_intrinsic_image_atomic_umin:787return BI_ATOM_OPC_AUMIN;788789case nir_intrinsic_global_atomic_imax:790case nir_intrinsic_shared_atomic_imax:791case nir_intrinsic_image_atomic_imax:792return BI_ATOM_OPC_ASMAX;793794case nir_intrinsic_global_atomic_umax:795case nir_intrinsic_shared_atomic_umax:796case nir_intrinsic_image_atomic_umax:797return BI_ATOM_OPC_AUMAX;798799case nir_intrinsic_global_atomic_and:800case nir_intrinsic_shared_atomic_and:801case nir_intrinsic_image_atomic_and:802return BI_ATOM_OPC_AAND;803804case nir_intrinsic_global_atomic_or:805case nir_intrinsic_shared_atomic_or:806case nir_intrinsic_image_atomic_or:807return BI_ATOM_OPC_AOR;808809case nir_intrinsic_global_atomic_xor:810case nir_intrinsic_shared_atomic_xor:811case nir_intrinsic_image_atomic_xor:812return BI_ATOM_OPC_AXOR;813814default:815unreachable("Unexpected computational atomic");816}817}818819/* Optimized unary atomics are available with an implied #1 argument */820821static bool822bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)823{824/* Check we have a compatible constant */825if (arg.type != BI_INDEX_CONSTANT)826return false;827828if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))829return false;830831/* Check for a compatible operation */832switch (op) {833case BI_ATOM_OPC_AADD:834*out = (arg.value == 1) ? 
BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;835return true;836case BI_ATOM_OPC_ASMAX:837*out = BI_ATOM_OPC_ASMAX1;838return true;839case BI_ATOM_OPC_AUMAX:840*out = BI_ATOM_OPC_AUMAX1;841return true;842case BI_ATOM_OPC_AOR:843*out = BI_ATOM_OPC_AOR1;844return true;845default:846return false;847}848}849850/* Coordinates are 16-bit integers in Bifrost but 32-bit in NIR */851852static bi_index853bi_emit_image_coord(bi_builder *b, bi_index coord)854{855return bi_mkvec_v2i16(b,856bi_half(bi_word(coord, 0), false),857bi_half(bi_word(coord, 1), false));858}859860static bi_index861bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr)862{863nir_src src = instr->src[0];864bi_index index = bi_src_index(&src);865bi_context *ctx = b->shader;866867/* Images come after vertex attributes, so handle an explicit offset */868unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ?869util_bitcount64(ctx->nir->info.inputs_read) : 0;870871if (offset == 0)872return index;873else if (nir_src_is_const(src))874return bi_imm_u32(nir_src_as_uint(src) + offset);875else876return bi_iadd_u32(b, index, bi_imm_u32(offset), false);877}878879static void880bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)881{882enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);883ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);884885bi_index coords = bi_src_index(&instr->src[1]);886/* TODO: MSAA */887assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");888889bi_ld_attr_tex_to(b, bi_dest_index(&instr->dest),890bi_emit_image_coord(b, coords),891bi_emit_image_coord(b, bi_word(coords, 2)),892bi_emit_image_index(b, instr),893bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)),894instr->num_components - 1);895}896897static bi_index898bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)899{900enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);901ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);902903/* TODO: MSAA */904assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");905906enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ?907bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) :908BI_REGISTER_FORMAT_AUTO;909910bi_index coords = bi_src_index(&instr->src[1]);911bi_index xy = bi_emit_image_coord(b, coords);912bi_index zw = bi_emit_image_coord(b, bi_word(coords, 2));913914bi_instr *I = bi_lea_attr_tex_to(b, bi_temp(b->shader), xy, zw,915bi_emit_image_index(b, instr), type);916917/* LEA_ATTR_TEX defaults to the secondary attribute table, but our ABI918* has all images in the primary attribute table */919I->table = BI_TABLE_ATTRIBUTE_1;920921return I->dest[0];922}923924static void925bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)926{927bi_index addr = bi_emit_lea_image(b, instr);928929bi_st_cvt(b, bi_src_index(&instr->src[3]),930addr, bi_word(addr, 1), bi_word(addr, 2),931bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)),932instr->num_components - 1);933}934935static void936bi_emit_atomic_i32_to(bi_builder *b, bi_index dst,937bi_index addr, bi_index arg, nir_intrinsic_op intrinsic)938{939/* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't940* take any vector but can still output in RETURN mode */941bi_index sr = bi_temp_reg(b->shader);942943enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic);944enum bi_atom_opc post_opc = opc;945946/* Generate either ATOM_C or ATOM_C1 as required */947if (bi_promote_atom_c1(opc, arg, &opc)) {948bi_patom_c1_i32_to(b, sr, bi_word(addr, 
0),949bi_word(addr, 1), opc, 2);950} else {951bi_mov_i32_to(b, sr, arg);952bi_patom_c_i32_to(b, sr, sr, bi_word(addr, 0),953bi_word(addr, 1), opc, 2);954}955956/* Post-process it */957bi_atom_post_i32_to(b, dst, bi_word(sr, 0), bi_word(sr, 1), post_opc);958}959960/* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5961* gl_FragCoord.z = ld_vary(fragz)962* gl_FragCoord.w = ld_vary(fragw)963*/964965static void966bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)967{968bi_index src[4] = {};969970for (unsigned i = 0; i < 2; ++i) {971src[i] = bi_fadd_f32(b,972bi_u16_to_f32(b, bi_half(bi_register(59), i)),973bi_imm_f32(0.5f), BI_ROUND_NONE);974}975976for (unsigned i = 0; i < 2; ++i) {977src[2 + i] = bi_ld_var_special(b, bi_zero(),978BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,979BI_UPDATE_CLOBBER,980(i == 0) ? BI_VARYING_NAME_FRAG_Z :981BI_VARYING_NAME_FRAG_W,982BI_VECSIZE_NONE);983}984985bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32);986}987988static void989bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)990{991unsigned rt = b->shader->inputs->blend.rt;992unsigned size = nir_dest_bit_size(instr->dest);993994/* Get the render target */995if (!b->shader->inputs->is_blend) {996const nir_variable *var =997nir_find_variable_with_driver_location(b->shader->nir,998nir_var_shader_out, nir_intrinsic_base(instr));999unsigned loc = var->data.location;1000assert(loc >= FRAG_RESULT_DATA0);1001rt = (loc - FRAG_RESULT_DATA0);1002}10031004/* We want to load the current pixel.1005* FIXME: The sample to load is currently hardcoded to 0. This should1006* be addressed for multi-sample FBs.1007*/1008struct bifrost_pixel_indices pix = {1009.y = BIFROST_CURRENT_PIXEL,1010.rt = rt1011};10121013bi_index desc = b->shader->inputs->is_blend ?1014bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) :1015bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0);10161017uint32_t indices = 0;1018memcpy(&indices, &pix, sizeof(indices));10191020bi_ld_tile_to(b, bi_dest_index(&instr->dest), bi_imm_u32(indices),1021bi_register(60), desc, (instr->num_components - 1));1022}10231024static void1025bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)1026{1027bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?1028bi_dest_index(&instr->dest) : bi_null();1029gl_shader_stage stage = b->shader->stage;10301031switch (instr->intrinsic) {1032case nir_intrinsic_load_barycentric_pixel:1033case nir_intrinsic_load_barycentric_centroid:1034case nir_intrinsic_load_barycentric_sample:1035case nir_intrinsic_load_barycentric_at_sample:1036case nir_intrinsic_load_barycentric_at_offset:1037/* handled later via load_vary */1038break;1039case nir_intrinsic_load_interpolated_input:1040case nir_intrinsic_load_input:1041if (b->shader->inputs->is_blend)1042bi_emit_load_blend_input(b, instr);1043else if (stage == MESA_SHADER_FRAGMENT)1044bi_emit_load_vary(b, instr);1045else if (stage == MESA_SHADER_VERTEX)1046bi_emit_load_attr(b, instr);1047else1048unreachable("Unsupported shader stage");1049break;10501051case nir_intrinsic_store_output:1052if (stage == MESA_SHADER_FRAGMENT)1053bi_emit_fragment_out(b, instr);1054else if (stage == MESA_SHADER_VERTEX)1055bi_emit_store_vary(b, instr);1056else1057unreachable("Unsupported shader stage");1058break;10591060case nir_intrinsic_store_combined_output_pan:1061assert(stage == MESA_SHADER_FRAGMENT);1062bi_emit_fragment_out(b, instr);1063break;10641065case nir_intrinsic_load_ubo:1066case nir_intrinsic_load_kernel_input:1067bi_emit_load_ubo(b, 
instr);1068break;10691070case nir_intrinsic_load_global:1071case nir_intrinsic_load_global_constant:1072bi_emit_load(b, instr, BI_SEG_NONE);1073break;10741075case nir_intrinsic_store_global:1076bi_emit_store(b, instr, BI_SEG_NONE);1077break;10781079case nir_intrinsic_load_scratch:1080bi_emit_load(b, instr, BI_SEG_TL);1081break;10821083case nir_intrinsic_store_scratch:1084bi_emit_store(b, instr, BI_SEG_TL);1085break;10861087case nir_intrinsic_load_shared:1088bi_emit_load(b, instr, BI_SEG_WLS);1089break;10901091case nir_intrinsic_store_shared:1092bi_emit_store(b, instr, BI_SEG_WLS);1093break;10941095/* Blob doesn't seem to do anything for memory barriers, note +BARRIER1096* is illegal in fragment shaders */1097case nir_intrinsic_memory_barrier:1098case nir_intrinsic_memory_barrier_buffer:1099case nir_intrinsic_memory_barrier_image:1100case nir_intrinsic_memory_barrier_shared:1101case nir_intrinsic_group_memory_barrier:1102break;11031104case nir_intrinsic_control_barrier:1105assert(b->shader->stage != MESA_SHADER_FRAGMENT);1106bi_barrier(b);1107break;11081109case nir_intrinsic_shared_atomic_add:1110case nir_intrinsic_shared_atomic_imin:1111case nir_intrinsic_shared_atomic_umin:1112case nir_intrinsic_shared_atomic_imax:1113case nir_intrinsic_shared_atomic_umax:1114case nir_intrinsic_shared_atomic_and:1115case nir_intrinsic_shared_atomic_or:1116case nir_intrinsic_shared_atomic_xor: {1117assert(nir_src_bit_size(instr->src[1]) == 32);11181119bi_index addr = bi_seg_add_i64(b, bi_src_index(&instr->src[0]),1120bi_zero(), false, BI_SEG_WLS);11211122bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]),1123instr->intrinsic);1124break;1125}11261127case nir_intrinsic_image_atomic_add:1128case nir_intrinsic_image_atomic_imin:1129case nir_intrinsic_image_atomic_umin:1130case nir_intrinsic_image_atomic_imax:1131case nir_intrinsic_image_atomic_umax:1132case nir_intrinsic_image_atomic_and:1133case nir_intrinsic_image_atomic_or:1134case nir_intrinsic_image_atomic_xor:1135assert(nir_src_bit_size(instr->src[3]) == 32);11361137bi_emit_atomic_i32_to(b, dst,1138bi_emit_lea_image(b, instr),1139bi_src_index(&instr->src[3]),1140instr->intrinsic);1141break;11421143case nir_intrinsic_global_atomic_add:1144case nir_intrinsic_global_atomic_imin:1145case nir_intrinsic_global_atomic_umin:1146case nir_intrinsic_global_atomic_imax:1147case nir_intrinsic_global_atomic_umax:1148case nir_intrinsic_global_atomic_and:1149case nir_intrinsic_global_atomic_or:1150case nir_intrinsic_global_atomic_xor:1151assert(nir_src_bit_size(instr->src[1]) == 32);11521153bi_emit_atomic_i32_to(b, dst,1154bi_src_index(&instr->src[0]),1155bi_src_index(&instr->src[1]),1156instr->intrinsic);1157break;11581159case nir_intrinsic_image_load:1160bi_emit_image_load(b, instr);1161break;11621163case nir_intrinsic_image_store:1164bi_emit_image_store(b, instr);1165break;11661167case nir_intrinsic_global_atomic_exchange:1168bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),1169&instr->src[1], BI_SEG_NONE);1170break;11711172case nir_intrinsic_image_atomic_exchange:1173bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr),1174&instr->src[3], BI_SEG_NONE);1175break;11761177case nir_intrinsic_shared_atomic_exchange:1178bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),1179&instr->src[1], BI_SEG_WLS);1180break;11811182case nir_intrinsic_global_atomic_comp_swap:1183bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),1184&instr->src[1], &instr->src[2], BI_SEG_NONE);1185break;11861187case 
nir_intrinsic_image_atomic_comp_swap:1188bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr),1189&instr->src[3], &instr->src[4], BI_SEG_NONE);1190break;11911192case nir_intrinsic_shared_atomic_comp_swap:1193bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),1194&instr->src[1], &instr->src[2], BI_SEG_WLS);1195break;11961197case nir_intrinsic_load_frag_coord:1198bi_emit_load_frag_coord(b, instr);1199break;12001201case nir_intrinsic_load_output:1202bi_emit_ld_tile(b, instr);1203break;12041205case nir_intrinsic_discard_if: {1206bi_index src = bi_src_index(&instr->src[0]);1207assert(nir_src_bit_size(instr->src[0]) == 1);1208bi_discard_f32(b, bi_half(src, false), bi_imm_u16(0), BI_CMPF_NE);1209break;1210}12111212case nir_intrinsic_discard:1213bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);1214break;12151216case nir_intrinsic_load_ssbo_address:1217bi_load_sysval_nir(b, instr, 2, 0);1218break;12191220case nir_intrinsic_load_work_dim:1221bi_load_sysval_nir(b, instr, 1, 0);1222break;12231224case nir_intrinsic_load_first_vertex:1225bi_load_sysval_nir(b, instr, 1, 0);1226break;12271228case nir_intrinsic_load_base_vertex:1229bi_load_sysval_nir(b, instr, 1, 4);1230break;12311232case nir_intrinsic_load_base_instance:1233bi_load_sysval_nir(b, instr, 1, 8);1234break;12351236case nir_intrinsic_load_draw_id:1237bi_load_sysval_nir(b, instr, 1, 0);1238break;12391240case nir_intrinsic_get_ssbo_size:1241bi_load_sysval_nir(b, instr, 1, 8);1242break;12431244case nir_intrinsic_load_viewport_scale:1245case nir_intrinsic_load_viewport_offset:1246case nir_intrinsic_load_num_workgroups:1247case nir_intrinsic_load_workgroup_size:1248bi_load_sysval_nir(b, instr, 3, 0);1249break;12501251case nir_intrinsic_image_size:1252bi_load_sysval_nir(b, instr,1253nir_dest_num_components(instr->dest), 0);1254break;12551256case nir_intrinsic_load_sample_positions_pan:1257bi_mov_i32_to(b, bi_word(dst, 0),1258bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false));1259bi_mov_i32_to(b, bi_word(dst, 1),1260bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));1261break;12621263case nir_intrinsic_load_sample_mask_in:1264/* r61[0:15] contains the coverage bitmap */1265bi_u16_to_u32_to(b, dst, bi_half(bi_register(61), false));1266break;12671268case nir_intrinsic_load_sample_id: {1269/* r61[16:23] contains the sampleID, mask it out. 
Upper bits1270* seem to read garbage (despite being architecturally defined1271* as zero), so use a 5-bit mask instead of 8-bits */12721273bi_rshift_and_i32_to(b, dst, bi_register(61), bi_imm_u32(0x1f),1274bi_imm_u8(16));1275break;1276}12771278case nir_intrinsic_load_front_face:1279/* r58 == 0 means primitive is front facing */1280bi_icmp_i32_to(b, dst, bi_register(58), bi_zero(), BI_CMPF_EQ,1281BI_RESULT_TYPE_M1);1282break;12831284case nir_intrinsic_load_point_coord:1285bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,1286BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,1287BI_VARYING_NAME_POINT, BI_VECSIZE_V2);1288break;12891290case nir_intrinsic_load_vertex_id_zero_base:1291bi_mov_i32_to(b, dst, bi_register(61));1292break;12931294case nir_intrinsic_load_instance_id:1295bi_mov_i32_to(b, dst, bi_register(62));1296break;12971298case nir_intrinsic_load_subgroup_invocation:1299bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));1300break;13011302case nir_intrinsic_load_local_invocation_id:1303for (unsigned i = 0; i < 3; ++i)1304bi_u16_to_u32_to(b, bi_word(dst, i),1305bi_half(bi_register(55 + i / 2), i % 2));1306break;13071308case nir_intrinsic_load_workgroup_id:1309for (unsigned i = 0; i < 3; ++i)1310bi_mov_i32_to(b, bi_word(dst, i), bi_register(57 + i));1311break;13121313case nir_intrinsic_load_global_invocation_id:1314case nir_intrinsic_load_global_invocation_id_zero_base:1315for (unsigned i = 0; i < 3; ++i)1316bi_mov_i32_to(b, bi_word(dst, i), bi_register(60 + i));1317break;13181319case nir_intrinsic_shader_clock:1320bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);1321break;13221323default:1324fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);1325assert(0);1326}1327}13281329static void1330bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)1331{1332/* Make sure we've been lowered */1333assert(instr->def.num_components <= (32 / instr->def.bit_size));13341335/* Accumulate all the channels of the constant, as if we did an1336* implicit SEL over them */1337uint32_t acc = 0;13381339for (unsigned i = 0; i < instr->def.num_components; ++i) {1340unsigned v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size);1341acc |= (v << (i * instr->def.bit_size));1342}13431344bi_mov_i32_to(b, bi_get_index(instr->def.index, false, 0), bi_imm_u32(acc));1345}13461347static bi_index1348bi_alu_src_index(nir_alu_src src, unsigned comps)1349{1350/* we don't lower modifiers until the backend */1351assert(!(src.negate || src.abs));13521353unsigned bitsize = nir_src_bit_size(src.src);13541355/* TODO: Do we need to do something more clever with 1-bit bools? */1356if (bitsize == 1)1357bitsize = 16;13581359/* the bi_index carries the 32-bit (word) offset separate from the1360* subword swizzle, first handle the offset */13611362unsigned offset = 0;13631364assert(bitsize == 8 || bitsize == 16 || bitsize == 32);1365unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;13661367for (unsigned i = 0; i < comps; ++i) {1368unsigned new_offset = (src.swizzle[i] >> subword_shift);13691370if (i > 0)1371assert(offset == new_offset && "wrong vectorization");13721373offset = new_offset;1374}13751376bi_index idx = bi_word(bi_src_index(&src.src), offset);13771378/* Compose the subword swizzle with existing (identity) swizzle */1379assert(idx.swizzle == BI_SWIZZLE_H01);13801381/* Bigger vectors should have been lowered */1382assert(comps <= (1 << subword_shift));13831384if (bitsize == 16) {1385unsigned c0 = src.swizzle[0] & 1;1386unsigned c1 = (comps > 1) ? 
src.swizzle[1] & 1 : c0;1387idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);1388} else if (bitsize == 8) {1389/* 8-bit vectors not yet supported */1390assert(comps == 1 && "8-bit vectors not supported");1391assert(src.swizzle[0] == 0 && "8-bit vectors not supported");1392idx.swizzle = BI_SWIZZLE_B0000;1393}13941395return idx;1396}13971398static enum bi_round1399bi_nir_round(nir_op op)1400{1401switch (op) {1402case nir_op_fround_even: return BI_ROUND_NONE;1403case nir_op_ftrunc: return BI_ROUND_RTZ;1404case nir_op_fceil: return BI_ROUND_RTP;1405case nir_op_ffloor: return BI_ROUND_RTN;1406default: unreachable("invalid nir round op");1407}1408}14091410/* Convenience for lowered transcendentals */14111412static bi_index1413bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)1414{1415return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f), BI_ROUND_NONE);1416}14171418/* Approximate with FRCP_APPROX.f32 and apply a single iteration of1419* Newton-Raphson to improve precision */14201421static void1422bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)1423{1424bi_index x1 = bi_frcp_approx_f32(b, s0);1425bi_index m = bi_frexpm_f32(b, s0, false, false);1426bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false);1427bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0),1428bi_zero(), BI_ROUND_NONE, BI_SPECIAL_N);1429bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e,1430BI_ROUND_NONE, BI_SPECIAL_NONE);1431}14321433static void1434bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)1435{1436bi_index x1 = bi_frsq_approx_f32(b, s0);1437bi_index m = bi_frexpm_f32(b, s0, false, true);1438bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true);1439bi_index t1 = bi_fmul_f32(b, x1, x1);1440bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),1441bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_N);1442bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e,1443BI_ROUND_NONE, BI_SPECIAL_N);1444}14451446/* More complex transcendentals, see1447* https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc1448* for documentation */14491450static void1451bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)1452{1453bi_index t1 = bi_temp(b->shader);1454bi_instr *t1_instr = bi_fadd_f32_to(b, t1,1455s0, bi_imm_u32(0x49400000), BI_ROUND_NONE);1456t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;14571458bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000), BI_ROUND_NONE);14591460bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader),1461s0, bi_neg(t2), BI_ROUND_NONE);1462a2->clamp = BI_CLAMP_CLAMP_M1_1;14631464bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);1465bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);1466bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));1467bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),1468bi_imm_u32(0x3e75fffa), BI_ROUND_NONE);1469bi_index p2 = bi_fma_f32(b, p1, a2->dest[0],1470bi_imm_u32(0x3f317218), BI_ROUND_NONE);1471bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);1472bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader),1473p3, a1t, a1t, a1i, BI_ROUND_NONE, BI_SPECIAL_NONE);1474x->clamp = BI_CLAMP_CLAMP_0_INF;14751476bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);1477max->sem = BI_SEM_NAN_PROPAGATE;1478}14791480static void1481bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)1482{1483/* Scale by base, Multiply by 2*24 and convert to integer to get a 8:241484* fixed-point input */1485bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),1486bi_imm_u32(24), BI_ROUND_NONE, 
BI_SPECIAL_NONE);1487bi_index fixed_pt = bi_f32_to_s32(b, scale, BI_ROUND_NONE);14881489/* Compute the result for the fixed-point input, but pass along1490* the floating-point scale for correct NaN propagation */1491bi_fexp_f32_to(b, dst, fixed_pt, scale);1492}14931494static void1495bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)1496{1497/* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */1498bi_index a1 = bi_frexpm_f32(b, s0, true, false);1499bi_index ei = bi_frexpe_f32(b, s0, true, false);1500bi_index ef = bi_s32_to_f32(b, ei, BI_ROUND_RTZ);15011502/* xt estimates -log(r1), a coarse approximation of log(a1) */1503bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);1504bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);15051506/* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -1507* log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),1508* and then log(s0) = x1 + x2 */1509bi_index x1 = bi_fadd_f32(b, ef, xt, BI_ROUND_NONE);15101511/* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by1512* polynomial approximation around 1. The series is expressed around1513* 1, so set y = (a1 * r1) - 1.0 */1514bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0), BI_ROUND_NONE);15151516/* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate1517* log_e(1 + y) by the Taylor series (lower precision than the blob):1518* y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */1519bi_index loge = bi_fmul_f32(b, y,1520bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0), BI_ROUND_NONE));15211522bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));15231524/* log(s0) = x1 + x2 */1525bi_fadd_f32_to(b, dst, x1, x2, BI_ROUND_NONE);1526}15271528static void1529bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)1530{1531bi_index frexp = bi_frexpe_f32(b, s0, true, false);1532bi_index frexpi = bi_s32_to_f32(b, frexp, BI_ROUND_RTZ);1533bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);1534bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi,1535BI_ROUND_NONE);1536}15371538static void1539bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)1540{1541bi_index log2_base = bi_null();15421543if (base.type == BI_INDEX_CONSTANT) {1544log2_base = bi_imm_f32(log2f(uif(base.value)));1545} else {1546log2_base = bi_temp(b->shader);1547bi_lower_flog2_32(b, log2_base, base);1548}15491550return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));1551}15521553static void1554bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)1555{1556bi_index log2_base = bi_null();15571558if (base.type == BI_INDEX_CONSTANT) {1559log2_base = bi_imm_f32(log2f(uif(base.value)));1560} else {1561log2_base = bi_temp(b->shader);1562bi_flog2_32(b, log2_base, base);1563}15641565return bi_fexp_32(b, dst, exp, log2_base);1566}15671568/* Bifrost has extremely coarse tables for approximating sin/cos, accessible as1569* FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and1570* calculates the results. 
We use them to calculate sin/cos via a Taylor1571* approximation:1572*1573* f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)1574* sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)1575* cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)1576*/15771578#define TWO_OVER_PI bi_imm_f32(2.0f / 3.14159f)1579#define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)1580#define SINCOS_BIAS bi_imm_u32(0x49400000)15811582static void1583bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)1584{1585/* bottom 6-bits of result times pi/32 approximately s0 mod 2pi */1586bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS, BI_ROUND_NONE);15871588/* Approximate domain error (small) */1589bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS),1590BI_ROUND_NONE),1591MPI_OVER_TWO, s0, BI_ROUND_NONE);15921593/* Lookup sin(x), cos(x) */1594bi_index sinx = bi_fsin_table_u6(b, x_u6, false);1595bi_index cosx = bi_fcos_table_u6(b, x_u6, false);15961597/* e^2 / 2 */1598bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(),1599bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_NONE);16001601/* (-e^2)/2 f''(x) */1602bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2),1603cos ? cosx : sinx,1604bi_negzero(), BI_ROUND_NONE);16051606/* e f'(x) - (e^2/2) f''(x) */1607bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,1608cos ? bi_neg(sinx) : cosx,1609quadratic, BI_ROUND_NONE);1610I->clamp = BI_CLAMP_CLAMP_M1_1;16111612/* f(x) + e f'(x) - (e^2/2) f''(x) */1613bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx, BI_ROUND_NONE);1614}16151616static bi_instr *1617bi_emit_alu_bool(bi_builder *b, unsigned sz, nir_op op,1618bi_index dst, bi_index s0, bi_index s1, bi_index s2)1619{1620/* Handle 1-bit bools as 0/~0 by default and let the optimizer deal1621* with the bit patterns later. 0/~0 has the nice property of being1622* independent of replicated vectorization. 
*/1623if (sz == 1) sz = 16;1624bi_index f = bi_zero();1625bi_index t = bi_imm_u16(0xFFFF);16261627switch (op) {1628case nir_op_feq:1629return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1);1630case nir_op_flt:1631return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);1632case nir_op_fge:1633return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);1634case nir_op_fneu:1635return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1);16361637case nir_op_ieq:1638return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1);1639case nir_op_ine:1640return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1);1641case nir_op_ilt:1642return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);1643case nir_op_ige:1644return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);1645case nir_op_ult:1646return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);1647case nir_op_uge:1648return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);16491650case nir_op_iand:1651return bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));1652case nir_op_ior:1653return bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));1654case nir_op_ixor:1655return bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));1656case nir_op_inot:1657return bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));16581659case nir_op_f2b1:1660return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);1661case nir_op_i2b1:1662return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);1663case nir_op_b2b1:1664return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);16651666case nir_op_bcsel:1667return bi_csel_to(b, nir_type_int, sz, dst, s0, f, s1, s2, BI_CMPF_NE);16681669default:1670fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);1671unreachable("Unhandled boolean ALU instruction");1672}1673}16741675static void1676bi_emit_alu(bi_builder *b, nir_alu_instr *instr)1677{1678bi_index dst = bi_dest_index(&instr->dest.dest);1679unsigned srcs = nir_op_infos[instr->op].num_inputs;1680unsigned sz = nir_dest_bit_size(instr->dest.dest);1681unsigned comps = nir_dest_num_components(instr->dest.dest);1682unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;1683unsigned src1_sz = srcs > 1 ? nir_src_bit_size(instr->src[1].src) : 0;1684bool is_bool = (sz == 1);16851686/* TODO: Anything else? */1687if (sz == 1)1688sz = 16;16891690/* Indicate scalarness */1691if (sz == 16 && comps == 1)1692dst.swizzle = BI_SWIZZLE_H00;16931694if (!instr->dest.dest.is_ssa) {1695for (unsigned i = 0; i < comps; ++i)1696assert(instr->dest.write_mask);1697}16981699/* First, match against the various moves in NIR. These are1700* special-cased because they can operate on vectors even after1701* lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the1702* instruction is no "bigger" than SIMD-within-a-register. These moves1703* are the exceptions that need to handle swizzles specially. */17041705switch (instr->op) {1706case nir_op_pack_32_2x16:1707case nir_op_vec2:1708case nir_op_vec3:1709case nir_op_vec4: {1710bi_index unoffset_srcs[4] = {1711srcs > 0 ? bi_src_index(&instr->src[0].src) : bi_null(),1712srcs > 1 ? bi_src_index(&instr->src[1].src) : bi_null(),1713srcs > 2 ? bi_src_index(&instr->src[2].src) : bi_null(),1714srcs > 3 ? 
bi_src_index(&instr->src[3].src) : bi_null(),1715};17161717unsigned channels[4] = {1718instr->src[0].swizzle[0],1719instr->src[1].swizzle[0],1720srcs > 2 ? instr->src[2].swizzle[0] : 0,1721srcs > 3 ? instr->src[3].swizzle[0] : 0,1722};17231724bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);1725return;1726}17271728case nir_op_vec8:1729case nir_op_vec16:1730unreachable("should've been lowered");17311732case nir_op_unpack_32_2x16:1733case nir_op_unpack_64_2x32_split_x:1734bi_mov_i32_to(b, dst, bi_src_index(&instr->src[0].src));1735return;17361737case nir_op_unpack_64_2x32_split_y:1738bi_mov_i32_to(b, dst, bi_word(bi_src_index(&instr->src[0].src), 1));1739return;17401741case nir_op_pack_64_2x32_split:1742bi_mov_i32_to(b, bi_word(dst, 0), bi_src_index(&instr->src[0].src));1743bi_mov_i32_to(b, bi_word(dst, 1), bi_src_index(&instr->src[1].src));1744return;17451746case nir_op_pack_64_2x32:1747bi_mov_i32_to(b, bi_word(dst, 0), bi_word(bi_src_index(&instr->src[0].src), 0));1748bi_mov_i32_to(b, bi_word(dst, 1), bi_word(bi_src_index(&instr->src[0].src), 1));1749return;17501751case nir_op_mov: {1752bi_index idx = bi_src_index(&instr->src[0].src);1753bi_index unoffset_srcs[4] = { idx, idx, idx, idx };17541755unsigned channels[4] = {1756comps > 0 ? instr->src[0].swizzle[0] : 0,1757comps > 1 ? instr->src[0].swizzle[1] : 0,1758comps > 2 ? instr->src[0].swizzle[2] : 0,1759comps > 3 ? instr->src[0].swizzle[3] : 0,1760};17611762if (sz == 1) sz = 16;1763bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, sz);1764return;1765}17661767case nir_op_f2f16:1768assert(src_sz == 32);1769bi_index idx = bi_src_index(&instr->src[0].src);1770bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]);1771bi_index s1 = comps > 1 ?1772bi_word(idx, instr->src[0].swizzle[1]) : s0;17731774bi_v2f32_to_v2f16_to(b, dst, s0, s1, BI_ROUND_NONE);1775return;17761777/* Vectorized downcasts */1778case nir_op_u2u16:1779case nir_op_i2i16: {1780if (!(src_sz == 32 && comps == 2))1781break;17821783bi_index idx = bi_src_index(&instr->src[0].src);1784bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]);1785bi_index s1 = bi_word(idx, instr->src[0].swizzle[1]);17861787bi_mkvec_v2i16_to(b, dst,1788bi_half(s0, false), bi_half(s1, false));1789return;1790}17911792case nir_op_i2i8:1793case nir_op_u2u8:1794{1795/* Acts like an 8-bit swizzle */1796bi_index idx = bi_src_index(&instr->src[0].src);1797unsigned factor = src_sz / 8;1798unsigned chan[4] = { 0 };17991800for (unsigned i = 0; i < comps; ++i)1801chan[i] = instr->src[0].swizzle[i] * factor;18021803bi_make_vec_to(b, dst, &idx, chan, comps, 8);1804return;1805}18061807default:1808break;1809}18101811bi_index s0 = srcs > 0 ? bi_alu_src_index(instr->src[0], comps) : bi_null();1812bi_index s1 = srcs > 1 ? bi_alu_src_index(instr->src[1], comps) : bi_null();1813bi_index s2 = srcs > 2 ? 
bi_alu_src_index(instr->src[2], comps) : bi_null();18141815if (is_bool) {1816bi_emit_alu_bool(b, src_sz, instr->op, dst, s0, s1, s2);1817return;1818}18191820switch (instr->op) {1821case nir_op_ffma:1822bi_fma_to(b, sz, dst, s0, s1, s2, BI_ROUND_NONE);1823break;18241825case nir_op_fmul:1826bi_fma_to(b, sz, dst, s0, s1, bi_negzero(), BI_ROUND_NONE);1827break;18281829case nir_op_fsub:1830s1 = bi_neg(s1);1831FALLTHROUGH;1832case nir_op_fadd:1833bi_fadd_to(b, sz, dst, s0, s1, BI_ROUND_NONE);1834break;18351836case nir_op_fsat: {1837bi_instr *I = bi_fadd_to(b, sz, dst, s0, bi_negzero(), BI_ROUND_NONE);1838I->clamp = BI_CLAMP_CLAMP_0_1;1839break;1840}18411842case nir_op_fsat_signed_mali: {1843bi_instr *I = bi_fadd_to(b, sz, dst, s0, bi_negzero(), BI_ROUND_NONE);1844I->clamp = BI_CLAMP_CLAMP_M1_1;1845break;1846}18471848case nir_op_fclamp_pos_mali: {1849bi_instr *I = bi_fadd_to(b, sz, dst, s0, bi_negzero(), BI_ROUND_NONE);1850I->clamp = BI_CLAMP_CLAMP_0_INF;1851break;1852}18531854case nir_op_fneg:1855bi_fadd_to(b, sz, dst, bi_neg(s0), bi_negzero(), BI_ROUND_NONE);1856break;18571858case nir_op_fabs:1859bi_fadd_to(b, sz, dst, bi_abs(s0), bi_negzero(), BI_ROUND_NONE);1860break;18611862case nir_op_fsin:1863bi_lower_fsincos_32(b, dst, s0, false);1864break;18651866case nir_op_fcos:1867bi_lower_fsincos_32(b, dst, s0, true);1868break;18691870case nir_op_fexp2:1871assert(sz == 32); /* should've been lowered */18721873if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)1874bi_lower_fexp2_32(b, dst, s0);1875else1876bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));18771878break;18791880case nir_op_flog2:1881assert(sz == 32); /* should've been lowered */18821883if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)1884bi_lower_flog2_32(b, dst, s0);1885else1886bi_flog2_32(b, dst, s0);18871888break;18891890case nir_op_fpow:1891assert(sz == 32); /* should've been lowered */18921893if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)1894bi_lower_fpow_32(b, dst, s0, s1);1895else1896bi_fpow_32(b, dst, s0, s1);18971898break;18991900case nir_op_bcsel:1901if (src1_sz == 8)1902bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);1903else1904bi_csel_to(b, nir_type_int, src1_sz,1905dst, s0, bi_zero(), s1, s2, BI_CMPF_NE);1906break;19071908case nir_op_ishl:1909bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));1910break;1911case nir_op_ushr:1912bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));1913break;19141915case nir_op_ishr:1916bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));1917break;19181919case nir_op_imin:1920case nir_op_umin:1921bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,1922s0, s1, s0, s1, BI_CMPF_LT);1923break;19241925case nir_op_imax:1926case nir_op_umax:1927bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,1928s0, s1, s0, s1, BI_CMPF_GT);1929break;19301931case nir_op_fddx:1932case nir_op_fddy: {1933bi_index lane1 = bi_lshift_and_i32(b,1934bi_fau(BIR_FAU_LANE_ID, false),1935bi_imm_u32(instr->op == nir_op_fddx ? 2 : 1),1936bi_imm_u8(0));19371938bi_index lane2 = bi_iadd_u32(b, lane1,1939bi_imm_u32(instr->op == nir_op_fddx ? 
1 : 2),1940false);19411942bi_index left, right;19431944if (b->shader->arch == 6) {1945left = bi_clper_v6_i32(b, s0, lane1);1946right = bi_clper_v6_i32(b, s0, lane2);1947} else {1948left = bi_clper_v7_i32(b, s0, lane1,1949BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,1950BI_SUBGROUP_SUBGROUP4);19511952right = bi_clper_v7_i32(b, s0, lane2,1953BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,1954BI_SUBGROUP_SUBGROUP4);1955}19561957bi_fadd_to(b, sz, dst, right, bi_neg(left), BI_ROUND_NONE);1958break;1959}19601961case nir_op_f2f32:1962bi_f16_to_f32_to(b, dst, s0);1963break;19641965case nir_op_f2i32:1966if (src_sz == 32)1967bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);1968else1969bi_f16_to_s32_to(b, dst, s0, BI_ROUND_RTZ);1970break;19711972/* Note 32-bit sources => no vectorization, so 32-bit works */1973case nir_op_f2u16:1974if (src_sz == 32)1975bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);1976else1977bi_v2f16_to_v2u16_to(b, dst, s0, BI_ROUND_RTZ);1978break;19791980case nir_op_f2i16:1981if (src_sz == 32)1982bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);1983else1984bi_v2f16_to_v2s16_to(b, dst, s0, BI_ROUND_RTZ);1985break;19861987case nir_op_f2u32:1988if (src_sz == 32)1989bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);1990else1991bi_f16_to_u32_to(b, dst, s0, BI_ROUND_RTZ);1992break;19931994case nir_op_u2f16:1995if (src_sz == 32)1996bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);1997else if (src_sz == 16)1998bi_v2u16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);1999else if (src_sz == 8)2000bi_v2u8_to_v2f16_to(b, dst, s0);2001break;20022003case nir_op_u2f32:2004if (src_sz == 32)2005bi_u32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);2006else if (src_sz == 16)2007bi_u16_to_f32_to(b, dst, s0);2008else2009bi_u8_to_f32_to(b, dst, s0);2010break;20112012case nir_op_i2f16:2013if (src_sz == 32)2014bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);2015else if (src_sz == 16)2016bi_v2s16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);2017else if (src_sz == 8)2018bi_v2s8_to_v2f16_to(b, dst, s0);2019break;20202021case nir_op_i2f32:2022if (src_sz == 32)2023bi_s32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);2024else if (src_sz == 16)2025bi_s16_to_f32_to(b, dst, s0);2026else if (src_sz == 8)2027bi_s8_to_f32_to(b, dst, s0);2028break;20292030case nir_op_i2i32:2031if (src_sz == 16)2032bi_s16_to_s32_to(b, dst, s0);2033else2034bi_s8_to_s32_to(b, dst, s0);2035break;20362037case nir_op_u2u32:2038if (src_sz == 16)2039bi_u16_to_u32_to(b, dst, s0);2040else2041bi_u8_to_u32_to(b, dst, s0);2042break;20432044case nir_op_i2i16:2045assert(src_sz == 8 || src_sz == 32);20462047if (src_sz == 8)2048bi_v2s8_to_v2s16_to(b, dst, s0);2049else2050bi_mov_i32_to(b, dst, s0);2051break;20522053case nir_op_u2u16:2054assert(src_sz == 8 || src_sz == 32);20552056if (src_sz == 8)2057bi_v2u8_to_v2u16_to(b, dst, s0);2058else2059bi_mov_i32_to(b, dst, s0);2060break;20612062case nir_op_b2f16:2063case nir_op_b2f32:2064bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(),2065(sz == 16) ? bi_imm_f16(1.0) : bi_imm_f32(1.0),2066(sz == 16) ? 
bi_imm_f16(0.0) : bi_imm_f32(0.0),2067BI_CMPF_NE);2068break;20692070case nir_op_b2b32:2071bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(),2072bi_imm_u32(~0), bi_zero(), BI_CMPF_NE);2073break;20742075case nir_op_b2i8:2076case nir_op_b2i16:2077case nir_op_b2i32:2078bi_lshift_and_to(b, sz, dst, s0, bi_imm_uintN(1, sz), bi_imm_u8(0));2079break;20802081case nir_op_fround_even:2082case nir_op_fceil:2083case nir_op_ffloor:2084case nir_op_ftrunc:2085bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));2086break;20872088case nir_op_fmin:2089bi_fmin_to(b, sz, dst, s0, s1);2090break;20912092case nir_op_fmax:2093bi_fmax_to(b, sz, dst, s0, s1);2094break;20952096case nir_op_iadd:2097bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);2098break;20992100case nir_op_iadd_sat:2101bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);2102break;21032104case nir_op_uadd_sat:2105bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);2106break;21072108case nir_op_ihadd:2109bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);2110break;21112112case nir_op_irhadd:2113bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);2114break;21152116case nir_op_ineg:2117bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);2118break;21192120case nir_op_isub:2121bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);2122break;21232124case nir_op_isub_sat:2125bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);2126break;21272128case nir_op_usub_sat:2129bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);2130break;21312132case nir_op_imul:2133bi_imul_to(b, sz, dst, s0, s1);2134break;21352136case nir_op_iabs:2137bi_iabs_to(b, sz, dst, s0);2138break;21392140case nir_op_iand:2141bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));2142break;21432144case nir_op_ior:2145bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));2146break;21472148case nir_op_ixor:2149bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));2150break;21512152case nir_op_inot:2153bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));2154break;21552156case nir_op_frsq:2157if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)2158bi_lower_frsq_32(b, dst, s0);2159else2160bi_frsq_to(b, sz, dst, s0);2161break;21622163case nir_op_frcp:2164if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)2165bi_lower_frcp_32(b, dst, s0);2166else2167bi_frcp_to(b, sz, dst, s0);2168break;21692170case nir_op_uclz:2171bi_clz_to(b, sz, dst, s0, false);2172break;21732174case nir_op_bit_count:2175bi_popcount_i32_to(b, dst, s0);2176break;21772178case nir_op_bitfield_reverse:2179bi_bitrev_i32_to(b, dst, s0);2180break;21812182case nir_op_ufind_msb: {2183bi_index clz = bi_clz(b, src_sz, s0, false);21842185if (sz == 8)2186clz = bi_byte(clz, 0);2187else if (sz == 16)2188clz = bi_half(clz, false);21892190bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);2191break;2192}21932194default:2195fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);2196unreachable("Unknown ALU op");2197}2198}21992200/* Returns dimension with 0 special casing cubemaps. 
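* The value feeds the dimension field of the texture operation descriptor
* built in bi_emit_texc.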
Shamelessly copied from Midgard */2201static unsigned2202bifrost_tex_format(enum glsl_sampler_dim dim)2203{2204switch (dim) {2205case GLSL_SAMPLER_DIM_1D:2206case GLSL_SAMPLER_DIM_BUF:2207return 1;22082209case GLSL_SAMPLER_DIM_2D:2210case GLSL_SAMPLER_DIM_MS:2211case GLSL_SAMPLER_DIM_EXTERNAL:2212case GLSL_SAMPLER_DIM_RECT:2213return 2;22142215case GLSL_SAMPLER_DIM_3D:2216return 3;22172218case GLSL_SAMPLER_DIM_CUBE:2219return 0;22202221default:2222DBG("Unknown sampler dim type\n");2223assert(0);2224return 0;2225}2226}22272228static enum bifrost_texture_format_full2229bi_texture_format(nir_alu_type T, enum bi_clamp clamp)2230{2231switch (T) {2232case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp;2233case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp;2234case nir_type_uint16: return BIFROST_TEXTURE_FORMAT_U16;2235case nir_type_int16: return BIFROST_TEXTURE_FORMAT_S16;2236case nir_type_uint32: return BIFROST_TEXTURE_FORMAT_U32;2237case nir_type_int32: return BIFROST_TEXTURE_FORMAT_S32;2238default: unreachable("Invalid type for texturing");2239}2240}22412242/* Array indices are specified as 32-bit uints, need to convert. In .z component from NIR */2243static bi_index2244bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)2245{2246/* For (u)int we can just passthrough */2247nir_alu_type base = nir_alu_type_get_base_type(T);2248if (base == nir_type_int || base == nir_type_uint)2249return idx;22502251/* Otherwise we convert */2252assert(T == nir_type_float32);22532254/* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and2255* Texel Selection") defines the layer to be taken from clamp(RNE(r),2256* 0, dt - 1). So we use round RTE, clamping is handled at the data2257* structure level */22582259return bi_f32_to_u32(b, idx, BI_ROUND_NONE);2260}22612262/* TEXC's explicit and bias LOD modes requires the LOD to be transformed to a2263* 16-bit 8:8 fixed-point format. We lower as:2264*2265* F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =2266* MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)2267*/22682269static bi_index2270bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)2271{2272/* Sort of arbitrary. Must be less than 128.0, greater than or equal to2273* the max LOD (16 since we cap at 2^16 texture dimensions), and2274* preferably small to minimize precision loss */2275const float max_lod = 16.0;22762277bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader),2278fp16 ? bi_half(lod, false) : lod,2279bi_imm_f32(1.0f / max_lod), bi_negzero(), BI_ROUND_NONE);22802281fsat->clamp = BI_CLAMP_CLAMP_M1_1;22822283bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f),2284bi_negzero(), BI_ROUND_NONE);22852286return bi_mkvec_v2i16(b,2287bi_half(bi_f32_to_s32(b, fmul, BI_ROUND_RTZ), false),2288bi_imm_u16(0));2289}22902291/* FETCH takes a 32-bit staging register containing the LOD as an integer in2292* the bottom 16-bits and (if present) the cube face index in the top 16-bits.2293* TODO: Cube face.2294*/22952296static bi_index2297bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)2298{2299return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));2300}23012302/* The hardware specifies texel offsets and multisample indices together as a2303* u8vec4 <offset, ms index>. By default all are zero, so if have either a2304* nonzero texel offset or a nonzero multisample index, we build a u8vec4 with2305* the bits we need and return that to be passed as a staging register. 
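* As a sketch of the layout (values invented for illustration): a texel
* offset of <1, -2, 0> with sample index 3 becomes the bytes
* <0x01, 0xFE, 0x00, 0x03>, i.e. offset.xyz in bytes 0-2 and the sample
* index OR'd into byte 3 by the LSHIFT_OR below.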
Else we2306* return 0 to avoid allocating a data register when everything is zero. */23072308static bi_index2309bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)2310{2311bi_index dest = bi_zero();23122313int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);2314if (offs_idx >= 0 &&2315(!nir_src_is_const(instr->src[offs_idx].src) ||2316nir_src_as_uint(instr->src[offs_idx].src) != 0)) {2317unsigned nr = nir_src_num_components(instr->src[offs_idx].src);2318bi_index idx = bi_src_index(&instr->src[offs_idx].src);2319dest = bi_mkvec_v4i8(b,2320(nr > 0) ? bi_byte(bi_word(idx, 0), 0) : bi_imm_u8(0),2321(nr > 1) ? bi_byte(bi_word(idx, 1), 0) : bi_imm_u8(0),2322(nr > 2) ? bi_byte(bi_word(idx, 2), 0) : bi_imm_u8(0),2323bi_imm_u8(0));2324}23252326int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);2327if (ms_idx >= 0 &&2328(!nir_src_is_const(instr->src[ms_idx].src) ||2329nir_src_as_uint(instr->src[ms_idx].src) != 0)) {2330dest = bi_lshift_or_i32(b,2331bi_src_index(&instr->src[ms_idx].src), dest,2332bi_imm_u8(24));2333}23342335return dest;2336}23372338static void2339bi_emit_cube_coord(bi_builder *b, bi_index coord,2340bi_index *face, bi_index *s, bi_index *t)2341{2342/* Compute max { |x|, |y|, |z| } */2343bi_instr *cubeface = bi_cubeface_to(b, bi_temp(b->shader),2344bi_temp(b->shader), coord,2345bi_word(coord, 1), bi_word(coord, 2));23462347/* Select coordinates */23482349bi_index ssel = bi_cube_ssel(b, bi_word(coord, 2), coord,2350cubeface->dest[1]);23512352bi_index tsel = bi_cube_tsel(b, bi_word(coord, 1), bi_word(coord, 2),2353cubeface->dest[1]);23542355/* The OpenGL ES specification requires us to transform an input vector2356* (x, y, z) to the coordinate, given the selected S/T:2357*2358* (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))2359*2360* We implement (s shown, t similar) in a form friendlier to FMA2361* instructions, and clamp coordinates at the end for correct2362* NaN/infinity handling:2363*2364* fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)2365*2366* Take the reciprocal of max{x, y, z}2367*/23682369bi_index rcp = bi_frcp_f32(b, cubeface->dest[0]);23702371/* Calculate 0.5 * (1.0 / max{x, y, z}) */2372bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero(),2373BI_ROUND_NONE);23742375/* Transform the coordinates */2376*s = bi_temp(b->shader);2377*t = bi_temp(b->shader);23782379bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f),2380BI_ROUND_NONE);2381bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f),2382BI_ROUND_NONE);23832384S->clamp = BI_CLAMP_CLAMP_0_1;2385T->clamp = BI_CLAMP_CLAMP_0_1;23862387/* Face index at bit[29:31], matching the cube map descriptor */2388*face = cubeface->dest[1];2389}23902391/* Emits a cube map descriptor, returning lower 32-bits and putting upper2392* 32-bits in passed pointer t. 
The packing of the face with the S coordinate2393* exploits the redundancy of floating points with the range restriction of2394* CUBEFACE output.2395*2396* struct cube_map_descriptor {2397* float s : 29;2398* unsigned face : 3;2399* float t : 32;2400* }2401*2402* Since the cube face index is preshifted, this is easy to pack with a bitwise2403* MUX.i32 and a fixed mask, selecting the lower bits 29 from s and the upper 32404* bits from face.2405*/24062407static bi_index2408bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)2409{2410bi_index face, s;2411bi_emit_cube_coord(b, coord, &face, &s, t);2412bi_index mask = bi_imm_u32(BITFIELD_MASK(29));2413return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);2414}24152416/* Map to the main texture op used. Some of these (txd in particular) will2417* lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in2418* sequence). We assume that lowering is handled elsewhere.2419*/24202421static enum bifrost_tex_op2422bi_tex_op(nir_texop op)2423{2424switch (op) {2425case nir_texop_tex:2426case nir_texop_txb:2427case nir_texop_txl:2428case nir_texop_txd:2429case nir_texop_tex_prefetch:2430return BIFROST_TEX_OP_TEX;2431case nir_texop_txf:2432case nir_texop_txf_ms:2433case nir_texop_txf_ms_fb:2434case nir_texop_txf_ms_mcs:2435case nir_texop_tg4:2436return BIFROST_TEX_OP_FETCH;2437case nir_texop_txs:2438case nir_texop_lod:2439case nir_texop_query_levels:2440case nir_texop_texture_samples:2441case nir_texop_samples_identical:2442unreachable("should've been lowered");2443default:2444unreachable("unsupported tex op");2445}2446}24472448/* Data registers required by texturing in the order they appear. All are2449* optional, the texture operation descriptor determines which are present.2450* Note since 3D arrays are not permitted at an API level, Z_COORD and2451* ARRAY/SHADOW are exlusive, so TEXC in practice reads at most 8 registers */24522453enum bifrost_tex_dreg {2454BIFROST_TEX_DREG_Z_COORD = 0,2455BIFROST_TEX_DREG_Y_DELTAS = 1,2456BIFROST_TEX_DREG_LOD = 2,2457BIFROST_TEX_DREG_GRDESC_HI = 3,2458BIFROST_TEX_DREG_SHADOW = 4,2459BIFROST_TEX_DREG_ARRAY = 5,2460BIFROST_TEX_DREG_OFFSETMS = 6,2461BIFROST_TEX_DREG_SAMPLER = 7,2462BIFROST_TEX_DREG_TEXTURE = 8,2463BIFROST_TEX_DREG_COUNT,2464};24652466static void2467bi_emit_texc(bi_builder *b, nir_tex_instr *instr)2468{2469bool computed_lod = false;24702471struct bifrost_texture_operation desc = {2472.op = bi_tex_op(instr->op),2473.offset_or_bias_disable = false, /* TODO */2474.shadow_or_clamp_disable = instr->is_shadow,2475.array = instr->is_array,2476.dimension = bifrost_tex_format(instr->sampler_dim),2477.format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */2478.mask = 0xF,2479};24802481switch (desc.op) {2482case BIFROST_TEX_OP_TEX:2483desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;2484computed_lod = true;2485break;2486case BIFROST_TEX_OP_FETCH:2487desc.lod_or_fetch = instr->op == nir_texop_tg4 ?2488BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component :2489BIFROST_TEXTURE_FETCH_TEXEL;2490break;2491default:2492unreachable("texture op unsupported");2493}24942495/* 32-bit indices to be allocated as consecutive staging registers */2496bi_index dregs[BIFROST_TEX_DREG_COUNT] = { };2497bi_index cx = bi_null(), cy = bi_null();24982499for (unsigned i = 0; i < instr->num_srcs; ++i) {2500bi_index index = bi_src_index(&instr->src[i].src);2501unsigned sz = nir_src_bit_size(instr->src[i].src);2502ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, 
i);2503nir_alu_type T = base | sz;25042505switch (instr->src[i].src_type) {2506case nir_tex_src_coord:2507if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {2508cx = bi_emit_texc_cube_coord(b, index, &cy);2509} else {2510unsigned components = nir_src_num_components(instr->src[i].src);25112512/* Copy XY (for 2D+) or XX (for 1D) */2513cx = index;2514cy = bi_word(index, MIN2(1, components - 1));25152516assert(components >= 1 && components <= 3);25172518if (components < 3) {2519/* nothing to do */2520} else if (desc.array) {2521/* 2D array */2522dregs[BIFROST_TEX_DREG_ARRAY] =2523bi_emit_texc_array_index(b,2524bi_word(index, 2), T);2525} else {2526/* 3D */2527dregs[BIFROST_TEX_DREG_Z_COORD] =2528bi_word(index, 2);2529}2530}2531break;25322533case nir_tex_src_lod:2534if (desc.op == BIFROST_TEX_OP_TEX &&2535nir_src_is_const(instr->src[i].src) &&2536nir_src_as_uint(instr->src[i].src) == 0) {2537desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;2538} else if (desc.op == BIFROST_TEX_OP_TEX) {2539assert(base == nir_type_float);25402541assert(sz == 16 || sz == 32);2542dregs[BIFROST_TEX_DREG_LOD] =2543bi_emit_texc_lod_88(b, index, sz == 16);2544desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;2545} else {2546assert(desc.op == BIFROST_TEX_OP_FETCH);2547assert(base == nir_type_uint || base == nir_type_int);2548assert(sz == 16 || sz == 32);25492550dregs[BIFROST_TEX_DREG_LOD] =2551bi_emit_texc_lod_cube(b, index);2552}25532554break;25552556case nir_tex_src_bias:2557/* Upper 16-bits interpreted as a clamp, leave zero */2558assert(desc.op == BIFROST_TEX_OP_TEX);2559assert(base == nir_type_float);2560assert(sz == 16 || sz == 32);2561dregs[BIFROST_TEX_DREG_LOD] =2562bi_emit_texc_lod_88(b, index, sz == 16);2563desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;2564computed_lod = true;2565break;25662567case nir_tex_src_ms_index:2568case nir_tex_src_offset:2569if (desc.offset_or_bias_disable)2570break;25712572dregs[BIFROST_TEX_DREG_OFFSETMS] =2573bi_emit_texc_offset_ms_index(b, instr);2574if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))2575desc.offset_or_bias_disable = true;2576break;25772578case nir_tex_src_comparator:2579dregs[BIFROST_TEX_DREG_SHADOW] = index;2580break;25812582case nir_tex_src_texture_offset:2583assert(instr->texture_index == 0);2584dregs[BIFROST_TEX_DREG_TEXTURE] = index;2585break;25862587case nir_tex_src_sampler_offset:2588assert(instr->sampler_index == 0);2589dregs[BIFROST_TEX_DREG_SAMPLER] = index;2590break;25912592default:2593unreachable("Unhandled src type in texc emit");2594}2595}25962597if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {2598dregs[BIFROST_TEX_DREG_LOD] =2599bi_emit_texc_lod_cube(b, bi_zero());2600}26012602/* Choose an index mode */26032604bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);2605bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);2606bool direct = direct_tex && direct_samp;26072608desc.immediate_indices = direct && (instr->sampler_index < 16);26092610if (desc.immediate_indices) {2611desc.sampler_index_or_mode = instr->sampler_index;2612desc.index = instr->texture_index;2613} else {2614enum bifrost_index mode = 0;26152616if (direct && instr->sampler_index == instr->texture_index) {2617mode = BIFROST_INDEX_IMMEDIATE_SHARED;2618desc.index = instr->texture_index;2619} else if (direct) {2620mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;2621desc.index = instr->sampler_index;2622dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b,2623bi_imm_u32(instr->texture_index));2624} else if (direct_tex) {2625assert(!direct_samp);2626mode = 
BIFROST_INDEX_IMMEDIATE_TEXTURE;2627desc.index = instr->texture_index;2628} else if (direct_samp) {2629assert(!direct_tex);2630mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;2631desc.index = instr->sampler_index;2632} else {2633mode = BIFROST_INDEX_REGISTER;2634}26352636desc.sampler_index_or_mode = mode | (0x3 << 2);2637}26382639/* Allocate staging registers contiguously by compacting the array.2640* Index is not SSA (tied operands) */26412642unsigned sr_count = 0;26432644for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {2645if (!bi_is_null(dregs[i]))2646dregs[sr_count++] = dregs[i];2647}26482649bi_index idx = sr_count ? bi_temp_reg(b->shader) : bi_null();26502651if (sr_count)2652bi_make_vec_to(b, idx, dregs, NULL, sr_count, 32);26532654uint32_t desc_u = 0;2655memcpy(&desc_u, &desc, sizeof(desc_u));2656bi_texc_to(b, sr_count ? idx : bi_dest_index(&instr->dest),2657idx, cx, cy, bi_imm_u32(desc_u), !computed_lod,2658sr_count);26592660/* Explicit copy to facilitate tied operands */2661if (sr_count) {2662bi_index srcs[4] = { idx, idx, idx, idx };2663unsigned channels[4] = { 0, 1, 2, 3 };2664bi_make_vec_to(b, bi_dest_index(&instr->dest), srcs, channels, 4, 32);2665}2666}26672668/* Simple textures ops correspond to NIR tex or txl with LOD = 0 on 2D/cube2669* textures with sufficiently small immediate indices. Anything else2670* needs a complete texture op. */26712672static void2673bi_emit_texs(bi_builder *b, nir_tex_instr *instr)2674{2675int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);2676assert(coord_idx >= 0);2677bi_index coords = bi_src_index(&instr->src[coord_idx].src);26782679if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {2680bi_index face, s, t;2681bi_emit_cube_coord(b, coords, &face, &s, &t);26822683bi_texs_cube_to(b, nir_dest_bit_size(instr->dest),2684bi_dest_index(&instr->dest),2685s, t, face,2686instr->sampler_index, instr->texture_index);2687} else {2688bi_texs_2d_to(b, nir_dest_bit_size(instr->dest),2689bi_dest_index(&instr->dest),2690coords, bi_word(coords, 1),2691instr->op != nir_texop_tex, /* zero LOD */2692instr->sampler_index, instr->texture_index);2693}2694}26952696static bool2697bi_is_simple_tex(nir_tex_instr *instr)2698{2699if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)2700return false;27012702if (instr->dest_type != nir_type_float32 &&2703instr->dest_type != nir_type_float16)2704return false;27052706if (instr->is_shadow || instr->is_array)2707return false;27082709switch (instr->sampler_dim) {2710case GLSL_SAMPLER_DIM_2D:2711case GLSL_SAMPLER_DIM_EXTERNAL:2712case GLSL_SAMPLER_DIM_RECT:2713break;27142715case GLSL_SAMPLER_DIM_CUBE:2716/* LOD can't be specified with TEXS_CUBE */2717if (instr->op == nir_texop_txl)2718return false;2719break;27202721default:2722return false;2723}27242725for (unsigned i = 0; i < instr->num_srcs; ++i) {2726if (instr->src[i].src_type != nir_tex_src_lod &&2727instr->src[i].src_type != nir_tex_src_coord)2728return false;2729}27302731/* Indices need to fit in provided bits */2732unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 
2 : 3;2733if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))2734return false;27352736int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);2737if (lod_idx < 0)2738return true;27392740nir_src lod = instr->src[lod_idx].src;2741return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;2742}27432744static void2745bi_emit_tex(bi_builder *b, nir_tex_instr *instr)2746{2747switch (instr->op) {2748case nir_texop_txs:2749bi_load_sysval_to(b, bi_dest_index(&instr->dest),2750panfrost_sysval_for_instr(&instr->instr, NULL),27514, 0);2752return;2753case nir_texop_tex:2754case nir_texop_txl:2755case nir_texop_txb:2756case nir_texop_txf:2757case nir_texop_txf_ms:2758case nir_texop_tg4:2759break;2760default:2761unreachable("Invalid texture operation");2762}27632764if (bi_is_simple_tex(instr))2765bi_emit_texs(b, instr);2766else2767bi_emit_texc(b, instr);2768}27692770static void2771bi_emit_instr(bi_builder *b, struct nir_instr *instr)2772{2773switch (instr->type) {2774case nir_instr_type_load_const:2775bi_emit_load_const(b, nir_instr_as_load_const(instr));2776break;27772778case nir_instr_type_intrinsic:2779bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));2780break;27812782case nir_instr_type_alu:2783bi_emit_alu(b, nir_instr_as_alu(instr));2784break;27852786case nir_instr_type_tex:2787bi_emit_tex(b, nir_instr_as_tex(instr));2788break;27892790case nir_instr_type_jump:2791bi_emit_jump(b, nir_instr_as_jump(instr));2792break;27932794default:2795unreachable("should've been lowered");2796}2797}27982799static bi_block *2800create_empty_block(bi_context *ctx)2801{2802bi_block *blk = rzalloc(ctx, bi_block);28032804blk->base.predecessors = _mesa_set_create(blk,2805_mesa_hash_pointer,2806_mesa_key_pointer_equal);28072808return blk;2809}28102811static bi_block *2812emit_block(bi_context *ctx, nir_block *block)2813{2814if (ctx->after_block) {2815ctx->current_block = ctx->after_block;2816ctx->after_block = NULL;2817} else {2818ctx->current_block = create_empty_block(ctx);2819}28202821list_addtail(&ctx->current_block->base.link, &ctx->blocks);2822list_inithead(&ctx->current_block->base.instructions);28232824bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));28252826nir_foreach_instr(instr, block) {2827bi_emit_instr(&_b, instr);2828++ctx->instruction_count;2829}28302831return ctx->current_block;2832}28332834static void2835emit_if(bi_context *ctx, nir_if *nif)2836{2837bi_block *before_block = ctx->current_block;28382839/* Speculatively emit the branch, but we can't fill it in until later */2840bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));2841bi_instr *then_branch = bi_branchz_i16(&_b,2842bi_half(bi_src_index(&nif->condition), false),2843bi_zero(), BI_CMPF_EQ);28442845/* Emit the two subblocks. 
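* The then list is emitted first and is entered by fall-through from the
* conditional branch; once we know whether the else list produced any
* instructions, the branch is fixed up below to target either the else
* block or the merge block.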
*/2846bi_block *then_block = emit_cf_list(ctx, &nif->then_list);2847bi_block *end_then_block = ctx->current_block;28482849/* Emit second block, and check if it's empty */28502851int count_in = ctx->instruction_count;2852bi_block *else_block = emit_cf_list(ctx, &nif->else_list);2853bi_block *end_else_block = ctx->current_block;2854ctx->after_block = create_empty_block(ctx);28552856/* Now that we have the subblocks emitted, fix up the branches */28572858assert(then_block);2859assert(else_block);28602861if (ctx->instruction_count == count_in) {2862then_branch->branch_target = ctx->after_block;2863pan_block_add_successor(&end_then_block->base, &ctx->after_block->base); /* fallthrough */2864} else {2865then_branch->branch_target = else_block;28662867/* Emit a jump from the end of the then block to the end of the else */2868_b.cursor = bi_after_block(end_then_block);2869bi_instr *then_exit = bi_jump(&_b, bi_zero());2870then_exit->branch_target = ctx->after_block;28712872pan_block_add_successor(&end_then_block->base, &then_exit->branch_target->base);2873pan_block_add_successor(&end_else_block->base, &ctx->after_block->base); /* fallthrough */2874}28752876pan_block_add_successor(&before_block->base, &then_branch->branch_target->base); /* then_branch */2877pan_block_add_successor(&before_block->base, &then_block->base); /* fallthrough */2878}28792880static void2881emit_loop(bi_context *ctx, nir_loop *nloop)2882{2883/* Remember where we are */2884bi_block *start_block = ctx->current_block;28852886bi_block *saved_break = ctx->break_block;2887bi_block *saved_continue = ctx->continue_block;28882889ctx->continue_block = create_empty_block(ctx);2890ctx->break_block = create_empty_block(ctx);2891ctx->after_block = ctx->continue_block;28922893/* Emit the body itself */2894emit_cf_list(ctx, &nloop->body);28952896/* Branch back to loop back */2897bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));2898bi_instr *I = bi_jump(&_b, bi_zero());2899I->branch_target = ctx->continue_block;2900pan_block_add_successor(&start_block->base, &ctx->continue_block->base);2901pan_block_add_successor(&ctx->current_block->base, &ctx->continue_block->base);29022903ctx->after_block = ctx->break_block;29042905/* Pop off */2906ctx->break_block = saved_break;2907ctx->continue_block = saved_continue;2908++ctx->loop_count;2909}29102911static bi_block *2912emit_cf_list(bi_context *ctx, struct exec_list *list)2913{2914bi_block *start_block = NULL;29152916foreach_list_typed(nir_cf_node, node, node, list) {2917switch (node->type) {2918case nir_cf_node_block: {2919bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));29202921if (!start_block)2922start_block = block;29232924break;2925}29262927case nir_cf_node_if:2928emit_if(ctx, nir_cf_node_as_if(node));2929break;29302931case nir_cf_node_loop:2932emit_loop(ctx, nir_cf_node_as_loop(node));2933break;29342935default:2936unreachable("Unknown control flow");2937}2938}29392940return start_block;2941}29422943/* shader-db stuff */29442945struct bi_stats {2946unsigned nr_clauses, nr_tuples, nr_ins;2947unsigned nr_arith, nr_texture, nr_varying, nr_ldst;2948};29492950static void2951bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)2952{2953/* Count instructions */2954stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 
1 : 0);29552956/* Non-message passing tuples are always arithmetic */2957if (tuple->add != clause->message) {2958stats->nr_arith++;2959return;2960}29612962/* Message + FMA we'll count as arithmetic _and_ message */2963if (tuple->fma)2964stats->nr_arith++;29652966switch (clause->message_type) {2967case BIFROST_MESSAGE_VARYING:2968/* Check components interpolated */2969stats->nr_varying += (clause->message->vecsize + 1) *2970(bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);2971break;29722973case BIFROST_MESSAGE_VARTEX:2974/* 2 coordinates, fp32 each */2975stats->nr_varying += (2 * 2);2976FALLTHROUGH;2977case BIFROST_MESSAGE_TEX:2978stats->nr_texture++;2979break;29802981case BIFROST_MESSAGE_ATTRIBUTE:2982case BIFROST_MESSAGE_LOAD:2983case BIFROST_MESSAGE_STORE:2984case BIFROST_MESSAGE_ATOMIC:2985stats->nr_ldst++;2986break;29872988case BIFROST_MESSAGE_NONE:2989case BIFROST_MESSAGE_BARRIER:2990case BIFROST_MESSAGE_BLEND:2991case BIFROST_MESSAGE_TILE:2992case BIFROST_MESSAGE_Z_STENCIL:2993case BIFROST_MESSAGE_ATEST:2994case BIFROST_MESSAGE_JOB:2995case BIFROST_MESSAGE_64BIT:2996/* Nothing to do */2997break;2998};29993000}30013002static void3003bi_print_stats(bi_context *ctx, unsigned size, FILE *fp)3004{3005struct bi_stats stats = { 0 };30063007/* Count instructions, clauses, and tuples. Also attempt to construct3008* normalized execution engine cycle counts, using the following ratio:3009*3010* 24 arith tuples/cycle3011* 2 texture messages/cycle3012* 16 x 16-bit varying channels interpolated/cycle3013* 1 load store message/cycle3014*3015* These numbers seem to match Arm Mobile Studio's heuristic. The real3016* cycle counts are surely more complicated.3017*/30183019bi_foreach_block(ctx, _block) {3020bi_block *block = (bi_block *) _block;30213022bi_foreach_clause_in_block(block, clause) {3023stats.nr_clauses++;3024stats.nr_tuples += clause->tuple_count;30253026for (unsigned i = 0; i < clause->tuple_count; ++i)3027bi_count_tuple_stats(clause, &clause->tuples[i], &stats);3028}3029}30303031float cycles_arith = ((float) stats.nr_arith) / 24.0;3032float cycles_texture = ((float) stats.nr_texture) / 2.0;3033float cycles_varying = ((float) stats.nr_varying) / 16.0;3034float cycles_ldst = ((float) stats.nr_ldst) / 1.0;30353036float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);3037float cycles_bound = MAX2(cycles_arith, cycles_message);30383039/* Thread count and register pressure are traded off only on v7 */3040bool full_threads = (ctx->arch == 7 && ctx->info->work_reg_count <= 32);3041unsigned nr_threads = full_threads ? 2 : 1;30423043/* Dump stats */30443045fprintf(stderr, "%s - %s shader: "3046"%u inst, %u tuples, %u clauses, "3047"%f cycles, %f arith, %f texture, %f vary, %f ldst, "3048"%u quadwords, %u threads, %u loops, "3049"%u:%u spills:fills\n",3050ctx->nir->info.label ?: "",3051ctx->inputs->is_blend ? "PAN_SHADER_BLEND" :3052gl_shader_stage_name(ctx->stage),3053stats.nr_ins, stats.nr_tuples, stats.nr_clauses,3054cycles_bound, cycles_arith, cycles_texture,3055cycles_varying, cycles_ldst,3056size / 16, nr_threads,3057ctx->loop_count,3058ctx->spills, ctx->fills);3059}30603061static int3062glsl_type_size(const struct glsl_type *type, bool bindless)3063{3064return glsl_count_attribute_slots(type, false);3065}30663067/* Split stores to memory. 
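* Bifrost store instructions write a contiguous run of components with no
* per-component write mask, so partial write masks on the intrinsics below
* have to be split into separate stores.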
We don't split stores to vertex outputs, since3068* nir_lower_io_to_temporaries will ensure there's only a single write.3069*/30703071static bool3072should_split_wrmask(const nir_instr *instr, UNUSED const void *data)3073{3074nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);30753076switch (intr->intrinsic) {3077case nir_intrinsic_store_ssbo:3078case nir_intrinsic_store_shared:3079case nir_intrinsic_store_global:3080case nir_intrinsic_store_scratch:3081return true;3082default:3083return false;3084}3085}30863087/* Bifrost wants transcendentals as FP32 */30883089static unsigned3090bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)3091{3092if (instr->type != nir_instr_type_alu)3093return 0;30943095nir_alu_instr *alu = nir_instr_as_alu(instr);30963097switch (alu->op) {3098case nir_op_fexp2:3099case nir_op_flog2:3100case nir_op_fpow:3101case nir_op_fsin:3102case nir_op_fcos:3103return (nir_dest_bit_size(alu->dest.dest) == 32) ? 0 : 32;3104default:3105return 0;3106}3107}31083109/* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,3110* transcendentals are an exception. Also shifts because of lane size mismatch3111* (8-bit in Bifrost, 32-bit in NIR TODO - workaround!). Some conversions need3112* to be scalarized due to type size. */31133114static bool3115bi_vectorize_filter(const nir_instr *instr, void *data)3116{3117/* Defaults work for everything else */3118if (instr->type != nir_instr_type_alu)3119return true;31203121const nir_alu_instr *alu = nir_instr_as_alu(instr);31223123switch (alu->op) {3124case nir_op_frcp:3125case nir_op_frsq:3126case nir_op_ishl:3127case nir_op_ishr:3128case nir_op_ushr:3129case nir_op_f2i16:3130case nir_op_f2u16:3131case nir_op_i2f16:3132case nir_op_u2f16:3133return false;3134default:3135return true;3136}3137}31383139/* XXX: This is a kludge to workaround NIR's lack of divergence metadata. If we3140* keep divergence info around after we consume it for indirect lowering,3141* nir_convert_from_ssa will regress code quality since it will avoid3142* coalescing divergent with non-divergent nodes. 
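* So once the indirect lowering has run, we walk every SSA def and clear
* its divergent flag, making everything look convergent again for the
* passes that follow.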
*/31433144static bool3145nir_invalidate_divergence_ssa(nir_ssa_def *ssa, UNUSED void *data)3146{3147ssa->divergent = false;3148return true;3149}31503151static bool3152nir_invalidate_divergence(struct nir_builder *b, nir_instr *instr,3153UNUSED void *data)3154{3155return nir_foreach_ssa_def(instr, nir_invalidate_divergence_ssa, NULL);3156}31573158static void3159bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)3160{3161bool progress;3162unsigned lower_flrp = 16 | 32 | 64;31633164NIR_PASS(progress, nir, nir_lower_regs_to_ssa);31653166nir_lower_tex_options lower_tex_options = {3167.lower_txs_lod = true,3168.lower_txp = ~0,3169.lower_tg4_broadcom_swizzle = true,3170.lower_txd = true,3171};31723173NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin);3174NIR_PASS(progress, nir, pan_lower_helper_invocation);31753176NIR_PASS(progress, nir, nir_lower_int64);31773178nir_lower_idiv_options idiv_options = {3179.imprecise_32bit_lowering = true,3180.allow_fp16 = true,3181};3182NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);31833184NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);3185NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);3186NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);31873188do {3189progress = false;31903191NIR_PASS(progress, nir, nir_lower_var_copies);3192NIR_PASS(progress, nir, nir_lower_vars_to_ssa);3193NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);31943195NIR_PASS(progress, nir, nir_copy_prop);3196NIR_PASS(progress, nir, nir_opt_remove_phis);3197NIR_PASS(progress, nir, nir_opt_dce);3198NIR_PASS(progress, nir, nir_opt_dead_cf);3199NIR_PASS(progress, nir, nir_opt_cse);3200NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);3201NIR_PASS(progress, nir, nir_opt_algebraic);3202NIR_PASS(progress, nir, nir_opt_constant_folding);32033204NIR_PASS(progress, nir, nir_lower_alu);32053206if (lower_flrp != 0) {3207bool lower_flrp_progress = false;3208NIR_PASS(lower_flrp_progress,3209nir,3210nir_lower_flrp,3211lower_flrp,3212false /* always_precise */);3213if (lower_flrp_progress) {3214NIR_PASS(progress, nir,3215nir_opt_constant_folding);3216progress = true;3217}32183219/* Nothing should rematerialize any flrps, so we only3220* need to do this lowering once.3221*/3222lower_flrp = 0;3223}32243225NIR_PASS(progress, nir, nir_opt_undef);3226NIR_PASS(progress, nir, nir_lower_undef_to_zero);32273228NIR_PASS(progress, nir, nir_opt_loop_unroll,3229nir_var_shader_in |3230nir_var_shader_out |3231nir_var_function_temp);3232} while (progress);32333234/* TODO: Why is 64-bit getting rematerialized?3235* KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */3236NIR_PASS(progress, nir, nir_lower_int64);32373238/* We need to cleanup after each iteration of late algebraic3239* optimizations, since otherwise NIR can produce weird edge cases3240* (like fneg of a constant) which we don't handle */3241bool late_algebraic = true;3242while (late_algebraic) {3243late_algebraic = false;3244NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);3245NIR_PASS(progress, nir, nir_opt_constant_folding);3246NIR_PASS(progress, nir, nir_copy_prop);3247NIR_PASS(progress, nir, nir_opt_dce);3248NIR_PASS(progress, nir, nir_opt_cse);3249}32503251NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);3252NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);3253NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);3254NIR_PASS(progress, nir, nir_opt_dce);32553256/* Prepass to simplify instruction selection 
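* Nothing after this point reruns algebraic optimisation, so the patterns
* produced here are what bi_emit_alu will see.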
*/3257NIR_PASS(progress, nir, bifrost_nir_lower_algebraic_late);32583259/* Backend scheduler is purely local, so do some global optimizations3260* to reduce register pressure. */3261nir_move_options move_all =3262nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |3263nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;32643265NIR_PASS_V(nir, nir_opt_sink, move_all);3266NIR_PASS_V(nir, nir_opt_move, move_all);32673268/* We might lower attribute, varying, and image indirects. Use the3269* gathered info to skip the extra analysis in the happy path. */3270bool any_indirects =3271nir->info.inputs_read_indirectly ||3272nir->info.outputs_accessed_indirectly ||3273nir->info.patch_inputs_read_indirectly ||3274nir->info.patch_outputs_accessed_indirectly ||3275nir->info.images_used;32763277if (any_indirects) {3278nir_convert_to_lcssa(nir, true, true);3279NIR_PASS_V(nir, nir_divergence_analysis);3280NIR_PASS_V(nir, bi_lower_divergent_indirects,3281bifrost_lanes_per_warp(gpu_id));3282NIR_PASS_V(nir, nir_shader_instructions_pass,3283nir_invalidate_divergence, nir_metadata_all, NULL);3284}32853286/* Take us out of SSA */3287NIR_PASS(progress, nir, nir_lower_locals_to_regs);3288NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);3289NIR_PASS(progress, nir, nir_convert_from_ssa, true);3290}32913292/* The cmdstream lowers 8-bit fragment output as 16-bit, so we need to do the3293* same lowering here to zero-extend correctly */32943295static bool3296bifrost_nir_lower_i8_fragout_impl(struct nir_builder *b,3297nir_intrinsic_instr *intr, UNUSED void *data)3298{3299if (nir_src_bit_size(intr->src[0]) != 8)3300return false;33013302nir_alu_type type =3303nir_alu_type_get_base_type(nir_intrinsic_src_type(intr));33043305assert(type == nir_type_int || type == nir_type_uint);33063307b->cursor = nir_before_instr(&intr->instr);3308nir_ssa_def *cast = nir_convert_to_bit_size(b, intr->src[0].ssa, type, 16);33093310nir_intrinsic_set_src_type(intr, type | 16);3311nir_instr_rewrite_src_ssa(&intr->instr, &intr->src[0], cast);3312return true;3313}33143315static bool3316bifrost_nir_lower_i8_fragin_impl(struct nir_builder *b,3317nir_intrinsic_instr *intr, UNUSED void *data)3318{3319if (nir_dest_bit_size(intr->dest) != 8)3320return false;33213322nir_alu_type type =3323nir_alu_type_get_base_type(nir_intrinsic_dest_type(intr));33243325assert(type == nir_type_int || type == nir_type_uint);33263327b->cursor = nir_before_instr(&intr->instr);3328nir_ssa_def *out =3329nir_load_output(b, intr->num_components, 16, intr->src[0].ssa,3330.base = nir_intrinsic_base(intr),3331.component = nir_intrinsic_component(intr),3332.dest_type = type | 16,3333.io_semantics = nir_intrinsic_io_semantics(intr));33343335nir_ssa_def *cast = nir_convert_to_bit_size(b, out, type, 8);3336nir_ssa_def_rewrite_uses(&intr->dest.ssa, cast);3337return true;3338}33393340static bool3341bifrost_nir_lower_i8_frag(struct nir_builder *b,3342nir_instr *instr, UNUSED void *data)3343{3344if (instr->type != nir_instr_type_intrinsic)3345return false;33463347nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);3348if (intr->intrinsic == nir_intrinsic_load_output)3349return bifrost_nir_lower_i8_fragin_impl(b, intr, data);3350else if (intr->intrinsic == nir_intrinsic_store_output)3351return bifrost_nir_lower_i8_fragout_impl(b, intr, data);3352else3353return false;3354}33553356static void3357bi_opt_post_ra(bi_context *ctx)3358{3359bi_foreach_instr_global_safe(ctx, ins) {3360if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], 
ins->src[0]))3361bi_remove_instruction(ins);3362}3363}33643365static bool3366bifrost_nir_lower_store_component(struct nir_builder *b,3367nir_instr *instr, void *data)3368{3369if (instr->type != nir_instr_type_intrinsic)3370return false;33713372nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);33733374if (intr->intrinsic != nir_intrinsic_store_output)3375return false;33763377struct hash_table_u64 *slots = data;3378unsigned component = nir_intrinsic_component(intr);3379nir_src *slot_src = nir_get_io_offset_src(intr);3380uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr);33813382nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot);3383unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0);33843385nir_ssa_def *value = intr->src[0].ssa;3386b->cursor = nir_before_instr(&intr->instr);33873388nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size);3389nir_ssa_def *channels[4] = { undef, undef, undef, undef };33903391/* Copy old */3392u_foreach_bit(i, mask) {3393assert(prev != NULL);3394nir_ssa_def *prev_ssa = prev->src[0].ssa;3395channels[i] = nir_channel(b, prev_ssa, i);3396}33973398/* Copy new */3399unsigned new_mask = nir_intrinsic_write_mask(intr);3400mask |= (new_mask << component);34013402u_foreach_bit(i, new_mask) {3403assert(component + i < 4);3404channels[component + i] = nir_channel(b, value, i);3405}34063407intr->num_components = util_last_bit(mask);3408nir_instr_rewrite_src_ssa(instr, &intr->src[0],3409nir_vec(b, channels, intr->num_components));34103411nir_intrinsic_set_component(intr, 0);3412nir_intrinsic_set_write_mask(intr, mask);34133414if (prev) {3415_mesa_hash_table_u64_remove(slots, slot);3416nir_instr_remove(&prev->instr);3417}34183419_mesa_hash_table_u64_insert(slots, slot, intr);3420return false;3421}34223423/* Dead code elimination for branches at the end of a block - only one branch3424* per block is legal semantically, but unreachable jumps can be generated.3425* Likewise we can generate jumps to the terminal block which need to be3426* lowered away to a jump to #0x0, which induces successful termination. 
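* In practice that means deleting every branch after the first one in a
* block and nulling out branch targets that point at the terminal block.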
*/34273428static void3429bi_lower_branch(bi_block *block)3430{3431bool branched = false;3432ASSERTED bool was_jump = false;34333434bi_foreach_instr_in_block_safe(block, ins) {3435if (!ins->branch_target) continue;34363437if (branched) {3438assert(was_jump && (ins->op == BI_OPCODE_JUMP));3439bi_remove_instruction(ins);3440continue;3441}34423443branched = true;3444was_jump = ins->op == BI_OPCODE_JUMP;34453446if (bi_is_terminal_block(ins->branch_target))3447ins->branch_target = NULL;3448}3449}34503451void3452bifrost_compile_shader_nir(nir_shader *nir,3453const struct panfrost_compile_inputs *inputs,3454struct util_dynarray *binary,3455struct pan_shader_info *info)3456{3457bifrost_debug = debug_get_option_bifrost_debug();34583459bi_context *ctx = rzalloc(NULL, bi_context);3460ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx);34613462ctx->inputs = inputs;3463ctx->nir = nir;3464ctx->info = info;3465ctx->stage = nir->info.stage;3466ctx->quirks = bifrost_get_quirks(inputs->gpu_id);3467ctx->arch = inputs->gpu_id >> 12;3468list_inithead(&ctx->blocks);34693470/* Lower gl_Position pre-optimisation, but after lowering vars to ssa3471* (so we don't accidentally duplicate the epilogue since mesa/st has3472* messed with our I/O quite a bit already) */34733474NIR_PASS_V(nir, nir_lower_vars_to_ssa);34753476if (ctx->stage == MESA_SHADER_VERTEX) {3477NIR_PASS_V(nir, nir_lower_viewport_transform);3478NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0);3479}34803481/* Lower large arrays to scratch and small arrays to bcsel (TODO: tune3482* threshold, but not until addresses / csel is optimized better) */3483NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,3484glsl_get_natural_size_align_bytes);3485NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);34863487NIR_PASS_V(nir, nir_split_var_copies);3488NIR_PASS_V(nir, nir_lower_global_vars_to_local);3489NIR_PASS_V(nir, nir_lower_var_copies);3490NIR_PASS_V(nir, nir_lower_vars_to_ssa);3491NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,3492glsl_type_size, 0);34933494if (ctx->stage == MESA_SHADER_FRAGMENT) {3495NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,3496~0, false);3497} else {3498struct hash_table_u64 *stores = _mesa_hash_table_u64_create(ctx);3499NIR_PASS_V(nir, nir_shader_instructions_pass,3500bifrost_nir_lower_store_component,3501nir_metadata_block_index |3502nir_metadata_dominance, stores);3503}35043505NIR_PASS_V(nir, nir_lower_ssbo);3506NIR_PASS_V(nir, pan_nir_lower_zs_store);3507NIR_PASS_V(nir, pan_lower_sample_pos);3508NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);35093510if (nir->info.stage == MESA_SHADER_FRAGMENT) {3511NIR_PASS_V(nir, nir_shader_instructions_pass,3512bifrost_nir_lower_i8_frag,3513nir_metadata_block_index | nir_metadata_dominance,3514NULL);3515}35163517bi_optimize_nir(nir, ctx->inputs->gpu_id, ctx->inputs->is_blend);35183519NIR_PASS_V(nir, pan_nir_reorder_writeout);35203521bool skip_internal = nir->info.internal;3522skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);35233524if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {3525nir_print_shader(nir, stdout);3526}35273528info->tls_size = nir->scratch_size;35293530nir_foreach_function(func, nir) {3531if (!func->impl)3532continue;35333534ctx->ssa_alloc += func->impl->ssa_alloc;3535ctx->reg_alloc += func->impl->reg_alloc;35363537emit_cf_list(ctx, &func->impl->body);3538break; /* TODO: Multi-function shaders */3539}35403541unsigned block_source_count = 
0;

        bi_foreach_block(ctx, _block) {
                bi_block *block = (bi_block *) _block;

                /* Name blocks now that we're done emitting so the order is
                 * consistent */
                block->base.name = block_source_count++;
        }

        /* If the shader doesn't write any colour or depth outputs, it may
         * still need an ATEST at the very end! */
        bool need_dummy_atest =
                (ctx->stage == MESA_SHADER_FRAGMENT) &&
                !ctx->emitted_atest &&
                !bi_skip_atest(ctx, false);

        if (need_dummy_atest) {
                pan_block *end = list_last_entry(&ctx->blocks, pan_block, link);
                bi_builder b = bi_init_builder(ctx, bi_after_block((bi_block *) end));
                bi_emit_atest(&b, bi_zero());
        }

        /* Runs before constant folding */
        bi_lower_swizzle(ctx);

        /* Runs before copy prop */
        bi_opt_push_ubo(ctx);
        bi_opt_constant_fold(ctx);

        bi_opt_copy_prop(ctx);
        bi_opt_mod_prop_forward(ctx);
        bi_opt_mod_prop_backward(ctx);
        bi_opt_dead_code_eliminate(ctx);
        bi_opt_cse(ctx);
        bi_opt_dead_code_eliminate(ctx);

        bi_foreach_block(ctx, _block) {
                bi_block *block = (bi_block *) _block;
                bi_lower_branch(block);
        }

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
                bi_print_shader(ctx, stdout);
        bi_lower_fau(ctx);

        /* Analyze before register allocation to avoid false dependencies. The
         * skip bit is a function of only the data flow graph and is invariant
         * under valid scheduling. */
        bi_analyze_helper_requirements(ctx);

        bi_register_allocate(ctx);
        bi_opt_post_ra(ctx);
        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
                bi_print_shader(ctx, stdout);
        bi_schedule(ctx);
        bi_assign_scoreboard(ctx);

        /* Analyze after scheduling since we depend on instruction order. */
        bi_analyze_helper_terminate(ctx);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
                bi_print_shader(ctx, stdout);

        unsigned final_clause = bi_pack(ctx, binary);

        /* If we need to wait for ATEST or BLEND in the first clause, pass the
         * corresponding bits through to the renderer state descriptor */
        pan_block *first_block = list_first_entry(&ctx->blocks, pan_block, link);
        bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);

        unsigned first_deps = first_clause ? first_clause->dependencies : 0;
        info->bifrost.wait_6 = (first_deps & (1 << 6));
        info->bifrost.wait_7 = (first_deps & (1 << 7));

        info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
                disassemble_bifrost(stdout, binary->data, binary->size,
                                    bifrost_debug & BIFROST_DBG_VERBOSE);
                fflush(stdout);
        }

        /* Pad the shader with enough zero bytes to trick the prefetcher,
         * unless we're compiling an empty shader (in which case we don't pad
         * so the size remains 0) */
        unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;

        if (binary->size) {
                memset(util_dynarray_grow(binary, uint8_t, prefetch_size),
                       0, prefetch_size);
        }

        if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) &&
            !skip_internal) {
                bi_print_stats(ctx, binary->size, stderr);
        }

        ralloc_free(ctx);
}