Path: blob/21.2-virgl/src/freedreno/ir3/ir3_compiler_nir.c
/*
 * Copyright (C) 2015 Rob Clark <robdclark@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robdclark@gmail.com>
 */

#include <stdarg.h>

#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "ir3_compiler.h"
#include "ir3_image.h"
#include "ir3_nir.h"
#include "ir3_shader.h"

#include "instr-a3xx.h"
#include "ir3.h"
#include "ir3_context.h"

void
ir3_handle_nonuniform(struct ir3_instruction *instr,
                      nir_intrinsic_instr *intrin)
{
   if (nir_intrinsic_has_access(intrin) &&
       (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) {
      instr->flags |= IR3_INSTR_NONUNIF;
   }
}

void
ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc)
{
   nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
   if (!intrin)
      return;

   instr->flags |= IR3_INSTR_B;
   instr->cat6.base = nir_intrinsic_desc_set(intrin);
}

static struct ir3_instruction *
create_input(struct ir3_context *ctx, unsigned compmask)
{
   struct ir3_instruction *in;

   in = ir3_instr_create(ctx->in_block, OPC_META_INPUT, 1, 0);
   in->input.sysval = ~0;
   __ssa_dst(in)->wrmask = compmask;

   array_insert(ctx->ir, ctx->ir->inputs, in);

   return in;
}

static struct ir3_instruction *
create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord,
                  unsigned n)
{
   struct ir3_block *block = ctx->block;
   struct ir3_instruction *instr;
   /* packed inloc is fixed up later: */
   struct ir3_instruction *inloc = create_immed(block, n);

   if (coord) {
      instr = ir3_BARY_F(block, inloc, 0, coord, 0);
   } else if (ctx->compiler->flat_bypass) {
      instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
      instr->cat6.type = TYPE_U32;
      instr->cat6.iim_val = 1;
   } else {
      instr = ir3_BARY_F(block, inloc, 0, ctx->ij[IJ_PERSP_PIXEL], 0);
      instr->srcs[1]->wrmask = 0x3;
   }

   return instr;
}

static struct ir3_instruction *
create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
{
   /* first four vec4 sysval's reserved for UBOs: */
   /* NOTE: dp is in scalar, but there can be >4 dp components: */
   struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   unsigned n = const_state->offsets.driver_param;
   unsigned r = regid(n + dp / 4, dp % 4);
   return create_uniform(ctx->block, r);
}
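
/* Worked example added for illustration (the offset value is hypothetical):
 * driver params are addressed in vec4 units, so with
 * const_state->offsets.driver_param == n, a scalar driver-param index
 * dp == 6 resolves to regid(n + 1, 2), i.e. component .z of vec4 n + 1.
 */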

/*
 * Adreno's comparisons produce a 1 for true and 0 for false, in either 16 or
 * 32-bit registers.  We use NIR's 1-bit integers to represent bools, and
 * trust that we will only see and/or/xor on those 1-bit values, so we can
 * safely store NIR i1s in a 32-bit reg while always containing either a 1 or
 * 0.
 */

/*
 * alu/sfu instructions:
 */

static struct ir3_instruction *
create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
           unsigned src_bitsize, nir_op op)
{
   type_t src_type, dst_type;

   switch (op) {
   case nir_op_f2f32:
   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
   case nir_op_f2f16:
   case nir_op_f2i32:
   case nir_op_f2i16:
   case nir_op_f2i8:
   case nir_op_f2u32:
   case nir_op_f2u16:
   case nir_op_f2u8:
      switch (src_bitsize) {
      case 32:
         src_type = TYPE_F32;
         break;
      case 16:
         src_type = TYPE_F16;
         break;
      default:
         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
      }
      break;

   case nir_op_i2f32:
   case nir_op_i2f16:
   case nir_op_i2i32:
   case nir_op_i2i16:
   case nir_op_i2i8:
      switch (src_bitsize) {
      case 32:
         src_type = TYPE_S32;
         break;
      case 16:
         src_type = TYPE_S16;
         break;
      case 8:
         src_type = TYPE_S8;
         break;
      default:
         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
      }
      break;

   case nir_op_u2f32:
   case nir_op_u2f16:
   case nir_op_u2u32:
   case nir_op_u2u16:
   case nir_op_u2u8:
      switch (src_bitsize) {
      case 32:
         src_type = TYPE_U32;
         break;
      case 16:
         src_type = TYPE_U16;
         break;
      case 8:
         src_type = TYPE_U8;
         break;
      default:
         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
      }
      break;

   case nir_op_b2f16:
   case nir_op_b2f32:
   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
      src_type = TYPE_U32;
      break;

   default:
      ir3_context_error(ctx, "invalid conversion op: %u", op);
   }

   switch (op) {
   case nir_op_f2f32:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_b2f32:
      dst_type = TYPE_F32;
      break;

   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
   case nir_op_f2f16:
   case nir_op_i2f16:
   case nir_op_u2f16:
   case nir_op_b2f16:
      dst_type = TYPE_F16;
      break;

   case nir_op_f2i32:
   case nir_op_i2i32:
   case nir_op_b2i32:
      dst_type = TYPE_S32;
      break;

   case nir_op_f2i16:
   case nir_op_i2i16:
   case nir_op_b2i16:
      dst_type = TYPE_S16;
      break;

   case nir_op_f2i8:
   case nir_op_i2i8:
   case nir_op_b2i8:
      dst_type = TYPE_S8;
      break;

   case nir_op_f2u32:
   case nir_op_u2u32:
      dst_type = TYPE_U32;
      break;

   case nir_op_f2u16:
   case nir_op_u2u16:
      dst_type = TYPE_U16;
      break;

   case nir_op_f2u8:
   case nir_op_u2u8:
      dst_type = TYPE_U8;
      break;

   default:
      ir3_context_error(ctx, "invalid conversion op: %u", op);
   }

   if (src_type == dst_type)
      return src;

   struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);

   if (op == nir_op_f2f16_rtne) {
      cov->cat1.round = ROUND_EVEN;
   } else if (op == nir_op_f2f16) {
      unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
      nir_rounding_mode rounding_mode =
         nir_get_rounding_mode_from_float_controls(execution_mode,
                                                   nir_type_float16);
      if (rounding_mode == nir_rounding_mode_rtne)
         cov->cat1.round = ROUND_EVEN;
   }

   return cov;
}

/* For shift instructions NIR always has shift amount as 32 bit integer */
static struct ir3_instruction *
resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src,
                    unsigned bs)
{
   if (bs != 16)
      return src;

   return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
}
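
/* Overview added for clarity (a sketch, not part of the original source):
 * emit_alu() below handles one (scalarized) NIR ALU instruction at a time.
 * Roughly:
 *
 *    vec2/vec3/vec4 -> one mov per writemask channel, collecting scalar srcs
 *    mov            -> one mov per enabled writemask channel
 *    everything else-> single-channel: fetch the one used channel of each
 *                      src and emit the matching cat1/cat2/cat3/sfu instr
 *
 * Conversions are routed through create_cov() above, which also applies the
 * f16 rounding mode requested via NIR's float_controls execution mode.
 */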

static void
emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   struct ir3_instruction **dst, *src[info->num_inputs];
   unsigned bs[info->num_inputs]; /* bit size */
   struct ir3_block *b = ctx->block;
   unsigned dst_sz, wrmask;
   type_t dst_type =
      nir_dest_bit_size(alu->dest.dest) == 16 ? TYPE_U16 : TYPE_U32;

   if (alu->dest.dest.is_ssa) {
      dst_sz = alu->dest.dest.ssa.num_components;
      wrmask = (1 << dst_sz) - 1;
   } else {
      dst_sz = alu->dest.dest.reg.reg->num_components;
      wrmask = alu->dest.write_mask;
   }

   dst = ir3_get_dst(ctx, &alu->dest.dest, dst_sz);

   /* Vectors are special in that they have non-scalarized writemasks,
    * and just take the first swizzle channel for each argument in
    * order into each writemask channel.
    */
   if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
       (alu->op == nir_op_vec4)) {

      for (int i = 0; i < info->num_inputs; i++) {
         nir_alu_src *asrc = &alu->src[i];

         compile_assert(ctx, !asrc->abs);
         compile_assert(ctx, !asrc->negate);

         src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
         if (!src[i])
            src[i] = create_immed_typed(ctx->block, 0, dst_type);
         dst[i] = ir3_MOV(b, src[i], dst_type);
      }

      ir3_put_dst(ctx, &alu->dest.dest);
      return;
   }

   /* We also get mov's with more than one component, so handle
    * those specially:
    */
   if (alu->op == nir_op_mov) {
      nir_alu_src *asrc = &alu->src[0];
      struct ir3_instruction *const *src0 = ir3_get_src(ctx, &asrc->src);

      for (unsigned i = 0; i < dst_sz; i++) {
         if (wrmask & (1 << i)) {
            dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], dst_type);
         } else {
            dst[i] = NULL;
         }
      }

      ir3_put_dst(ctx, &alu->dest.dest);
      return;
   }

   /* General case: We can just grab the one used channel per src. */
   for (int i = 0; i < info->num_inputs; i++) {
      unsigned chan = ffs(alu->dest.write_mask) - 1;
      nir_alu_src *asrc = &alu->src[i];

      compile_assert(ctx, !asrc->abs);
      compile_assert(ctx, !asrc->negate);

      src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
      bs[i] = nir_src_bit_size(asrc->src);

      compile_assert(ctx, src[i]);
   }

   switch (alu->op) {
   case nir_op_f2f32:
   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
   case nir_op_f2f16:
   case nir_op_f2i32:
   case nir_op_f2i16:
   case nir_op_f2i8:
   case nir_op_f2u32:
   case nir_op_f2u16:
   case nir_op_f2u8:
   case nir_op_i2f32:
   case nir_op_i2f16:
   case nir_op_i2i32:
   case nir_op_i2i16:
   case nir_op_i2i8:
   case nir_op_u2f32:
   case nir_op_u2f16:
   case nir_op_u2u32:
   case nir_op_u2u16:
   case nir_op_u2u8:
   case nir_op_b2f16:
   case nir_op_b2f32:
   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
      dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
      break;

   case nir_op_fquantize2f16:
      dst[0] = create_cov(ctx, create_cov(ctx, src[0], 32, nir_op_f2f16_rtne),
                          16, nir_op_f2f32);
      break;
   case nir_op_f2b1:
      dst[0] = ir3_CMPS_F(
         b, src[0], 0,
         create_immed_typed(b, 0, bs[0] == 16 ? TYPE_F16 : TYPE_F32), 0);
      dst[0]->cat2.condition = IR3_COND_NE;
      break;

   case nir_op_i2b1:
      /* i2b1 will appear when translating from nir_load_ubo or
       * nir_intrinsic_load_ssbo, where any non-zero value is true.
       */
      dst[0] = ir3_CMPS_S(
         b, src[0], 0,
         create_immed_typed(b, 0, bs[0] == 16 ? TYPE_U16 : TYPE_U32), 0);
      dst[0]->cat2.condition = IR3_COND_NE;
      break;

   case nir_op_b2b1:
      /* b2b1 will appear when translating from
       *
       * - nir_intrinsic_load_shared of a 32-bit 0/~0 value.
       * - nir_intrinsic_load_constant of a 32-bit 0/~0 value
       *
       * A negate can turn those into a 1 or 0 for us.
       */
      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
      break;

   case nir_op_b2b32:
      /* b2b32 will appear when converting our 1-bit bools to a store_shared
       * argument.
       *
       * A negate can turn those into a ~0 for us.
       */
      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
      break;

   case nir_op_fneg:
      dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
      break;
   case nir_op_fabs:
      dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
      break;
   case nir_op_fmax:
      dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
      break;
   case nir_op_fmin:
      dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
      break;
   case nir_op_fsat:
      /* if there is just a single use of the src, and it supports
       * (sat) bit, we can just fold the (sat) flag back to the
       * src instruction and create a mov. This is easier for cp
       * to eliminate.
       */
      if (alu->src[0].src.is_ssa && is_sat_compatible(src[0]->opc) &&
          (list_length(&alu->src[0].src.ssa->uses) == 1)) {
         src[0]->flags |= IR3_INSTR_SAT;
         dst[0] = ir3_MOV(b, src[0], dst_type);
      } else {
         /* otherwise generate a max.f that saturates.. blob does
          * similar (generating a cat2 mov using max.f)
          */
         dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
         dst[0]->flags |= IR3_INSTR_SAT;
      }
      break;
   case nir_op_fmul:
      dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
      break;
   case nir_op_fadd:
      dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
      break;
   case nir_op_fsub:
      dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
      break;
   case nir_op_ffma:
      dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
      break;
   case nir_op_fddx:
   case nir_op_fddx_coarse:
      dst[0] = ir3_DSX(b, src[0], 0);
      dst[0]->cat5.type = TYPE_F32;
      break;
   case nir_op_fddx_fine:
      dst[0] = ir3_DSXPP_MACRO(b, src[0], 0);
      dst[0]->cat5.type = TYPE_F32;
      break;
   case nir_op_fddy:
   case nir_op_fddy_coarse:
      dst[0] = ir3_DSY(b, src[0], 0);
      dst[0]->cat5.type = TYPE_F32;
      break;
   case nir_op_fddy_fine:
      dst[0] = ir3_DSYPP_MACRO(b, src[0], 0);
      dst[0]->cat5.type = TYPE_F32;
      break;
   case nir_op_flt:
      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_LT;
      break;
   case nir_op_fge:
      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_GE;
      break;
   case nir_op_feq:
      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_EQ;
      break;
   case nir_op_fneu:
      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_NE;
      break;
   case nir_op_fceil:
      dst[0] = ir3_CEIL_F(b, src[0], 0);
      break;
   case nir_op_ffloor:
      dst[0] = ir3_FLOOR_F(b, src[0], 0);
      break;
   case nir_op_ftrunc:
      dst[0] = ir3_TRUNC_F(b, src[0], 0);
      break;
   case nir_op_fround_even:
      dst[0] = ir3_RNDNE_F(b, src[0], 0);
      break;
   case nir_op_fsign:
      dst[0] = ir3_SIGN_F(b, src[0], 0);
      break;

   case nir_op_fsin:
      dst[0] = ir3_SIN(b, src[0], 0);
      break;
   case nir_op_fcos:
      dst[0] = ir3_COS(b, src[0], 0);
      break;
   case nir_op_frsq:
      dst[0] = ir3_RSQ(b, src[0], 0);
      break;
   case nir_op_frcp:
      dst[0] = ir3_RCP(b, src[0], 0);
      break;
   case nir_op_flog2:
      dst[0] = ir3_LOG2(b, src[0], 0);
      break;
   case nir_op_fexp2:
      dst[0] = ir3_EXP2(b, src[0], 0);
      break;
   case nir_op_fsqrt:
      dst[0] = ir3_SQRT(b, src[0], 0);
      break;

   case nir_op_iabs:
      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
      break;
   case nir_op_iadd:
      dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
      break;
   case nir_op_iand:
      dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
      break;
   case nir_op_imax:
      dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
      break;
   case nir_op_umax:
      dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
      break;
   case nir_op_imin:
      dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
      break;
   case nir_op_umin:
      dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
      break;
   case nir_op_umul_low:
      dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
      break;
   case nir_op_imadsh_mix16:
      dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
      break;
   case nir_op_imad24_ir3:
      dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
      break;
   case nir_op_imul:
      compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) == 16);
      dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
      break;
   case nir_op_imul24:
      dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
      break;
   case nir_op_ineg:
      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
      break;
   case nir_op_inot:
      if (bs[0] == 1) {
         dst[0] = ir3_SUB_U(b, create_immed(ctx->block, 1), 0, src[0], 0);
      } else {
         dst[0] = ir3_NOT_B(b, src[0], 0);
      }
      break;
   case nir_op_ior:
      dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
      break;
   case nir_op_ishl:
      dst[0] =
         ir3_SHL_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
      break;
   case nir_op_ishr:
      dst[0] =
         ir3_ASHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
      break;
   case nir_op_isub:
      dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
      break;
   case nir_op_ixor:
      dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
      break;
   case nir_op_ushr:
      dst[0] =
         ir3_SHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
      break;
   case nir_op_ilt:
      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_LT;
      break;
   case nir_op_ige:
      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_GE;
      break;
   case nir_op_ieq:
      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_EQ;
      break;
   case nir_op_ine:
      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_NE;
      break;
   case nir_op_ult:
      dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_LT;
      break;
   case nir_op_uge:
      dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_GE;
      break;
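
   /* Note added for clarity: bcsel below relies on the 0/1 bool convention
    * described at the top of this file.  Assuming blob-style sel semantics
    * as used here, what gets emitted is roughly:
    *
    *    sel.b32 dst, src1, cond, src2   ; dst = cond ? src1 : src2
    *
    * with a one-time u32->u16 conversion of cond (cached in
    * ctx->sel_cond_conversions) when the data operands are 16-bit.
    */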
   case nir_op_bcsel: {
      struct ir3_instruction *cond = src[0];

      /* If src[0] is a negation (likely as a result of an ir3_b2n(cond)),
       * we can ignore that and use original cond, since the nonzero-ness of
       * cond stays the same.
       */
      if (cond->opc == OPC_ABSNEG_S && cond->flags == 0 &&
          (cond->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) ==
             IR3_REG_SNEG) {
         cond = cond->srcs[0]->def->instr;
      }

      compile_assert(ctx, bs[1] == bs[2]);
      /* The condition's size has to match the other two arguments' size, so
       * convert down if necessary.
       */
      if (bs[1] == 16) {
         struct hash_entry *prev_entry =
            _mesa_hash_table_search(ctx->sel_cond_conversions, src[0]);
         if (prev_entry) {
            cond = prev_entry->data;
         } else {
            cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
            _mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond);
         }
      }

      if (bs[1] != 16)
         dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
      else
         dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0);
      break;
   }
   case nir_op_bit_count: {
      // TODO, we need to do this 16b at a time on a5xx+a6xx.. need to
      // double check on earlier gen's. Once half-precision support is
      // in place, this should probably move to a NIR lowering pass:
      struct ir3_instruction *hi, *lo;

      hi = ir3_COV(b, ir3_SHR_B(b, src[0], 0, create_immed(b, 16), 0), TYPE_U32,
                   TYPE_U16);
      lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);

      hi = ir3_CBITS_B(b, hi, 0);
      lo = ir3_CBITS_B(b, lo, 0);

      // TODO maybe the builders should default to making dst half-precision
      // if the src's were half precision, to make this less awkward.. otoh
      // we should probably just do this lowering in NIR.
      hi->dsts[0]->flags |= IR3_REG_HALF;
      lo->dsts[0]->flags |= IR3_REG_HALF;

      dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
      dst[0]->dsts[0]->flags |= IR3_REG_HALF;
      dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
      break;
   }
   case nir_op_ifind_msb: {
      struct ir3_instruction *cmp;
      dst[0] = ir3_CLZ_S(b, src[0], 0);
      cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
      cmp->cat2.condition = IR3_COND_GE;
      dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
                           0, cmp, 0, dst[0], 0);
      break;
   }
   case nir_op_ufind_msb:
      dst[0] = ir3_CLZ_B(b, src[0], 0);
      dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
                           0, src[0], 0, dst[0], 0);
      break;
   case nir_op_find_lsb:
      dst[0] = ir3_BFREV_B(b, src[0], 0);
      dst[0] = ir3_CLZ_B(b, dst[0], 0);
      break;
   case nir_op_bitfield_reverse:
      dst[0] = ir3_BFREV_B(b, src[0], 0);
      break;

   default:
      ir3_context_error(ctx, "Unhandled ALU op: %s\n",
                        nir_op_infos[alu->op].name);
      break;
   }

   if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
      assert(nir_dest_bit_size(alu->dest.dest) == 1 || alu->op == nir_op_b2b32);
      assert(dst_sz == 1);
   } else {
      /* 1-bit values stored in 32-bit registers are only valid for certain
       * ALU ops.
       */
      switch (alu->op) {
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_ixor:
      case nir_op_inot:
      case nir_op_bcsel:
         break;
      default:
         compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) != 1);
      }
   }

   ir3_put_dst(ctx, &alu->dest.dest);
}

static void
emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                            struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;

   unsigned ncomp = intr->num_components;
   struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
   struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
   struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
   ldc->dsts[0]->wrmask = MASK(ncomp);
   ldc->cat6.iim_val = ncomp;
   ldc->cat6.d = nir_intrinsic_component(intr);
   ldc->cat6.type = TYPE_U32;

   ir3_handle_bindless_cat6(ldc, intr->src[0]);
   if (ldc->flags & IR3_INSTR_B)
      ctx->so->bindless_ubo = true;
   ir3_handle_nonuniform(ldc, intr);

   ir3_split_dest(b, dst, ldc, 0, ncomp);
}
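
/* Note added for clarity: there are two UBO paths here.  load_ubo_vec4 maps
 * onto ldc above, while plain load_ubo falls back to ldg with a (possibly
 * 64-bit) base address pulled from the const file.  A worked example of the
 * offset split in emit_intrinsic_load_ubo() below, with hypothetical values:
 * a vec4 load at constant byte offset 2032 gives off + 16 = 2048 > 1024, so
 * off2 = 1024 is folded into an add.s on the address and the remaining
 * off = 1008 is encoded in the ldg itself.
 */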

/* handles direct/indirect UBO reads: */
static void
emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                        struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   unsigned ubo = regid(const_state->offsets.ubo, 0);
   const unsigned ptrsz = ir3_pointer_size(ctx->compiler);

   int off = 0;

   /* First src is ubo index, which could either be an immed or not: */
   src0 = ir3_get_src(ctx, &intr->src[0])[0];
   if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
      base_lo = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz));
      base_hi = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz) + 1);
   } else {
      base_lo = create_uniform_indirect(b, ubo, TYPE_U32,
                                        ir3_get_addr0(ctx, src0, ptrsz));
      base_hi = create_uniform_indirect(b, ubo + 1, TYPE_U32,
                                        ir3_get_addr0(ctx, src0, ptrsz));

      /* NOTE: since relative addressing is used, make sure constlen is
       * at least big enough to cover all the UBO addresses, since the
       * assembler won't know what the max address reg is.
       */
      ctx->so->constlen =
         MAX2(ctx->so->constlen,
              const_state->offsets.ubo + (ctx->s->info.num_ubos * ptrsz));
   }

   /* note: on 32bit gpu's base_hi is ignored and DCE'd */
   addr = base_lo;

   if (nir_src_is_const(intr->src[1])) {
      off += nir_src_as_uint(intr->src[1]);
   } else {
      /* For load_ubo_indirect, second src is indirect offset: */
      src1 = ir3_get_src(ctx, &intr->src[1])[0];

      /* and add offset to addr: */
      addr = ir3_ADD_S(b, addr, 0, src1, 0);
   }

   /* if offset is too large to encode in the ldg, split it out: */
   if ((off + (intr->num_components * 4)) > 1024) {
      /* split out the minimal amount to improve the odds that
       * cp can fit the immediate in the add.s instruction:
       */
      unsigned off2 = off + (intr->num_components * 4) - 1024;
      addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
      off -= off2;
   }

   if (ptrsz == 2) {
      struct ir3_instruction *carry;

      /* handle 32b rollover, ie:
       *   if (addr < base_lo)
       *      base_hi++
       */
      carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
      carry->cat2.condition = IR3_COND_LT;
      base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);

      addr = ir3_collect(ctx, addr, base_hi);
   }

   for (int i = 0; i < intr->num_components; i++) {
      struct ir3_instruction *load =
         ir3_LDG(b, addr, 0, create_immed(b, off + i * 4), 0,
                 create_immed(b, 1), 0); /* num components */
      load->cat6.type = TYPE_U32;
      dst[i] = load;
   }
}

/* src[] = { block_index } */
static void
emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                         struct ir3_instruction **dst)
{
   if (ir3_bindless_resource(intr->src[0])) {
      struct ir3_block *b = ctx->block;
      struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
      struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
      resinfo->cat6.iim_val = 1;
      resinfo->cat6.d = 1;
      resinfo->cat6.type = TYPE_U32;
      resinfo->cat6.typed = false;
      /* resinfo has no writemask and always writes out 3 components */
      resinfo->dsts[0]->wrmask = MASK(3);
      ir3_handle_bindless_cat6(resinfo, intr->src[0]);
      struct ir3_instruction *resinfo_dst;
      ir3_split_dest(b, &resinfo_dst, resinfo, 0, 1);
      /* Unfortunately resinfo returns the array length, i.e. in dwords,
       * while NIR expects us to return the size in bytes.
       *
       * TODO: fix this in NIR.
       */
      *dst = ir3_SHL_B(b, resinfo_dst, 0, create_immed(b, 2), 0);
      return;
   }

   /* SSBO size stored as a const starting at ssbo_sizes: */
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   unsigned blk_idx = nir_src_as_uint(intr->src[0]);
   unsigned idx = regid(const_state->offsets.ssbo_sizes, 0) +
                  const_state->ssbo_size.off[blk_idx];

   debug_assert(const_state->ssbo_size.mask & (1 << blk_idx));

   dst[0] = create_uniform(ctx->block, idx);
}
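
/* Example added for clarity: because resinfo reports the array length in
 * dwords, the shl above rescales it; e.g. a 256-byte SSBO reports a length
 * of 64, and 64 << 2 = 256 is the byte size NIR expects back.
 */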

/* src[] = { offset }. const_index[] = { base } */
static void
emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                           struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *ldl, *offset;
   unsigned base;

   offset = ir3_get_src(ctx, &intr->src[0])[0];
   base = nir_intrinsic_base(intr);

   ldl = ir3_LDL(b, offset, 0, create_immed(b, base), 0,
                 create_immed(b, intr->num_components), 0);

   ldl->cat6.type = utype_dst(intr->dest);
   ldl->dsts[0]->wrmask = MASK(intr->num_components);

   ldl->barrier_class = IR3_BARRIER_SHARED_R;
   ldl->barrier_conflict = IR3_BARRIER_SHARED_W;

   ir3_split_dest(b, dst, ldl, 0, intr->num_components);
}

/* src[] = { value, offset }. const_index[] = { base, write_mask } */
static void
emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *stl, *offset;
   struct ir3_instruction *const *value;
   unsigned base, wrmask, ncomp;

   value = ir3_get_src(ctx, &intr->src[0]);
   offset = ir3_get_src(ctx, &intr->src[1])[0];

   base = nir_intrinsic_base(intr);
   wrmask = nir_intrinsic_write_mask(intr);
   ncomp = ffs(~wrmask) - 1;

   assert(wrmask == BITFIELD_MASK(intr->num_components));

   stl = ir3_STL(b, offset, 0, ir3_create_collect(ctx, value, ncomp), 0,
                 create_immed(b, ncomp), 0);
   stl->cat6.dst_offset = base;
   stl->cat6.type = utype_src(intr->src[0]);
   stl->barrier_class = IR3_BARRIER_SHARED_W;
   stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;

   array_insert(b, b->keeps, stl);
}

/* src[] = { offset }. const_index[] = { base } */
static void
emit_intrinsic_load_shared_ir3(struct ir3_context *ctx,
                               nir_intrinsic_instr *intr,
                               struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *load, *offset;
   unsigned base;

   offset = ir3_get_src(ctx, &intr->src[0])[0];
   base = nir_intrinsic_base(intr);

   load = ir3_LDLW(b, offset, 0, create_immed(b, base), 0,
                   create_immed(b, intr->num_components), 0);

   /* for a650, use LDL for tess ctrl inputs: */
   if (ctx->so->type == MESA_SHADER_TESS_CTRL && ctx->compiler->tess_use_shared)
      load->opc = OPC_LDL;

   load->cat6.type = utype_dst(intr->dest);
   load->dsts[0]->wrmask = MASK(intr->num_components);

   load->barrier_class = IR3_BARRIER_SHARED_R;
   load->barrier_conflict = IR3_BARRIER_SHARED_W;

   ir3_split_dest(b, dst, load, 0, intr->num_components);
}
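
/* Note added for clarity: the barrier_class/barrier_conflict pairs on the
 * ldl/stl and ldlw/stlw instructions above and below are scheduling
 * metadata, not hardware state: roughly, an instruction may not be
 * reordered past another whose barrier_class intersects its
 * barrier_conflict.  So shared-memory loads (class SHARED_R, conflict
 * SHARED_W) can move freely against each other, but never across a
 * shared-memory store.
 */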

/* src[] = { value, offset }. const_index[] = { base } */
static void
emit_intrinsic_store_shared_ir3(struct ir3_context *ctx,
                                nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *store, *offset;
   struct ir3_instruction *const *value;

   value = ir3_get_src(ctx, &intr->src[0]);
   offset = ir3_get_src(ctx, &intr->src[1])[0];

   store = ir3_STLW(b, offset, 0,
                    ir3_create_collect(ctx, value, intr->num_components), 0,
                    create_immed(b, intr->num_components), 0);

   /* for a650, use STL for vertex outputs used by tess ctrl shader: */
   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->so->key.tessellation &&
       ctx->compiler->tess_use_shared)
      store->opc = OPC_STL;

   store->cat6.dst_offset = nir_intrinsic_base(intr);
   store->cat6.type = utype_src(intr->src[0]);
   store->barrier_class = IR3_BARRIER_SHARED_W;
   store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;

   array_insert(b, b->keeps, store);
}
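
/* Example added for clarity: a NIR shared atomic such as
 *
 *    ssa_5 = shared_atomic_add(ssa_off, ssa_val)
 *
 * becomes a single cat6 atomic on local memory in the switch below.  The
 * instruction produces a result register even if the shader never reads it,
 * which is why it is pinned via b->keeps rather than left to DCE.
 */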

/*
 * CS shared variable atomic intrinsics
 *
 * All of the shared variable atomic memory operations read a value from
 * memory, compute a new value using one of the operations below, write the
 * new value to memory, and return the original value read.
 *
 * All operations take 2 sources except CompSwap that takes 3. These
 * sources represent:
 *
 * 0: The offset into the shared variable storage region that the atomic
 *    operation will operate on.
 * 1: The data parameter to the atomic function (i.e. the value to add
 *    in shared_atomic_add, etc).
 * 2: For CompSwap only: the second data parameter.
 */
static struct ir3_instruction *
emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *atomic, *src0, *src1;
   type_t type = TYPE_U32;

   src0 = ir3_get_src(ctx, &intr->src[0])[0]; /* offset */
   src1 = ir3_get_src(ctx, &intr->src[1])[0]; /* value */

   switch (intr->intrinsic) {
   case nir_intrinsic_shared_atomic_add:
      atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_imin:
      atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
      type = TYPE_S32;
      break;
   case nir_intrinsic_shared_atomic_umin:
      atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_imax:
      atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
      type = TYPE_S32;
      break;
   case nir_intrinsic_shared_atomic_umax:
      atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_and:
      atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_or:
      atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_xor:
      atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_exchange:
      atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_comp_swap:
      /* for cmpxchg, src1 is [ui]vec2(data, compare): */
      src1 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[2])[0], src1);
      atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
      break;
   default:
      unreachable("boo");
   }

   atomic->cat6.iim_val = 1;
   atomic->cat6.d = 1;
   atomic->cat6.type = type;
   atomic->barrier_class = IR3_BARRIER_SHARED_W;
   atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;

   /* even if nothing consumes the result, we can't DCE the instruction: */
   array_insert(b, b->keeps, atomic);

   return atomic;
}

/* src[] = { offset }. */
static void
emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                            struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *ldp, *offset;

   offset = ir3_get_src(ctx, &intr->src[0])[0];

   ldp = ir3_LDP(b, offset, 0, create_immed(b, 0), 0,
                 create_immed(b, intr->num_components), 0);

   ldp->cat6.type = utype_dst(intr->dest);
   ldp->dsts[0]->wrmask = MASK(intr->num_components);

   ldp->barrier_class = IR3_BARRIER_PRIVATE_R;
   ldp->barrier_conflict = IR3_BARRIER_PRIVATE_W;

   ir3_split_dest(b, dst, ldp, 0, intr->num_components);
}

/* src[] = { value, offset }. const_index[] = { write_mask } */
static void
emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *stp, *offset;
   struct ir3_instruction *const *value;
   unsigned wrmask, ncomp;

   value = ir3_get_src(ctx, &intr->src[0]);
   offset = ir3_get_src(ctx, &intr->src[1])[0];

   wrmask = nir_intrinsic_write_mask(intr);
   ncomp = ffs(~wrmask) - 1;

   assert(wrmask == BITFIELD_MASK(intr->num_components));

   stp = ir3_STP(b, offset, 0, ir3_create_collect(ctx, value, ncomp), 0,
                 create_immed(b, ncomp), 0);
   stp->cat6.dst_offset = 0;
   stp->cat6.type = utype_src(intr->src[0]);
   stp->barrier_class = IR3_BARRIER_PRIVATE_W;
   stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;

   array_insert(b, b->keeps, stp);
}
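
/* Note added for clarity: tex_src_info below captures the three ways a
 * (possibly bindless) texture/sampler can be referenced by a cat5 sam-style
 * instruction, selected in get_image_samp_tex_src():
 *
 *   - constant index < 16:  packed directly into the instruction
 *                           (base + combined_idx)
 *   - constant index < 256: routed through the a1.x register (A1EN)
 *   - otherwise:            a dynamic (tex, samp) vec2 source (S2EN)
 */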

struct tex_src_info {
   /* For prefetch */
   unsigned tex_base, samp_base, tex_idx, samp_idx;
   /* For normal tex instructions */
   unsigned base, combined_idx, a1_val, flags;
   struct ir3_instruction *samp_tex;
};

/* TODO handle actual indirect/dynamic case.. which is going to be weird
 * to handle with the image_mapping table..
 */
static struct tex_src_info
get_image_samp_tex_src(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct tex_src_info info = {0};
   nir_intrinsic_instr *bindless_tex = ir3_bindless_resource(intr->src[0]);
   ctx->so->bindless_tex = true;

   if (bindless_tex) {
      /* Bindless case */
      info.flags |= IR3_INSTR_B;

      /* Gather information required to determine which encoding to
       * choose as well as for prefetch.
       */
      info.tex_base = nir_intrinsic_desc_set(bindless_tex);
      bool tex_const = nir_src_is_const(bindless_tex->src[0]);
      if (tex_const)
         info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
      info.samp_idx = 0;

      /* Choose encoding. */
      if (tex_const && info.tex_idx < 256) {
         if (info.tex_idx < 16) {
            /* Everything fits within the instruction */
            info.base = info.tex_base;
            info.combined_idx = info.samp_idx | (info.tex_idx << 4);
         } else {
            info.base = info.tex_base;
            info.a1_val = info.tex_idx << 3;
            info.combined_idx = 0;
            info.flags |= IR3_INSTR_A1EN;
         }
         info.samp_tex = NULL;
      } else {
         info.flags |= IR3_INSTR_S2EN;
         info.base = info.tex_base;

         /* Note: the indirect source is now a vec2 instead of hvec2 */
         struct ir3_instruction *texture, *sampler;

         texture = ir3_get_src(ctx, &intr->src[0])[0];
         sampler = create_immed(b, 0);
         info.samp_tex = ir3_collect(ctx, texture, sampler);
      }
   } else {
      info.flags |= IR3_INSTR_S2EN;
      unsigned slot = nir_src_as_uint(intr->src[0]);
      unsigned tex_idx = ir3_image_to_tex(&ctx->so->image_mapping, slot);
      struct ir3_instruction *texture, *sampler;

      texture = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
      sampler = create_immed_typed(ctx->block, tex_idx, TYPE_U16);

      info.samp_tex = ir3_collect(ctx, sampler, texture);
   }

   return info;
}

static struct ir3_instruction *
emit_sam(struct ir3_context *ctx, opc_t opc, struct tex_src_info info,
         type_t type, unsigned wrmask, struct ir3_instruction *src0,
         struct ir3_instruction *src1)
{
   struct ir3_instruction *sam, *addr;
   if (info.flags & IR3_INSTR_A1EN) {
      addr = ir3_get_addr1(ctx, info.a1_val);
   }
   sam = ir3_SAM(ctx->block, opc, type, 0b1111, info.flags, info.samp_tex, src0,
                 src1);
   if (info.flags & IR3_INSTR_A1EN) {
      ir3_instr_set_address(sam, addr);
   }
   if (info.flags & IR3_INSTR_B) {
      sam->cat5.tex_base = info.base;
      sam->cat5.samp = info.combined_idx;
   }
   return sam;
}

/* src[] = { deref, coord, sample_index }. const_index[] = {} */
static void
emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                          struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct tex_src_info info = get_image_samp_tex_src(ctx, intr);
   struct ir3_instruction *sam;
   struct ir3_instruction *const *src0 = ir3_get_src(ctx, &intr->src[1]);
   struct ir3_instruction *coords[4];
   unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
   type_t type = ir3_get_type_for_image_intrinsic(intr);

   /* hmm, this seems a bit odd, but it is what blob does and (at least
    * a5xx) just faults on bogus addresses otherwise:
    */
   if (flags & IR3_INSTR_3D) {
      flags &= ~IR3_INSTR_3D;
      flags |= IR3_INSTR_A;
   }
   info.flags |= flags;

   for (unsigned i = 0; i < ncoords; i++)
      coords[i] = src0[i];

   if (ncoords == 1)
      coords[ncoords++] = create_immed(b, 0);

   sam = emit_sam(ctx, OPC_ISAM, info, type, 0b1111,
                  ir3_create_collect(ctx, coords, ncoords), NULL);

   ir3_handle_nonuniform(sam, intr);

   sam->barrier_class = IR3_BARRIER_IMAGE_R;
   sam->barrier_conflict = IR3_BARRIER_IMAGE_W;

   ir3_split_dest(b, dst, sam, 0, 4);
}

/* A4xx version of image_size, see ir3_a6xx.c for newer resinfo version. */
void
emit_intrinsic_image_size_tex(struct ir3_context *ctx,
                              nir_intrinsic_instr *intr,
                              struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct tex_src_info info = get_image_samp_tex_src(ctx, intr);
   struct ir3_instruction *sam, *lod;
   unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
   type_t dst_type = nir_dest_bit_size(intr->dest) == 16 ? TYPE_U16 : TYPE_U32;

   info.flags |= flags;
   assert(nir_src_as_uint(intr->src[1]) == 0);
   lod = create_immed(b, 0);
   sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);

   /* Array size actually ends up in .w rather than .z. This doesn't
    * matter for miplevel 0, but for higher mips the value in z is
    * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
    * returned, which means that we have to add 1 to it for arrays for
    * a3xx.
    *
    * Note use a temporary dst and then copy, since the size of the dst
    * array that is passed in is based on nir's understanding of the
    * result size, not the hardware's
    */
   struct ir3_instruction *tmp[4];

   ir3_split_dest(b, tmp, sam, 0, 4);

   for (unsigned i = 0; i < ncoords; i++)
      dst[i] = tmp[i];

   if (flags & IR3_INSTR_A) {
      if (ctx->compiler->levels_add_one) {
         dst[ncoords - 1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
      } else {
         dst[ncoords - 1] = ir3_MOV(b, tmp[3], TYPE_U32);
      }
   }
}

static void
emit_control_barrier(struct ir3_context *ctx)
{
   /* Hull shaders dispatch 32 wide so an entire patch will always
    * fit in a single warp and execute in lock-step. Consequently,
    * we don't need to do anything for TCS barriers. Emitting a
    * barrier instruction would deadlock.
    */
   if (ctx->so->type == MESA_SHADER_TESS_CTRL)
      return;

   struct ir3_block *b = ctx->block;
   struct ir3_instruction *barrier = ir3_BAR(b);
   barrier->cat7.g = true;
   if (ctx->compiler->gpu_id < 600)
      barrier->cat7.l = true;
   barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
   barrier->barrier_class = IR3_BARRIER_EVERYTHING;
   array_insert(b, b->keeps, barrier);
}
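
/* Sketch added for clarity: a GLSL barrier() in a compute shader reaches
 * emit_intrinsic_barrier() below as a scoped_barrier (or as separate
 * control/memory barrier intrinsics) and, in effect, expands to something
 * like:
 *
 *    fence.g.l.r.w   ; order the relevant memory scopes
 *    bar.g           ; wait for all invocations in the workgroup
 *
 * with the fence's cat7 bits chosen from the nir_var_mem_* modes involved.
 * (Mnemonic spelling is approximate; see the disassembler for exact form.)
 */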

static void
emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *barrier;

   /* TODO: find out why there is a major difference of .l usage
    * between a5xx and a6xx,
    */

   switch (intr->intrinsic) {
   case nir_intrinsic_control_barrier:
      emit_control_barrier(ctx);
      return;
   case nir_intrinsic_scoped_barrier: {
      nir_scope exec_scope = nir_intrinsic_execution_scope(intr);
      nir_variable_mode modes = nir_intrinsic_memory_modes(intr);

      if (ctx->so->type == MESA_SHADER_TESS_CTRL) {
         /* Remove mode corresponding to nir_intrinsic_memory_barrier_tcs_patch,
          * because hull shaders dispatch 32 wide so an entire patch will
          * always fit in a single warp and execute in lock-step.
          *
          * TODO: memory barrier also tells us not to reorder stores, this
          * information is lost here (backend doesn't reorder stores so we
          * are safe for now).
          */
         modes &= ~nir_var_shader_out;
      }

      assert(!(modes & nir_var_shader_out));

      if ((modes &
           (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_mem_global))) {
         barrier = ir3_FENCE(b);
         barrier->cat7.r = true;
         barrier->cat7.w = true;

         if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
            barrier->cat7.g = true;
         }

         if (ctx->compiler->gpu_id > 600) {
            if (modes & nir_var_mem_ssbo) {
               barrier->cat7.l = true;
            }
         } else {
            if (modes & (nir_var_mem_shared | nir_var_mem_ssbo)) {
               barrier->cat7.l = true;
            }
         }

         barrier->barrier_class = 0;
         barrier->barrier_conflict = 0;

         if (modes & nir_var_mem_shared) {
            barrier->barrier_class |= IR3_BARRIER_SHARED_W;
            barrier->barrier_conflict |=
               IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
         }

         if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
            barrier->barrier_class |= IR3_BARRIER_BUFFER_W;
            barrier->barrier_conflict |=
               IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
         }

         /* TODO: check for image mode when it has a separate one */
         if (modes & nir_var_mem_ssbo) {
            barrier->barrier_class |= IR3_BARRIER_IMAGE_W;
            barrier->barrier_conflict |=
               IR3_BARRIER_IMAGE_W | IR3_BARRIER_IMAGE_R;
         }
         array_insert(b, b->keeps, barrier);
      }

      if (exec_scope >= NIR_SCOPE_WORKGROUP) {
         emit_control_barrier(ctx);
      }

      return;
   }
   case nir_intrinsic_memory_barrier_tcs_patch:
      /* Not applicable, see explanation for scoped_barrier + shader_out */
      return;
   case nir_intrinsic_memory_barrier_buffer:
      barrier = ir3_FENCE(b);
      barrier->cat7.g = true;
      if (ctx->compiler->gpu_id > 600)
         barrier->cat7.l = true;
      barrier->cat7.r = true;
      barrier->cat7.w = true;
      barrier->barrier_class = IR3_BARRIER_BUFFER_W;
      barrier->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
      break;
   case nir_intrinsic_memory_barrier_image:
      barrier = ir3_FENCE(b);
      barrier->cat7.g = true;
      barrier->cat7.l = true;
      barrier->cat7.r = true;
      barrier->cat7.w = true;
      barrier->barrier_class = IR3_BARRIER_IMAGE_W;
      barrier->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
      break;
   case nir_intrinsic_memory_barrier_shared:
      barrier = ir3_FENCE(b);
      if (ctx->compiler->gpu_id < 600)
         barrier->cat7.l = true;
      barrier->cat7.r = true;
      barrier->cat7.w = true;
      barrier->barrier_class = IR3_BARRIER_SHARED_W;
      barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
      break;
   case nir_intrinsic_memory_barrier:
   case nir_intrinsic_group_memory_barrier:
      barrier = ir3_FENCE(b);
      barrier->cat7.g = true;
      barrier->cat7.l = true;
      barrier->cat7.r = true;
      barrier->cat7.w = true;
      barrier->barrier_class =
         IR3_BARRIER_SHARED_W | IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
      barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
                                  IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
                                  IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
      break;
   default:
      unreachable("boo");
   }

   /* make sure barrier doesn't get DCE'd */
   array_insert(b, b->keeps, barrier);
}

static void
add_sysval_input_compmask(struct ir3_context *ctx, gl_system_value slot,
                          unsigned compmask, struct ir3_instruction *instr)
{
   struct ir3_shader_variant *so = ctx->so;
   unsigned n = so->inputs_count++;

   assert(instr->opc == OPC_META_INPUT);
   instr->input.inidx = n;
   instr->input.sysval = slot;

   so->inputs[n].sysval = true;
   so->inputs[n].slot = slot;
   so->inputs[n].compmask = compmask;
   so->total_in++;

   so->sysval_in += util_last_bit(compmask);
}

static struct ir3_instruction *
create_sysval_input(struct ir3_context *ctx, gl_system_value slot,
                    unsigned compmask)
{
   assert(compmask);
   struct ir3_instruction *sysval = create_input(ctx, compmask);
   add_sysval_input_compmask(ctx, slot, compmask, sysval);
   return sysval;
}
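
/* Note added for clarity: get_barycentric() below exploits the fact that the
 * ir3_bary enum is laid out in the same order as the corresponding
 * SYSTEM_VALUE_BARYCENTRIC_* values (the STATIC_ASSERTs check exactly this),
 * so "sysval_base + bary" converts between the two enums, and each (i, j)
 * pair is fetched once and cached in ctx->ij[].
 */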

static struct ir3_instruction *
get_barycentric(struct ir3_context *ctx, enum ir3_bary bary)
{
   static const gl_system_value sysval_base =
      SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;

   STATIC_ASSERT(sysval_base + IJ_PERSP_PIXEL ==
                 SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL);
   STATIC_ASSERT(sysval_base + IJ_PERSP_SAMPLE ==
                 SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE);
   STATIC_ASSERT(sysval_base + IJ_PERSP_CENTROID ==
                 SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID);
   STATIC_ASSERT(sysval_base + IJ_PERSP_SIZE ==
                 SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE);
   STATIC_ASSERT(sysval_base + IJ_LINEAR_PIXEL ==
                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL);
   STATIC_ASSERT(sysval_base + IJ_LINEAR_CENTROID ==
                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID);
   STATIC_ASSERT(sysval_base + IJ_LINEAR_SAMPLE ==
                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE);

   if (!ctx->ij[bary]) {
      struct ir3_instruction *xy[2];
      struct ir3_instruction *ij;

      ij = create_sysval_input(ctx, sysval_base + bary, 0x3);
      ir3_split_dest(ctx->block, xy, ij, 0, 2);

      ctx->ij[bary] = ir3_create_collect(ctx, xy, 2);
   }

   return ctx->ij[bary];
}

/* TODO: make this a common NIR helper?
 * there is a nir_system_value_from_intrinsic but it takes nir_intrinsic_op so
 * it can't be extended to work with this
 */
static gl_system_value
nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr)
{
   enum glsl_interp_mode interp_mode = nir_intrinsic_interp_mode(intr);
   gl_system_value sysval;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_barycentric_pixel:
      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
      else
         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
      break;
   case nir_intrinsic_load_barycentric_centroid:
      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID;
      else
         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID;
      break;
   case nir_intrinsic_load_barycentric_sample:
      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE;
      else
         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE;
      break;
   default:
      unreachable("invalid barycentric intrinsic");
   }

   return sysval;
}

static void
emit_intrinsic_barycentric(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                           struct ir3_instruction **dst)
{
   gl_system_value sysval = nir_intrinsic_barycentric_sysval(intr);

   if (!ctx->so->key.msaa) {
      switch (sysval) {
      case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
         break;
      case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
         if (ctx->compiler->gpu_id < 600)
            sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
         break;
      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
         break;
      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
         if (ctx->compiler->gpu_id < 600)
            sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
         break;
      default:
         break;
      }
   }

   enum ir3_bary bary = sysval - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;

   struct ir3_instruction *ij = get_barycentric(ctx, bary);
   ir3_split_dest(ctx->block, dst, ij, 0, 2);
}
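
/* Example added for clarity: the hardware hands get_frag_coord() below the
 * window position in unsigned 1/16-pixel fixed point, so a raw x of 152
 * becomes 152 * (1.0 / 16.0) = 9.5 after the u32->f32 cov and mul.f, i.e.
 * the usual pixel-center coordinate.
 */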

static struct ir3_instruction *
get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   if (!ctx->frag_coord) {
      struct ir3_block *b = ctx->in_block;
      struct ir3_instruction *xyzw[4];
      struct ir3_instruction *hw_frag_coord;

      hw_frag_coord = create_sysval_input(ctx, SYSTEM_VALUE_FRAG_COORD, 0xf);
      ir3_split_dest(b, xyzw, hw_frag_coord, 0, 4);

      /* for frag_coord.xy, we get unsigned values.. we need
       * to subtract (integer) 8 and divide by 16 (right-
       * shift by 4) then convert to float:
       *
       *    sub.s tmp, src, 8
       *    shr.b tmp, tmp, 4
       *    mov.u32f32 dst, tmp
       *
       */
      for (int i = 0; i < 2; i++) {
         xyzw[i] = ir3_COV(b, xyzw[i], TYPE_U32, TYPE_F32);
         xyzw[i] =
            ir3_MUL_F(b, xyzw[i], 0, create_immed(b, fui(1.0 / 16.0)), 0);
      }

      ctx->frag_coord = ir3_create_collect(ctx, xyzw, 4);
   }

   ctx->so->fragcoord_compmask |= nir_ssa_def_components_read(&intr->dest.ssa);

   return ctx->frag_coord;
}

static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
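
/* Note added for clarity: emit_intrinsic() below is the big dispatch for
 * everything that is not plain ALU/tex: sysvals and driver params come from
 * the const file or dedicated inputs, memory intrinsics go to the cat6
 * helpers above (or to per-generation hooks in ctx->funcs), and the
 * discard/demote variants are built from a cmps.s.ne writing the p0.x
 * predicate register followed by kill/demote.
 */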

static void
emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
   struct ir3_instruction **dst;
   struct ir3_instruction *const *src;
   struct ir3_block *b = ctx->block;
   unsigned dest_components = nir_intrinsic_dest_components(intr);
   int idx;

   if (info->has_dest) {
      dst = ir3_get_dst(ctx, &intr->dest, dest_components);
   } else {
      dst = NULL;
   }

   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   const unsigned primitive_param = const_state->offsets.primitive_param * 4;
   const unsigned primitive_map = const_state->offsets.primitive_map * 4;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_uniform:
      idx = nir_intrinsic_base(intr);
      if (nir_src_is_const(intr->src[0])) {
         idx += nir_src_as_uint(intr->src[0]);
         for (int i = 0; i < dest_components; i++) {
            dst[i] = create_uniform_typed(
               b, idx + i,
               nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32);
         }
      } else {
         src = ir3_get_src(ctx, &intr->src[0]);
         for (int i = 0; i < dest_components; i++) {
            dst[i] = create_uniform_indirect(
               b, idx + i,
               nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32,
               ir3_get_addr0(ctx, src[0], 1));
         }
         /* NOTE: if relative addressing is used, we set
          * constlen in the compiler (to worst-case value)
          * since we don't know in the assembler what the max
          * addr reg value can be:
          */
         ctx->so->constlen =
            MAX2(ctx->so->constlen, const_state->ubo_state.size / 16);
      }
      break;

   case nir_intrinsic_load_vs_primitive_stride_ir3:
      dst[0] = create_uniform(b, primitive_param + 0);
      break;
   case nir_intrinsic_load_vs_vertex_stride_ir3:
      dst[0] = create_uniform(b, primitive_param + 1);
      break;
   case nir_intrinsic_load_hs_patch_stride_ir3:
      dst[0] = create_uniform(b, primitive_param + 2);
      break;
   case nir_intrinsic_load_patch_vertices_in:
      dst[0] = create_uniform(b, primitive_param + 3);
      break;
   case nir_intrinsic_load_tess_param_base_ir3:
      dst[0] = create_uniform(b, primitive_param + 4);
      dst[1] = create_uniform(b, primitive_param + 5);
      break;
   case nir_intrinsic_load_tess_factor_base_ir3:
      dst[0] = create_uniform(b, primitive_param + 6);
      dst[1] = create_uniform(b, primitive_param + 7);
      break;

   case nir_intrinsic_load_primitive_location_ir3:
      idx = nir_intrinsic_driver_location(intr);
      dst[0] = create_uniform(b, primitive_map + idx);
      break;

   case nir_intrinsic_load_gs_header_ir3:
      dst[0] = ctx->gs_header;
      break;
   case nir_intrinsic_load_tcs_header_ir3:
      dst[0] = ctx->tcs_header;
      break;

   case nir_intrinsic_load_primitive_id:
      dst[0] = ctx->primitive_id;
      break;

   case nir_intrinsic_load_tess_coord:
      if (!ctx->tess_coord) {
         ctx->tess_coord =
            create_sysval_input(ctx, SYSTEM_VALUE_TESS_COORD, 0x3);
      }
      ir3_split_dest(b, dst, ctx->tess_coord, 0, 2);

      /* Unused, but ir3_put_dst() below wants to free something */
      dst[2] = create_immed(b, 0);
      break;

   case nir_intrinsic_end_patch_ir3:
      assert(ctx->so->type == MESA_SHADER_TESS_CTRL);
      struct ir3_instruction *end = ir3_PREDE(b);
      array_insert(b, b->keeps, end);

      end->barrier_class = IR3_BARRIER_EVERYTHING;
      end->barrier_conflict = IR3_BARRIER_EVERYTHING;
      break;

   case nir_intrinsic_store_global_ir3:
      ctx->funcs->emit_intrinsic_store_global_ir3(ctx, intr);
      break;
   case nir_intrinsic_load_global_ir3:
      ctx->funcs->emit_intrinsic_load_global_ir3(ctx, intr, dst);
      break;

   case nir_intrinsic_load_ubo:
      emit_intrinsic_load_ubo(ctx, intr, dst);
      break;
   case nir_intrinsic_load_ubo_vec4:
      emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
      break;
   case nir_intrinsic_load_frag_coord:
      ir3_split_dest(b, dst, get_frag_coord(ctx, intr), 0, 4);
      break;
   case nir_intrinsic_load_sample_pos_from_id: {
      /* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32,
       * but that doesn't seem necessary.
       */
      struct ir3_instruction *offset =
         ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0);
      offset->dsts[0]->wrmask = 0x3;
      offset->cat5.type = TYPE_F32;

      ir3_split_dest(b, dst, offset, 0, 2);

      break;
   }
   case nir_intrinsic_load_size_ir3:
      if (!ctx->ij[IJ_PERSP_SIZE]) {
         ctx->ij[IJ_PERSP_SIZE] =
            create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE, 0x1);
      }
      dst[0] = ctx->ij[IJ_PERSP_SIZE];
      break;
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_pixel:
      emit_intrinsic_barycentric(ctx, intr, dst);
      break;
   case nir_intrinsic_load_interpolated_input:
   case nir_intrinsic_load_input:
      setup_input(ctx, intr);
      break;
   /* All SSBO intrinsics should have been lowered by 'lower_io_offsets'
    * pass and replaced by an ir3-specific version that adds the
    * dword-offset in the last source.
    */
   case nir_intrinsic_load_ssbo_ir3:
      ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
      break;
   case nir_intrinsic_store_ssbo_ir3:
      if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
          !ctx->s->info.fs.early_fragment_tests)
         ctx->so->no_earlyz = true;
      ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
      break;
   case nir_intrinsic_get_ssbo_size:
      emit_intrinsic_ssbo_size(ctx, intr, dst);
      break;
   case nir_intrinsic_ssbo_atomic_add_ir3:
   case nir_intrinsic_ssbo_atomic_imin_ir3:
   case nir_intrinsic_ssbo_atomic_umin_ir3:
   case nir_intrinsic_ssbo_atomic_imax_ir3:
   case nir_intrinsic_ssbo_atomic_umax_ir3:
   case nir_intrinsic_ssbo_atomic_and_ir3:
   case nir_intrinsic_ssbo_atomic_or_ir3:
   case nir_intrinsic_ssbo_atomic_xor_ir3:
   case nir_intrinsic_ssbo_atomic_exchange_ir3:
   case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
      if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
          !ctx->s->info.fs.early_fragment_tests)
         ctx->so->no_earlyz = true;
      dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
      break;
   case nir_intrinsic_load_shared:
      emit_intrinsic_load_shared(ctx, intr, dst);
      break;
   case nir_intrinsic_store_shared:
      emit_intrinsic_store_shared(ctx, intr);
      break;
   case nir_intrinsic_shared_atomic_add:
   case nir_intrinsic_shared_atomic_imin:
   case nir_intrinsic_shared_atomic_umin:
   case nir_intrinsic_shared_atomic_imax:
   case nir_intrinsic_shared_atomic_umax:
   case nir_intrinsic_shared_atomic_and:
   case nir_intrinsic_shared_atomic_or:
   case nir_intrinsic_shared_atomic_xor:
   case nir_intrinsic_shared_atomic_exchange:
   case nir_intrinsic_shared_atomic_comp_swap:
      dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
      break;
   case nir_intrinsic_load_scratch:
      emit_intrinsic_load_scratch(ctx, intr, dst);
      break;
   case nir_intrinsic_store_scratch:
      emit_intrinsic_store_scratch(ctx, intr);
      break;
   case nir_intrinsic_image_load:
      emit_intrinsic_load_image(ctx, intr, dst);
      break;
   case nir_intrinsic_bindless_image_load:
      /* Bindless uses the IBO state, which doesn't have swizzle filled out,
       * so using isam doesn't work.
       *
       * TODO: can we use isam if we fill out more fields?
       */
      ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
      break;
   case nir_intrinsic_image_store:
   case nir_intrinsic_bindless_image_store:
      if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
          !ctx->s->info.fs.early_fragment_tests)
         ctx->so->no_earlyz = true;
      ctx->funcs->emit_intrinsic_store_image(ctx, intr);
      break;
   case nir_intrinsic_image_size:
   case nir_intrinsic_bindless_image_size:
      ctx->funcs->emit_intrinsic_image_size(ctx, intr, dst);
      break;
   case nir_intrinsic_image_atomic_add:
   case nir_intrinsic_bindless_image_atomic_add:
   case nir_intrinsic_image_atomic_imin:
   case nir_intrinsic_bindless_image_atomic_imin:
   case nir_intrinsic_image_atomic_umin:
   case nir_intrinsic_bindless_image_atomic_umin:
   case nir_intrinsic_image_atomic_imax:
   case nir_intrinsic_bindless_image_atomic_imax:
   case nir_intrinsic_image_atomic_umax:
   case nir_intrinsic_bindless_image_atomic_umax:
   case nir_intrinsic_image_atomic_and:
   case nir_intrinsic_bindless_image_atomic_and:
   case nir_intrinsic_image_atomic_or:
   case nir_intrinsic_bindless_image_atomic_or:
   case nir_intrinsic_image_atomic_xor:
   case nir_intrinsic_bindless_image_atomic_xor:
   case nir_intrinsic_image_atomic_exchange:
   case nir_intrinsic_bindless_image_atomic_exchange:
   case nir_intrinsic_image_atomic_comp_swap:
   case nir_intrinsic_bindless_image_atomic_comp_swap:
      if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
          !ctx->s->info.fs.early_fragment_tests)
         ctx->so->no_earlyz = true;
      dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
      break;
   case nir_intrinsic_scoped_barrier:
   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier:
   case nir_intrinsic_group_memory_barrier:
   case nir_intrinsic_memory_barrier_buffer:
   case nir_intrinsic_memory_barrier_image:
   case nir_intrinsic_memory_barrier_shared:
   case nir_intrinsic_memory_barrier_tcs_patch:
      emit_intrinsic_barrier(ctx, intr);
      /* note that blk ptr no longer valid, make that obvious: */
      b = NULL;
      break;
   case nir_intrinsic_store_output:
      setup_output(ctx, intr);
      break;
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_first_vertex:
      if (!ctx->basevertex) {
         ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
      }
      dst[0] = ctx->basevertex;
      break;
   case nir_intrinsic_load_draw_id:
      if (!ctx->draw_id) {
         ctx->draw_id = create_driver_param(ctx, IR3_DP_DRAWID);
      }
      dst[0] = ctx->draw_id;
      break;
   case nir_intrinsic_load_base_instance:
      if (!ctx->base_instance) {
         ctx->base_instance = create_driver_param(ctx, IR3_DP_INSTID_BASE);
      }
      dst[0] = ctx->base_instance;
      break;
   case nir_intrinsic_load_view_index:
      if (!ctx->view_index) {
         ctx->view_index =
            create_sysval_input(ctx, SYSTEM_VALUE_VIEW_INDEX, 0x1);
      }
      dst[0] = ctx->view_index;
      break;
   case nir_intrinsic_load_vertex_id_zero_base:
   case nir_intrinsic_load_vertex_id:
      if (!ctx->vertex_id) {
         gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id)
                                 ? SYSTEM_VALUE_VERTEX_ID
                                 : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
         ctx->vertex_id = create_sysval_input(ctx, sv, 0x1);
      }
      dst[0] = ctx->vertex_id;
      break;
   case nir_intrinsic_load_instance_id:
      if (!ctx->instance_id) {
         ctx->instance_id =
            create_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, 0x1);
      }
      dst[0] = ctx->instance_id;
      break;
   case nir_intrinsic_load_sample_id:
      ctx->so->per_samp = true;
      FALLTHROUGH;
   case nir_intrinsic_load_sample_id_no_per_sample:
      if (!ctx->samp_id) {
         ctx->samp_id = create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, 0x1);
         ctx->samp_id->dsts[0]->flags |= IR3_REG_HALF;
      }
      dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
      break;
   case nir_intrinsic_load_sample_mask_in:
      if (!ctx->samp_mask_in) {
         ctx->samp_mask_in =
            create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1);
      }
      dst[0] = ctx->samp_mask_in;
      break;
   case nir_intrinsic_load_user_clip_plane:
      idx = nir_intrinsic_ucp_id(intr);
      for (int i = 0; i < dest_components; i++) {
         unsigned n = idx * 4 + i;
         dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
      }
      break;
   case nir_intrinsic_load_front_face:
      if (!ctx->frag_face) {
         ctx->so->frag_face = true;
         ctx->frag_face =
            create_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, 0x1);
         ctx->frag_face->dsts[0]->flags |= IR3_REG_HALF;
      }
      /* for fragface, we get -1 for back and 0 for front.  However this is
       * the inverse of what nir expects (where ~0 is true).
       */
      dst[0] = ir3_CMPS_S(b, ctx->frag_face, 0,
                          create_immed_typed(b, 0, TYPE_U16), 0);
      dst[0]->cat2.condition = IR3_COND_EQ;
      break;
   case nir_intrinsic_load_local_invocation_id:
      if (!ctx->local_invocation_id) {
         ctx->local_invocation_id =
            create_sysval_input(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, 0x7);
      }
      ir3_split_dest(b, dst, ctx->local_invocation_id, 0, 3);
      break;
   case nir_intrinsic_load_workgroup_id:
   case nir_intrinsic_load_workgroup_id_zero_base:
      if (!ctx->work_group_id) {
         ctx->work_group_id =
            create_sysval_input(ctx, SYSTEM_VALUE_WORKGROUP_ID, 0x7);
         ctx->work_group_id->dsts[0]->flags |= IR3_REG_SHARED;
      }
      ir3_split_dest(b, dst, ctx->work_group_id, 0, 3);
      break;
   case nir_intrinsic_load_base_workgroup_id:
      for (int i = 0; i < dest_components; i++) {
         dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i);
      }
      break;
   case nir_intrinsic_load_num_workgroups:
      for (int i = 0; i < dest_components; i++) {
         dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
      }
      break;
   case nir_intrinsic_load_workgroup_size:
      for (int i = 0; i < dest_components; i++) {
         dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
      }
      break;
   case nir_intrinsic_load_subgroup_size:
      dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_SIZE);
      break;
   case nir_intrinsic_load_subgroup_id_shift_ir3:
      dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
      break;
   case nir_intrinsic_discard_if:
   case nir_intrinsic_discard:
   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if: {
      struct ir3_instruction *cond, *kill;

      if (intr->intrinsic == nir_intrinsic_discard_if ||
          intr->intrinsic == nir_intrinsic_demote_if ||
          intr->intrinsic == nir_intrinsic_terminate_if) {
         /* conditional discard: */
         src = ir3_get_src(ctx, &intr->src[0]);
         cond = src[0];
      } else {
         /* unconditional discard: */
         cond = create_immed(b, 1);
      }

      /* NOTE: only cmps.*.* can write p0.x: */
      cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
      cond->cat2.condition = IR3_COND_NE;

      /* condition always goes in predicate register: */
      cond->dsts[0]->num = regid(REG_P0, 0);
      cond->dsts[0]->flags &= ~IR3_REG_SSA;

      if (intr->intrinsic == nir_intrinsic_demote ||
          intr->intrinsic == nir_intrinsic_demote_if) {
         kill = ir3_DEMOTE(b, cond, 0);
      } else {
         kill = ir3_KILL(b, cond, 0);
      }

      /* Side-effects should not be moved to a different side of the kill */
      kill->barrier_class = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
      kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
      kill->srcs[0]->num = regid(REG_P0, 0);
      array_insert(ctx->ir, ctx->ir->predicates, kill);

      array_insert(b, b->keeps, kill);
      ctx->so->has_kill = true;

      break;
   }

   case nir_intrinsic_cond_end_ir3: {
      struct ir3_instruction *cond, *kill;

      src = ir3_get_src(ctx, &intr->src[0]);
      cond = src[0];

      /* NOTE: only cmps.*.* can write p0.x: */
      cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
      cond->cat2.condition = IR3_COND_NE;

      /* condition always goes in predicate register: */
      cond->dsts[0]->num = regid(REG_P0, 0);

      kill = ir3_PREDT(b, cond, 0);

      kill->barrier_class = IR3_BARRIER_EVERYTHING;
      kill->barrier_conflict = IR3_BARRIER_EVERYTHING;
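      /* Note: the EVERYTHING barrier class/conflict pair above is
       * deliberately conservative; it keeps the scheduler from moving
       * any instruction across the predt, since cond_end_ir3 delimits a
       * predicated region.
       */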

      array_insert(ctx->ir, ctx->ir->predicates, kill);
      array_insert(b, b->keeps, kill);
      break;
   }

   case nir_intrinsic_vote_any:
   case nir_intrinsic_vote_all: {
      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
      struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
      if (intr->intrinsic == nir_intrinsic_vote_any)
         dst[0] = ir3_ANY_MACRO(ctx->block, pred, 0);
      else
         dst[0] = ir3_ALL_MACRO(ctx->block, pred, 0);
      dst[0]->srcs[0]->num = regid(REG_P0, 0);
      array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
      break;
   }
   case nir_intrinsic_elect:
      dst[0] = ir3_ELECT_MACRO(ctx->block);
      /* This may expand to a divergent if/then, so allocate stack space for
       * it.
       */
      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
      break;

   case nir_intrinsic_read_invocation_cond_ir3: {
      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
      struct ir3_instruction *cond = ir3_get_src(ctx, &intr->src[1])[0];
      dst[0] = ir3_READ_COND_MACRO(ctx->block, ir3_get_predicate(ctx, cond), 0,
                                   src, 0);
      dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
      dst[0]->srcs[0]->num = regid(REG_P0, 0);
      array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
      break;
   }

   case nir_intrinsic_read_first_invocation: {
      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
      dst[0] = ir3_READ_FIRST_MACRO(ctx->block, src, 0);
      dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
      break;
   }

   case nir_intrinsic_ballot: {
      struct ir3_instruction *ballot;
      unsigned components = intr->dest.ssa.num_components;
      if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) {
         /* ballot(true) is just MOVMSK */
         ballot = ir3_MOVMSK(ctx->block, components);
      } else {
         struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
         struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
         ballot = ir3_BALLOT_MACRO(ctx->block, pred, components);
         ballot->srcs[0]->num = regid(REG_P0, 0);
         array_insert(ctx->ir, ctx->ir->predicates, ballot);
         ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
      }
      ir3_split_dest(ctx->block, dst, ballot, 0, components);
      break;
   }

   case nir_intrinsic_load_shared_ir3:
      emit_intrinsic_load_shared_ir3(ctx, intr, dst);
      break;
   case nir_intrinsic_store_shared_ir3:
      emit_intrinsic_store_shared_ir3(ctx, intr);
      break;
   case nir_intrinsic_bindless_resource_ir3:
      dst[0] = ir3_get_src(ctx, &intr->src[0])[0];
      break;
   default:
      ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
                        nir_intrinsic_infos[intr->intrinsic].name);
      break;
   }

   if (info->has_dest)
      ir3_put_dst(ctx, &intr->dest);
}

static void
emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
{
   struct ir3_instruction **dst =
      ir3_get_dst_ssa(ctx, &instr->def, instr->def.num_components);

   if (instr->def.bit_size == 16) {
      for (int i = 0; i < instr->def.num_components; i++)
         dst[i] = create_immed_typed(ctx->block, instr->value[i].u16, TYPE_U16);
   } else {
      for (int i = 0; i < instr->def.num_components; i++)
         dst[i] = create_immed_typed(ctx->block, instr->value[i].u32, TYPE_U32);
   }
}
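
/* For example (illustrative values), a NIR vec4 load_const of
 * (1.0, 0.0, 0.0, 1.0) becomes four immediate movs via the loops above,
 * one per scalar component, with the bit size selecting between 16-bit
 * and 32-bit immediates.
 */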

static void
emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef)
{
   struct ir3_instruction **dst =
      ir3_get_dst_ssa(ctx, &undef->def, undef->def.num_components);
   type_t type = (undef->def.bit_size == 16) ? TYPE_U16 : TYPE_U32;

   /* backend doesn't want undefined instructions, so just plug
    * in 0.0..
    */
   for (int i = 0; i < undef->def.num_components; i++)
      dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
}

/*
 * texture fetch/sample instructions:
 */

static type_t
get_tex_dest_type(nir_tex_instr *tex)
{
   switch (tex->dest_type) {
   case nir_type_float32:
      return TYPE_F32;
   case nir_type_float16:
      return TYPE_F16;
   case nir_type_int32:
      return TYPE_S32;
   case nir_type_int16:
      return TYPE_S16;
   case nir_type_bool32:
   case nir_type_uint32:
      return TYPE_U32;
   case nir_type_bool16:
   case nir_type_uint16:
      return TYPE_U16;
   case nir_type_invalid:
   default:
      unreachable("bad dest_type");
   }
}

static void
tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
{
   unsigned coords =
      glsl_get_sampler_dim_coordinate_components(tex->sampler_dim);
   unsigned flags = 0;

   /* note: would use tex->coord_components.. except txs.. also,
    * since array index goes after shadow ref, we don't want to
    * count it:
    */
   if (coords == 3)
      flags |= IR3_INSTR_3D;

   if (tex->is_shadow && tex->op != nir_texop_lod)
      flags |= IR3_INSTR_S;

   if (tex->is_array && tex->op != nir_texop_lod)
      flags |= IR3_INSTR_A;

   *flagsp = flags;
   *coordsp = coords;
}

/* Gets the sampler/texture idx as a hvec2.  It can be either dynamic
 * or immediate (in which case it will get lowered later to a non-.s2en
 * version of the tex instruction which encodes tex/samp as immediates):
 */
static struct tex_src_info
get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
{
   struct ir3_block *b = ctx->block;
   struct tex_src_info info = {0};
   int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
   int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
   struct ir3_instruction *texture, *sampler;

   if (texture_idx >= 0 || sampler_idx >= 0) {
      /* Bindless case */
      info.flags |= IR3_INSTR_B;

      if (tex->texture_non_uniform || tex->sampler_non_uniform)
         info.flags |= IR3_INSTR_NONUNIF;

      /* Gather information required to determine which encoding to
       * choose as well as for prefetch.
       */
      nir_intrinsic_instr *bindless_tex = NULL;
      bool tex_const;
      if (texture_idx >= 0) {
         ctx->so->bindless_tex = true;
         bindless_tex = ir3_bindless_resource(tex->src[texture_idx].src);
         assert(bindless_tex);
         info.tex_base = nir_intrinsic_desc_set(bindless_tex);
         tex_const = nir_src_is_const(bindless_tex->src[0]);
         if (tex_const)
            info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
      } else {
         /* To simplify some of the logic below, assume the index is
          * constant 0 when it's not enabled.
          */
         tex_const = true;
         info.tex_idx = 0;
      }
      nir_intrinsic_instr *bindless_samp = NULL;
      bool samp_const;
      if (sampler_idx >= 0) {
         ctx->so->bindless_samp = true;
         bindless_samp = ir3_bindless_resource(tex->src[sampler_idx].src);
         assert(bindless_samp);
         info.samp_base = nir_intrinsic_desc_set(bindless_samp);
         samp_const = nir_src_is_const(bindless_samp->src[0]);
         if (samp_const)
            info.samp_idx = nir_src_as_uint(bindless_samp->src[0]);
      } else {
         samp_const = true;
         info.samp_idx = 0;
      }

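      /* A sketch of the encoding math below (index values assumed for
       * illustration): with tex_idx = 5 and samp_idx = 2, both < 16 and
       * sharing a base, everything fits in the instruction:
       *
       *    combined_idx = 2 | (5 << 4) = 0x52
       *
       * With tex_idx = 20 the texture index no longer fits in 4 bits, so
       * a1.x is pressed into service: a1_val = (20 << 3) | samp_base,
       * combined_idx = samp_idx, and IR3_INSTR_A1EN is set.
       */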
      /* Choose encoding. */
      if (tex_const && samp_const && info.tex_idx < 256 &&
          info.samp_idx < 256) {
         if (info.tex_idx < 16 && info.samp_idx < 16 &&
             (!bindless_tex || !bindless_samp ||
              info.tex_base == info.samp_base)) {
            /* Everything fits within the instruction */
            info.base = info.tex_base;
            info.combined_idx = info.samp_idx | (info.tex_idx << 4);
         } else {
            info.base = info.tex_base;
            info.a1_val = info.tex_idx << 3 | info.samp_base;
            info.combined_idx = info.samp_idx;
            info.flags |= IR3_INSTR_A1EN;
         }
         info.samp_tex = NULL;
      } else {
         info.flags |= IR3_INSTR_S2EN;
         /* In the indirect case, we only use a1.x to store the sampler
          * base if it differs from the texture base.
          */
         if (!bindless_tex || !bindless_samp ||
             info.tex_base == info.samp_base) {
            info.base = info.tex_base;
         } else {
            info.base = info.tex_base;
            info.a1_val = info.samp_base;
            info.flags |= IR3_INSTR_A1EN;
         }

         /* Note: the indirect source is now a vec2 instead of hvec2, and
          * for some reason the texture and sampler are swapped.
          */
         struct ir3_instruction *texture, *sampler;

         if (bindless_tex) {
            texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
         } else {
            texture = create_immed(b, 0);
         }

         if (bindless_samp) {
            sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
         } else {
            sampler = create_immed(b, 0);
         }
         info.samp_tex = ir3_collect(ctx, texture, sampler);
      }
   } else {
      info.flags |= IR3_INSTR_S2EN;
      texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
      sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
      if (texture_idx >= 0) {
         texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
         texture = ir3_COV(ctx->block, texture, TYPE_U32, TYPE_U16);
      } else {
         /* TODO what to do for dynamic case?  I guess we only need the
          * max index for astc srgb workaround so maybe not a problem
          * to worry about if we don't enable indirect samplers for
          * a4xx?
          */
         ctx->max_texture_index =
            MAX2(ctx->max_texture_index, tex->texture_index);
         texture = create_immed_typed(ctx->block, tex->texture_index, TYPE_U16);
         info.tex_idx = tex->texture_index;
      }

      if (sampler_idx >= 0) {
         sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
         sampler = ir3_COV(ctx->block, sampler, TYPE_U32, TYPE_U16);
      } else {
         sampler = create_immed_typed(ctx->block, tex->sampler_index, TYPE_U16);
         info.samp_idx = tex->sampler_index;
      }

      info.samp_tex = ir3_collect(ctx, sampler, texture);
   }

   return info;
}

static void
emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
   struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy;
   struct ir3_instruction *lod, *compare, *proj, *sample_index;
   struct tex_src_info info = {0};
   bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
   unsigned i, coords, flags, ncomp;
   unsigned nsrc0 = 0, nsrc1 = 0;
   type_t type;
   opc_t opc = 0;

   ncomp = nir_dest_num_components(tex->dest);

   coord = off = ddx = ddy = NULL;
   lod = proj = compare = sample_index = NULL;

   dst = ir3_get_dst(ctx, &tex->dest, ncomp);

   for (unsigned i = 0; i < tex->num_srcs; i++) {
      switch (tex->src[i].src_type) {
      case nir_tex_src_coord:
         coord = ir3_get_src(ctx, &tex->src[i].src);
         break;
      case nir_tex_src_bias:
         lod = ir3_get_src(ctx, &tex->src[i].src)[0];
         has_bias = true;
         break;
      case nir_tex_src_lod:
         lod = ir3_get_src(ctx, &tex->src[i].src)[0];
         has_lod = true;
         break;
      case nir_tex_src_comparator: /* shadow comparator */
         compare = ir3_get_src(ctx, &tex->src[i].src)[0];
         break;
      case nir_tex_src_projector:
         proj = ir3_get_src(ctx, &tex->src[i].src)[0];
         has_proj = true;
         break;
      case nir_tex_src_offset:
         off = ir3_get_src(ctx, &tex->src[i].src);
         has_off = true;
         break;
      case nir_tex_src_ddx:
         ddx = ir3_get_src(ctx, &tex->src[i].src);
         break;
      case nir_tex_src_ddy:
         ddy = ir3_get_src(ctx, &tex->src[i].src);
         break;
      case nir_tex_src_ms_index:
         sample_index = ir3_get_src(ctx, &tex->src[i].src)[0];
         break;
      case nir_tex_src_texture_offset:
      case nir_tex_src_sampler_offset:
      case nir_tex_src_texture_handle:
      case nir_tex_src_sampler_handle:
         /* handled in get_tex_samp_tex_src() */
         break;
      default:
         ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
                           tex->src[i].src_type);
         return;
      }
   }
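   /* At this point the NIR tex sources have been scattered into the
    * locals above; e.g. (illustrative case) a textureLod() arrives with
    * nir_tex_src_coord and nir_tex_src_lod, so coord is set, has_lod is
    * true, and everything else stays NULL/false.
    */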

   switch (tex->op) {
   case nir_texop_tex_prefetch:
      compile_assert(ctx, !has_bias);
      compile_assert(ctx, !has_lod);
      compile_assert(ctx, !compare);
      compile_assert(ctx, !has_proj);
      compile_assert(ctx, !has_off);
      compile_assert(ctx, !ddx);
      compile_assert(ctx, !ddy);
      compile_assert(ctx, !sample_index);
      compile_assert(
         ctx, nir_tex_instr_src_index(tex, nir_tex_src_texture_offset) < 0);
      compile_assert(
         ctx, nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset) < 0);

      if (ctx->so->num_sampler_prefetch < ctx->prefetch_limit) {
         opc = OPC_META_TEX_PREFETCH;
         ctx->so->num_sampler_prefetch++;
         break;
      }
      FALLTHROUGH;
   case nir_texop_tex:
      opc = has_lod ? OPC_SAML : OPC_SAM;
      break;
   case nir_texop_txb:
      opc = OPC_SAMB;
      break;
   case nir_texop_txl:
      opc = OPC_SAML;
      break;
   case nir_texop_txd:
      opc = OPC_SAMGQ;
      break;
   case nir_texop_txf:
      opc = OPC_ISAML;
      break;
   case nir_texop_lod:
      opc = OPC_GETLOD;
      break;
   case nir_texop_tg4:
      /* NOTE: a4xx might need to emulate gather w/ txf (this is
       * what blob does, seems gather is broken?), and a3xx did
       * not support it (but probably could also emulate).
       */
      switch (tex->component) {
      case 0:
         opc = OPC_GATHER4R;
         break;
      case 1:
         opc = OPC_GATHER4G;
         break;
      case 2:
         opc = OPC_GATHER4B;
         break;
      case 3:
         opc = OPC_GATHER4A;
         break;
      }
      break;
   case nir_texop_txf_ms_fb:
   case nir_texop_txf_ms:
      opc = OPC_ISAMM;
      break;
   default:
      ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
      return;
   }

   tex_info(tex, &flags, &coords);

   /*
    * lay out the first argument in the proper order:
    *  - actual coordinates first
    *  - shadow reference
    *  - array index
    *  - projection w
    *  - starting at offset 4, dpdx.xy, dpdy.xy
    *
    * bias/lod go into the second arg
    */

   /* insert tex coords: */
   for (i = 0; i < coords; i++)
      src0[i] = coord[i];

   nsrc0 = i;

   /* scale up integer coords for TXF based on the LOD */
   if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
      assert(has_lod);
      for (i = 0; i < coords; i++)
         src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
   }

   if (coords == 1) {
      /* hw doesn't do 1d, so we treat it as 2d with
       * height of 1, and patch up the y coord.
       */
      if (is_isam(opc)) {
         src0[nsrc0++] = create_immed(b, 0);
      } else {
         src0[nsrc0++] = create_immed(b, fui(0.5));
      }
   }

   if (tex->is_shadow && tex->op != nir_texop_lod)
      src0[nsrc0++] = compare;

   if (tex->is_array && tex->op != nir_texop_lod) {
      struct ir3_instruction *idx = coord[coords];

      /* the array coord for cube arrays needs 0.5 added to it */
      if (ctx->compiler->array_index_add_half && !is_isam(opc))
         idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);

      src0[nsrc0++] = idx;
   }

   if (has_proj) {
      src0[nsrc0++] = proj;
      flags |= IR3_INSTR_P;
   }

   /* pad to 4, then ddx/ddy: */
   if (tex->op == nir_texop_txd) {
      while (nsrc0 < 4)
         src0[nsrc0++] = create_immed(b, fui(0.0));
      for (i = 0; i < coords; i++)
         src0[nsrc0++] = ddx[i];
      if (coords < 2)
         src0[nsrc0++] = create_immed(b, fui(0.0));
      for (i = 0; i < coords; i++)
         src0[nsrc0++] = ddy[i];
      if (coords < 2)
         src0[nsrc0++] = create_immed(b, fui(0.0));
   }
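   /* Worked example of the src0 layout built above (shader assumed for
    * illustration): a shadow 2D-array sample ends up with
    * src0 = { x, y, compare, array_idx }, matching the coordinate /
    * shadow-ref / array-index ordering from the layout comment earlier
    * in this function.
    */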

   /* NOTE a3xx (and possibly a4xx?) might be different, using isaml
    * with scaled x coord according to requested sample:
    */
   if (opc == OPC_ISAMM) {
      if (ctx->compiler->txf_ms_with_isaml) {
         /* the samples are laid out in x dimension as
          *     0 1 2 3
          * x_ms = (x << ms) + sample_index;
          */
         struct ir3_instruction *ms;
         ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);

         src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
         src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);

         opc = OPC_ISAML;
      } else {
         src0[nsrc0++] = sample_index;
      }
   }

   /*
    * second argument (if applicable):
    *  - offsets
    *  - lod
    *  - bias
    */
   if (has_off | has_lod | has_bias) {
      if (has_off) {
         unsigned off_coords = coords;
         if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
            off_coords--;
         for (i = 0; i < off_coords; i++)
            src1[nsrc1++] = off[i];
         if (off_coords < 2)
            src1[nsrc1++] = create_immed(b, fui(0.0));
         flags |= IR3_INSTR_O;
      }

      if (has_lod | has_bias)
         src1[nsrc1++] = lod;
   }

   type = get_tex_dest_type(tex);

   if (opc == OPC_GETLOD)
      type = TYPE_S32;

   if (tex->op == nir_texop_txf_ms_fb) {
      /* only expect a single txf_ms_fb per shader: */
      compile_assert(ctx, !ctx->so->fb_read);
      compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT);

      ctx->so->fb_read = true;
      info.samp_tex = ir3_collect(
         ctx, create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16),
         create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16));
      info.flags = IR3_INSTR_S2EN;

      ctx->so->num_samp++;
   } else {
      info = get_tex_samp_tex_src(ctx, tex);
   }

   struct ir3_instruction *col0 = ir3_create_collect(ctx, src0, nsrc0);
   struct ir3_instruction *col1 = ir3_create_collect(ctx, src1, nsrc1);

   if (opc == OPC_META_TEX_PREFETCH) {
      int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);

      compile_assert(ctx, tex->src[idx].src.is_ssa);

      sam = ir3_SAM(b, opc, type, MASK(ncomp), 0, NULL,
                    get_barycentric(ctx, IJ_PERSP_PIXEL), 0);
      sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa);
      /* make sure not to add irrelevant flags like S2EN */
      sam->flags = flags | (info.flags & IR3_INSTR_B);
      sam->prefetch.tex = info.tex_idx;
      sam->prefetch.samp = info.samp_idx;
      sam->prefetch.tex_base = info.tex_base;
      sam->prefetch.samp_base = info.samp_base;
   } else {
      info.flags |= flags;
      sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1);
   }

   if ((ctx->astc_srgb & (1 << tex->texture_index)) &&
       !nir_tex_instr_is_query(tex)) {
      assert(opc != OPC_META_TEX_PREFETCH);

      /* only need first 3 components: */
      sam->dsts[0]->wrmask = 0x7;
      ir3_split_dest(b, dst, sam, 0, 3);

      /* we need to sample the alpha separately with a non-ASTC
       * texture state:
       */
      sam = ir3_SAM(b, opc, type, 0b1000, flags | info.flags, info.samp_tex,
                    col0, col1);

      array_insert(ctx->ir, ctx->ir->astc_srgb, sam);

      /* fixup .w component: */
      ir3_split_dest(b, &dst[3], sam, 3, 1);
   } else {
      /* normal (non-workaround) case: */
      ir3_split_dest(b, dst, sam, 0, ncomp);
   }

   /* GETLOD returns results in 4.8 fixed point */
   if (opc == OPC_GETLOD) {
      struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));

      compile_assert(ctx, tex->dest_type == nir_type_float32);
      for (i = 0; i < 2; i++) {
         dst[i] =
            ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_S32, TYPE_F32), 0, factor, 0);
      }
   }

   ir3_put_dst(ctx, &tex->dest);
}
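
/* Illustrative note on the 4.8 fixed-point scaling at the end of
 * emit_tex() (result value assumed): a raw getlod result of 0x180
 * (= 384) scaled by 1.0/256 yields 1.5, i.e. a LOD halfway between
 * miplevels 1 and 2.
 */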

static void
emit_tex_info(struct ir3_context *ctx, nir_tex_instr *tex, unsigned idx)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction **dst, *sam;
   type_t dst_type = get_tex_dest_type(tex);
   struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);

   dst = ir3_get_dst(ctx, &tex->dest, 1);

   sam = emit_sam(ctx, OPC_GETINFO, info, dst_type, 1 << idx, NULL, NULL);

   /* even though there is only one component, since it ends
    * up in .y/.z/.w rather than .x, we need a split_dest()
    */
   ir3_split_dest(b, dst, sam, idx, 1);

   /* The # of levels comes from getinfo.z.  We need to add 1 to it, since
    * the value in TEX_CONST_0 is zero-based.
    */
   if (ctx->compiler->levels_add_one)
      dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);

   ir3_put_dst(ctx, &tex->dest);
}

static void
emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction **dst, *sam;
   struct ir3_instruction *lod;
   unsigned flags, coords;
   type_t dst_type = get_tex_dest_type(tex);
   struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);

   tex_info(tex, &flags, &coords);
   info.flags |= flags;

   /* Actually we want the number of dimensions, not coordinates.  This
    * distinction only matters for cubes.
    */
   if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
      coords = 2;

   dst = ir3_get_dst(ctx, &tex->dest, 4);

   int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
   compile_assert(ctx, lod_idx >= 0);

   lod = ir3_get_src(ctx, &tex->src[lod_idx].src)[0];

   if (tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
      sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
   } else {
      /*
       * The maximum value which OPC_GETSIZE could return for one dimension
       * is 0x007ff0, however sampler buffer could be much bigger.
       * Blob uses OPC_GETBUF for them.
       */
      sam = emit_sam(ctx, OPC_GETBUF, info, dst_type, 0b1111, NULL, NULL);
   }

   ir3_split_dest(b, dst, sam, 0, 4);

   /* Array size actually ends up in .w rather than .z.  This doesn't
    * matter for miplevel 0, but for higher mips the value in z is
    * minified whereas w stays.  Also, the value in TEX_CONST_3_DEPTH is
    * returned, which means that we have to add 1 to it for arrays.
    */
   if (tex->is_array) {
      if (ctx->compiler->levels_add_one) {
         dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
      } else {
         dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
      }
   }

   ir3_put_dst(ctx, &tex->dest);
}
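
/* A hypothetical case motivating the deferred phi resolution described
 * below:
 *
 *    loop {
 *       x2 = phi(x1, x3);
 *       ...
 *       x3 = x2 + 1;
 *    }
 *
 * x3 is only emitted after the phi itself, so the phi's srcs can't be
 * filled in until the whole shader has been emitted (see
 * resolve_phis()).
 */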

/* phi instructions are left partially constructed.  We don't resolve
 * their srcs until the end of the shader, since (e.g. with loops) one of
 * the phi's srcs might be defined after the phi due to back edges in
 * the CFG.
 */
static void
emit_phi(struct ir3_context *ctx, nir_phi_instr *nphi)
{
   struct ir3_instruction *phi, **dst;

   /* NOTE: phi's should be lowered to scalar at this point */
   compile_assert(ctx, nphi->dest.ssa.num_components == 1);

   dst = ir3_get_dst(ctx, &nphi->dest, 1);

   phi = ir3_instr_create(ctx->block, OPC_META_PHI, 1,
                          exec_list_length(&nphi->srcs));
   __ssa_dst(phi);
   phi->phi.nphi = nphi;

   dst[0] = phi;

   ir3_put_dst(ctx, &nphi->dest);
}

static struct ir3_block *get_block(struct ir3_context *ctx,
                                   const nir_block *nblock);

static struct ir3_instruction *
read_phi_src(struct ir3_context *ctx, struct ir3_block *blk,
             struct ir3_instruction *phi, nir_phi_instr *nphi)
{
   if (!blk->nblock) {
      struct ir3_instruction *continue_phi =
         ir3_instr_create(blk, OPC_META_PHI, 1, blk->predecessors_count);
      __ssa_dst(continue_phi)->flags = phi->dsts[0]->flags;

      for (unsigned i = 0; i < blk->predecessors_count; i++) {
         struct ir3_instruction *src =
            read_phi_src(ctx, blk->predecessors[i], phi, nphi);
         if (src)
            __ssa_src(continue_phi, src, 0);
         else
            ir3_src_create(continue_phi, INVALID_REG, phi->dsts[0]->flags);
      }

      return continue_phi;
   }

   nir_foreach_phi_src (nsrc, nphi) {
      if (blk->nblock == nsrc->pred) {
         if (nsrc->src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
            /* Create an ir3 undef */
            return NULL;
         } else {
            return ir3_get_src(ctx, &nsrc->src)[0];
         }
      }
   }

   unreachable("couldn't find phi node ir3 block");
   return NULL;
}

static void
resolve_phis(struct ir3_context *ctx, struct ir3_block *block)
{
   foreach_instr (phi, &block->instr_list) {
      if (phi->opc != OPC_META_PHI)
         break;

      nir_phi_instr *nphi = phi->phi.nphi;

      if (!nphi) /* skip continue phis created above */
         continue;

      for (unsigned i = 0; i < block->predecessors_count; i++) {
         struct ir3_block *pred = block->predecessors[i];
         struct ir3_instruction *src = read_phi_src(ctx, pred, phi, nphi);
         if (src) {
            __ssa_src(phi, src, 0);
         } else {
            /* Create an ir3 undef */
            ir3_src_create(phi, INVALID_REG, phi->dsts[0]->flags);
         }
      }
   }
}

static void
emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
{
   switch (jump->type) {
   case nir_jump_break:
   case nir_jump_continue:
   case nir_jump_return:
      /* I *think* we can simply just ignore this, and use the
       * successor block link to figure out where we need to
       * jump to for break/continue
       */
      break;
   default:
      ir3_context_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
      break;
   }
}

static void
emit_instr(struct ir3_context *ctx, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      emit_alu(ctx, nir_instr_as_alu(instr));
      break;
   case nir_instr_type_deref:
      /* ignored, handled as part of the intrinsic they are src to */
      break;
   case nir_instr_type_intrinsic:
      emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
      break;
   case nir_instr_type_load_const:
      emit_load_const(ctx, nir_instr_as_load_const(instr));
      break;
   case nir_instr_type_ssa_undef:
      emit_undef(ctx, nir_instr_as_ssa_undef(instr));
      break;
   case nir_instr_type_tex: {
      nir_tex_instr *tex = nir_instr_as_tex(instr);
      /* a couple of tex instructions get special-cased:
       */
      switch (tex->op) {
      case nir_texop_txs:
         emit_tex_txs(ctx, tex);
         break;
      case nir_texop_query_levels:
         emit_tex_info(ctx, tex, 2);
         break;
      case nir_texop_texture_samples:
         emit_tex_info(ctx, tex, 3);
         break;
      default:
         emit_tex(ctx, tex);
         break;
      }
      break;
   }
   case nir_instr_type_jump:
      emit_jump(ctx, nir_instr_as_jump(instr));
      break;
   case nir_instr_type_phi:
      emit_phi(ctx, nir_instr_as_phi(instr));
      break;
   case nir_instr_type_call:
   case nir_instr_type_parallel_copy:
      ir3_context_error(ctx, "Unhandled NIR instruction type: %d\n",
                        instr->type);
      break;
   }
}

static struct ir3_block *
get_block(struct ir3_context *ctx, const nir_block *nblock)
{
   struct ir3_block *block;
   struct hash_entry *hentry;

   hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
   if (hentry)
      return hentry->data;

   block = ir3_block_create(ctx->ir);
   block->nblock = nblock;
   _mesa_hash_table_insert(ctx->block_ht, nblock, block);

   return block;
}

static struct ir3_block *
get_block_or_continue(struct ir3_context *ctx, const nir_block *nblock)
{
   struct hash_entry *hentry;

   hentry = _mesa_hash_table_search(ctx->continue_block_ht, nblock);
   if (hentry)
      return hentry->data;

   return get_block(ctx, nblock);
}

static struct ir3_block *
create_continue_block(struct ir3_context *ctx, const nir_block *nblock)
{
   struct ir3_block *block = ir3_block_create(ctx->ir);
   block->nblock = NULL;
   _mesa_hash_table_insert(ctx->continue_block_ht, nblock, block);
   return block;
}

static void
emit_block(struct ir3_context *ctx, nir_block *nblock)
{
   ctx->block = get_block(ctx, nblock);

   list_addtail(&ctx->block->node, &ctx->ir->block_list);

   ctx->block->loop_id = ctx->loop_id;

   /* re-emit addr register in each block if needed: */
   for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) {
      _mesa_hash_table_destroy(ctx->addr0_ht[i], NULL);
      ctx->addr0_ht[i] = NULL;
   }

   _mesa_hash_table_u64_destroy(ctx->addr1_ht);
   ctx->addr1_ht = NULL;

   nir_foreach_instr (instr, nblock) {
      ctx->cur_instr = instr;
      emit_instr(ctx, instr);
      ctx->cur_instr = NULL;
      if (ctx->error)
         return;
   }

   for (int i = 0; i < ARRAY_SIZE(ctx->block->successors); i++) {
      if (nblock->successors[i]) {
         ctx->block->successors[i] =
            get_block_or_continue(ctx, nblock->successors[i]);
         ctx->block->physical_successors[i] = ctx->block->successors[i];
      }
   }

   _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL);
}
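
/* Note on emit_block() above: the addr0/addr1 caches are dropped at
 * each block boundary because a cached a0.x/a1.x write in one block
 * can't be assumed to dominate uses in another, so indirect accesses
 * re-materialize their address register per block.
 */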

static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);

static void
emit_if(struct ir3_context *ctx, nir_if *nif)
{
   struct ir3_instruction *condition = ir3_get_src(ctx, &nif->condition)[0];

   if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) {
      ctx->block->condition = ssa(condition->srcs[0]);
      ctx->block->brtype = IR3_BRANCH_ANY;
   } else if (condition->opc == OPC_ALL_MACRO &&
              condition->block == ctx->block) {
      ctx->block->condition = ssa(condition->srcs[0]);
      ctx->block->brtype = IR3_BRANCH_ALL;
   } else if (condition->opc == OPC_ELECT_MACRO &&
              condition->block == ctx->block) {
      ctx->block->condition = NULL;
      ctx->block->brtype = IR3_BRANCH_GETONE;
   } else {
      ctx->block->condition = ir3_get_predicate(ctx, condition);
      ctx->block->brtype = IR3_BRANCH_COND;
   }

   emit_cf_list(ctx, &nif->then_list);
   emit_cf_list(ctx, &nif->else_list);

   struct ir3_block *last_then = get_block(ctx, nir_if_last_then_block(nif));
   struct ir3_block *first_else = get_block(ctx, nir_if_first_else_block(nif));
   assert(last_then->physical_successors[0] &&
          !last_then->physical_successors[1]);
   last_then->physical_successors[1] = first_else;

   struct ir3_block *last_else = get_block(ctx, nir_if_last_else_block(nif));
   struct ir3_block *after_if =
      get_block(ctx, nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));
   last_else->physical_successors[0] = after_if;
}

static void
emit_loop(struct ir3_context *ctx, nir_loop *nloop)
{
   unsigned old_loop_id = ctx->loop_id;
   ctx->loop_id = ctx->so->loops + 1;

   struct nir_block *nstart = nir_loop_first_block(nloop);
   struct ir3_block *continue_blk = NULL;

   /* There's always one incoming edge from outside the loop, and if there
    * is more than one backedge from inside the loop (so more than 2 total
    * edges) then we need to create a continue block after the loop to
    * ensure that control reconverges at the end of each loop iteration.
    */
   if (nstart->predecessors->entries > 2) {
      continue_blk = create_continue_block(ctx, nstart);
   }

   emit_cf_list(ctx, &nloop->body);

   if (continue_blk) {
      struct ir3_block *start = get_block(ctx, nstart);
      continue_blk->successors[0] = start;
      continue_blk->physical_successors[0] = start;
      list_addtail(&continue_blk->node, &ctx->ir->block_list);
   }

   ctx->so->loops++;
   ctx->loop_id = old_loop_id;
}

static void
stack_push(struct ir3_context *ctx)
{
   ctx->stack++;
   ctx->max_stack = MAX2(ctx->max_stack, ctx->stack);
}

static void
stack_pop(struct ir3_context *ctx)
{
   compile_assert(ctx, ctx->stack > 0);
   ctx->stack--;
}

static void
emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
{
   foreach_list_typed (nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block:
         emit_block(ctx, nir_cf_node_as_block(node));
         break;
      case nir_cf_node_if:
         stack_push(ctx);
         emit_if(ctx, nir_cf_node_as_if(node));
         stack_pop(ctx);
         break;
      case nir_cf_node_loop:
         stack_push(ctx);
         emit_loop(ctx, nir_cf_node_as_loop(node));
         stack_pop(ctx);
         break;
      case nir_cf_node_function:
         ir3_context_error(ctx, "TODO\n");
         break;
      }
   }
}
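
/* Hypothetical CFG sketch for the continue-block logic in emit_loop()
 * above: a loop body with two continue paths gives the header three
 * incoming edges (entry plus two backedges); the inserted continue
 * block merges the backedges so control reconverges once per iteration:
 *
 *    before:  entry -> header <- latch0, latch1
 *    after:   entry -> header <- continue <- latch0, latch1
 */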

/* emit stream-out code.  At this point, the current block is the original
 * (nir) end block, and nir ensures that all flow control paths terminate
 * into the end block.  We re-purpose the original end block to generate
 * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
 * block holding stream-out write instructions, followed by the new end
 * block:
 *
 *   blockOrigEnd {
 *      p0.x = (vtxcnt < maxvtxcnt)
 *      // succs: blockStreamOut, blockNewEnd
 *   }
 *   blockStreamOut {
 *      // preds: blockOrigEnd
 *      ... stream-out instructions ...
 *      // succs: blockNewEnd
 *   }
 *   blockNewEnd {
 *      // preds: blockOrigEnd, blockStreamOut
 *   }
 */
static void
emit_stream_out(struct ir3_context *ctx)
{
   struct ir3 *ir = ctx->ir;
   struct ir3_stream_output_info *strmout = &ctx->so->shader->stream_output;
   struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
   struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
   struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];

   /* create vtxcnt input in input block at top of shader,
    * so that it is seen as live over the entire duration
    * of the shader:
    */
   vtxcnt = create_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, 0x1);
   maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);

   /* at this point, we are at the original 'end' block,
    * re-purpose this block to stream-out condition, then
    * append stream-out block and new-end block
    */
   orig_end_block = ctx->block;

   // maybe w/ store_global intrinsic, we could do this
   // stuff in nir->nir pass

   stream_out_block = ir3_block_create(ir);
   list_addtail(&stream_out_block->node, &ir->block_list);

   new_end_block = ir3_block_create(ir);
   list_addtail(&new_end_block->node, &ir->block_list);

   orig_end_block->successors[0] = stream_out_block;
   orig_end_block->successors[1] = new_end_block;

   stream_out_block->successors[0] = new_end_block;

   /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
   cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
   cond->dsts[0]->num = regid(REG_P0, 0);
   cond->dsts[0]->flags &= ~IR3_REG_SSA;
   cond->cat2.condition = IR3_COND_LT;

   /* condition goes on previous block to the conditional,
    * since it is used to pick which of the two successor
    * paths to take:
    */
   orig_end_block->condition = cond;

   /* switch to stream_out_block to generate the stream-out
    * instructions:
    */
   ctx->block = stream_out_block;
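
   /* Illustrative address math for the loop below (stride assumed): with
    * strmout->stride[i] = 4 dwords, the byte offset for vertex vtxcnt is
    * vtxcnt * 16, computed with mul.u24 and added to the per-buffer base
    * loaded from the const file.
    */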

   /* Calculate base addresses based on vtxcnt.  Instructions
    * generated for bases not used in following loop will be
    * stripped out in the backend.
    */
   for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
      const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
      unsigned stride = strmout->stride[i];
      struct ir3_instruction *base, *off;

      base = create_uniform(ctx->block, regid(const_state->offsets.tfbo, i));

      /* 24-bit should be enough: */
      off = ir3_MUL_U24(ctx->block, vtxcnt, 0,
                        create_immed(ctx->block, stride * 4), 0);

      bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
   }

   /* Generate the per-output store instructions: */
   for (unsigned i = 0; i < strmout->num_outputs; i++) {
      for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
         unsigned c = j + strmout->output[i].start_component;
         struct ir3_instruction *base, *out, *stg;

         base = bases[strmout->output[i].output_buffer];
         out = ctx->outputs[regid(strmout->output[i].register_index, c)];

         stg = ir3_STG(
            ctx->block, base, 0,
            create_immed(ctx->block, (strmout->output[i].dst_offset + j) * 4),
            0, out, 0, create_immed(ctx->block, 1), 0);
         stg->cat6.type = TYPE_U32;

         array_insert(ctx->block, ctx->block->keeps, stg);
      }
   }

   /* and finally switch to the new_end_block: */
   ctx->block = new_end_block;
}

static void
setup_predecessors(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
         if (block->successors[i])
            ir3_block_add_predecessor(block->successors[i], block);
         if (block->physical_successors[i])
            ir3_block_add_physical_predecessor(block->physical_successors[i],
                                               block);
      }
   }
}

static void
emit_function(struct ir3_context *ctx, nir_function_impl *impl)
{
   nir_metadata_require(impl, nir_metadata_block_index);

   compile_assert(ctx, ctx->stack == 0);

   emit_cf_list(ctx, &impl->body);
   emit_block(ctx, impl->end_block);

   compile_assert(ctx, ctx->stack == 0);

   /* at this point, we should have a single empty block,
    * into which we emit the 'end' instruction.
    */
   compile_assert(ctx, list_is_empty(&ctx->block->instr_list));

   /* If stream-out (aka transform-feedback) enabled, emit the
    * stream-out instructions, followed by a new empty block (into
    * which the 'end' instruction lands).
    *
    * NOTE: it is done in this order, rather than inserting before
    * we emit end_block, because NIR guarantees that all blocks
    * flow into end_block, and that end_block has no successors.
    * So by re-purposing end_block as the first block of stream-
    * out, we guarantee that all exit paths flow into the stream-
    * out instructions.
    */
   if ((ctx->compiler->gpu_id < 500) &&
       (ctx->so->shader->stream_output.num_outputs > 0) &&
       !ctx->so->binning_pass) {
      debug_assert(ctx->so->type == MESA_SHADER_VERTEX);
      emit_stream_out(ctx);
   }

   setup_predecessors(ctx->ir);
   foreach_block (block, &ctx->ir->block_list) {
      resolve_phis(ctx, block);
   }
}

static void
setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_shader_variant *so = ctx->so;
   struct ir3_instruction *coord = NULL;

   if (intr->intrinsic == nir_intrinsic_load_interpolated_input)
      coord = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), 2);

   compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0]));

   unsigned frac = nir_intrinsic_component(intr);
   unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]);
   unsigned ncomp = nir_intrinsic_dest_components(intr);
   unsigned n = nir_intrinsic_base(intr) + offset;
   unsigned slot = nir_intrinsic_io_semantics(intr).location + offset;
   unsigned compmask;

   /* Other stages load their inputs using ldlw or ldg. */
   compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT ||
                          ctx->so->type == MESA_SHADER_VERTEX);

   if (ctx->so->type == MESA_SHADER_FRAGMENT)
      compmask = BITFIELD_MASK(ncomp) << frac;
   else
      compmask = BITFIELD_MASK(ncomp + frac);

   /* for a4xx+ rasterflat */
   if (so->inputs[n].rasterflat && ctx->so->key.rasterflat)
      coord = NULL;

   so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask);

   so->inputs[n].slot = slot;
   so->inputs[n].compmask |= compmask;
   so->inputs_count = MAX2(so->inputs_count, n + 1);
   compile_assert(ctx, so->inputs_count < ARRAY_SIZE(so->inputs));
   so->inputs[n].flat = !coord;

   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
      compile_assert(ctx, slot != VARYING_SLOT_POS);

      so->inputs[n].bary = true;

      for (int i = 0; i < ncomp; i++) {
         unsigned idx = (n * 4) + i + frac;
         ctx->last_dst[i] = create_frag_input(ctx, coord, idx);
      }
   } else {
      struct ir3_instruction *input = NULL;

      foreach_input (in, ctx->ir) {
         if (in->input.inidx == n) {
            input = in;
            break;
         }
      }

      if (!input) {
         input = create_input(ctx, compmask);
         input->input.inidx = n;
      } else {
         /* For aliased inputs, just append to the wrmask.. ie. if we
          * first see a vec2 index at slot N, and then later a vec4,
          * the wrmask of the resulting overlapped vec2 and vec4 is 0xf
          */
         input->dsts[0]->wrmask |= compmask;
      }

      for (int i = 0; i < ncomp + frac; i++) {
         unsigned idx = (n * 4) + i;
         compile_assert(ctx, idx < ctx->ninputs);

         /* fixup the src wrmask to avoid validation fail */
         if (ctx->inputs[idx] && (ctx->inputs[idx] != input)) {
            ctx->inputs[idx]->srcs[0]->wrmask = input->dsts[0]->wrmask;
            continue;
         }

         ir3_split_dest(ctx->block, &ctx->inputs[idx], input, i, 1);
      }

      for (int i = 0; i < ncomp; i++) {
         unsigned idx = (n * 4) + i + frac;
         ctx->last_dst[i] = ctx->inputs[idx];
      }
   }
}
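
/* Worked example for the compmask logic in setup_input() above (access
 * assumed for illustration): a fragment-shader load of a vec2 starting
 * at component 2 has frac = 2, ncomp = 2, so compmask = 0b1100; the
 * same access in a VS uses the unshifted form, BITFIELD_MASK(2 + 2) =
 * 0b1111, since VS inputs are split out per component afterwards.
 */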

/* Initially we assign non-packed inloc's for varyings, as we don't really
 * know up-front which components will be unused.  After all the compilation
 * stages we scan the shader to see which components are actually used, and
 * re-pack the inlocs to eliminate unneeded varyings.
 */
static void
pack_inlocs(struct ir3_context *ctx)
{
   struct ir3_shader_variant *so = ctx->so;
   uint8_t used_components[so->inputs_count];

   memset(used_components, 0, sizeof(used_components));

   /*
    * First Step: scan shader to find which bary.f/ldlv remain:
    */

   foreach_block (block, &ctx->ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (is_input(instr)) {
            unsigned inloc = instr->srcs[0]->iim_val;
            unsigned i = inloc / 4;
            unsigned j = inloc % 4;

            compile_assert(ctx, instr->srcs[0]->flags & IR3_REG_IMMED);
            compile_assert(ctx, i < so->inputs_count);

            used_components[i] |= 1 << j;
         } else if (instr->opc == OPC_META_TEX_PREFETCH) {
            for (int n = 0; n < 2; n++) {
               unsigned inloc = instr->prefetch.input_offset + n;
               unsigned i = inloc / 4;
               unsigned j = inloc % 4;

               compile_assert(ctx, i < so->inputs_count);

               used_components[i] |= 1 << j;
            }
         }
      }
   }

   /*
    * Second Step: reassign varying inloc/slots:
    */

   unsigned actual_in = 0;
   unsigned inloc = 0;

   /* for clip+cull distances, unused components can't be eliminated because
    * they're read by fixed-function, even if there's a hole.  Note that
    * clip/cull distance arrays must be declared in the FS, so we can just
    * use the NIR clip/cull distances to avoid reading ucp_enables in the
    * shader key.
    */
   unsigned clip_cull_size =
      ctx->so->shader->nir->info.clip_distance_array_size +
      ctx->so->shader->nir->info.cull_distance_array_size;
   unsigned clip_cull_mask = MASK(clip_cull_size);

   for (unsigned i = 0; i < so->inputs_count; i++) {
      unsigned compmask = 0, maxcomp = 0;

      so->inputs[i].inloc = inloc;
      so->inputs[i].bary = false;

      if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0 ||
          so->inputs[i].slot == VARYING_SLOT_CLIP_DIST1) {
         if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0)
            compmask = clip_cull_mask & 0xf;
         else
            compmask = clip_cull_mask >> 4;
         used_components[i] = compmask;
      }

      for (unsigned j = 0; j < 4; j++) {
         if (!(used_components[i] & (1 << j)))
            continue;

         compmask |= (1 << j);
         actual_in++;
         maxcomp = j + 1;

         /* at this point, since used_components[i] mask is only
          * considering varyings (ie. not sysvals) we know this
          * is a varying:
          */
         so->inputs[i].bary = true;
      }

      if (so->inputs[i].bary) {
         so->varying_in++;
         so->inputs[i].compmask = (1 << maxcomp) - 1;
         inloc += maxcomp;
      }
   }

   /*
    * Third Step: reassign packed inloc's:
    */

   foreach_block (block, &ctx->ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (is_input(instr)) {
            unsigned inloc = instr->srcs[0]->iim_val;
            unsigned i = inloc / 4;
            unsigned j = inloc % 4;

            instr->srcs[0]->iim_val = so->inputs[i].inloc + j;
         } else if (instr->opc == OPC_META_TEX_PREFETCH) {
            unsigned i = instr->prefetch.input_offset / 4;
            unsigned j = instr->prefetch.input_offset % 4;
            instr->prefetch.input_offset = so->inputs[i].inloc + j;
         }
      }
   }
}
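
/* Worked pack_inlocs() example (usage assumed for illustration): if only
 * .x/.y of the first varying and .x of the second survive, step two
 * assigns inloc 0 (maxcomp 2) to the first and inloc 2 to the second,
 * and step three rewrites each bary.f/ldlv immediate from the original
 * slot * 4 + component to the packed location.
 */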

static void
setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_shader_variant *so = ctx->so;
   nir_io_semantics io = nir_intrinsic_io_semantics(intr);

   compile_assert(ctx, nir_src_is_const(intr->src[1]));

   unsigned offset = nir_src_as_uint(intr->src[1]);
   unsigned n = nir_intrinsic_base(intr) + offset;
   unsigned frac = nir_intrinsic_component(intr);
   unsigned ncomp = nir_intrinsic_src_components(intr, 0);

   /* For per-view variables, each user-facing slot corresponds to multiple
    * views, each with a corresponding driver_location, and the offset is for
    * the driver_location.  To properly figure out the slot, we'd need to
    * plumb through the number of views.  However, for now we only use
    * per-view with gl_Position, so we assume that the variable is not an
    * array or matrix (so there are no indirect accesses to the variable
    * itself) and the indirect offset corresponds to the view.
    */
   unsigned slot = io.location + (io.per_view ? 0 : offset);

   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
      switch (slot) {
      case FRAG_RESULT_DEPTH:
         so->writes_pos = true;
         break;
      case FRAG_RESULT_COLOR:
         if (!ctx->s->info.fs.color_is_dual_source) {
            so->color0_mrt = 1;
         } else {
            slot = FRAG_RESULT_DATA0 + io.dual_source_blend_index;
         }
         break;
      case FRAG_RESULT_SAMPLE_MASK:
         so->writes_smask = true;
         break;
      case FRAG_RESULT_STENCIL:
         so->writes_stencilref = true;
         break;
      default:
         slot += io.dual_source_blend_index; /* For dual-src blend */
         if (slot >= FRAG_RESULT_DATA0)
            break;
         ir3_context_error(ctx, "unknown FS output name: %s\n",
                           gl_frag_result_name(slot));
      }
   } else if (ctx->so->type == MESA_SHADER_VERTEX ||
              ctx->so->type == MESA_SHADER_TESS_EVAL ||
              ctx->so->type == MESA_SHADER_GEOMETRY) {
      switch (slot) {
      case VARYING_SLOT_POS:
         so->writes_pos = true;
         break;
      case VARYING_SLOT_PSIZ:
         so->writes_psize = true;
         break;
      case VARYING_SLOT_PRIMITIVE_ID:
      case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
         debug_assert(ctx->so->type == MESA_SHADER_GEOMETRY);
         FALLTHROUGH;
      case VARYING_SLOT_COL0:
      case VARYING_SLOT_COL1:
      case VARYING_SLOT_BFC0:
      case VARYING_SLOT_BFC1:
      case VARYING_SLOT_FOGC:
      case VARYING_SLOT_CLIP_DIST0:
      case VARYING_SLOT_CLIP_DIST1:
      case VARYING_SLOT_CLIP_VERTEX:
      case VARYING_SLOT_LAYER:
      case VARYING_SLOT_VIEWPORT:
         break;
      default:
         if (slot >= VARYING_SLOT_VAR0)
            break;
         if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
            break;
         ir3_context_error(ctx, "unknown %s shader output name: %s\n",
                           _mesa_shader_stage_to_string(ctx->so->type),
                           gl_varying_slot_name_for_stage(slot, ctx->so->type));
      }
   } else {
      ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
   }

   so->outputs_count = MAX2(so->outputs_count, n + 1);
   compile_assert(ctx, so->outputs_count < ARRAY_SIZE(so->outputs));

   so->outputs[n].slot = slot;
   if (io.per_view)
      so->outputs[n].view = offset;

   for (int i = 0; i < ncomp; i++) {
      unsigned idx = (n * 4) + i + frac;
      compile_assert(ctx, idx < ctx->noutputs);
      ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
   }

   /* if varying packing doesn't happen, we could end up in a situation
    * with "holes" in the output, and since the per-generation code that
    * sets up varying linkage registers doesn't expect to have more than
    * one varying per vec4 slot, pad the holes.
    *
    * Note that this should probably generate a performance warning of
    * some sort.
    */
   for (int i = 0; i < frac; i++) {
      unsigned idx = (n * 4) + i;
      if (!ctx->outputs[idx]) {
         ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
      }
   }

   struct ir3_instruction *const *src = ir3_get_src(ctx, &intr->src[0]);
   for (int i = 0; i < ncomp; i++) {
      unsigned idx = (n * 4) + i + frac;
      ctx->outputs[idx] = src[i];
   }
}

static bool
uses_load_input(struct ir3_shader_variant *so)
{
   return so->type == MESA_SHADER_VERTEX || so->type == MESA_SHADER_FRAGMENT;
}

static bool
uses_store_output(struct ir3_shader_variant *so)
{
   switch (so->type) {
   case MESA_SHADER_VERTEX:
      return !so->key.has_gs && !so->key.tessellation;
   case MESA_SHADER_TESS_EVAL:
      return !so->key.has_gs;
   case MESA_SHADER_GEOMETRY:
   case MESA_SHADER_FRAGMENT:
      return true;
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_COMPUTE:
      return false;
   default:
      unreachable("unknown stage");
stage");3646}3647}36483649static void3650emit_instructions(struct ir3_context *ctx)3651{3652nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);36533654/* some varying setup which can't be done in setup_input(): */3655if (ctx->so->type == MESA_SHADER_FRAGMENT) {3656nir_foreach_shader_in_variable (var, ctx->s) {3657/* if any varyings have 'sample' qualifer, that triggers us3658* to run in per-sample mode:3659*/3660if (var->data.sample)3661ctx->so->per_samp = true;36623663/* set rasterflat flag for front/back color */3664if (var->data.interpolation == INTERP_MODE_NONE) {3665switch (var->data.location) {3666case VARYING_SLOT_COL0:3667case VARYING_SLOT_COL1:3668case VARYING_SLOT_BFC0:3669case VARYING_SLOT_BFC1:3670ctx->so->inputs[var->data.driver_location].rasterflat = true;3671break;3672default:3673break;3674}3675}3676}3677}36783679if (uses_load_input(ctx->so)) {3680ctx->so->inputs_count = ctx->s->num_inputs;3681compile_assert(ctx, ctx->so->inputs_count < ARRAY_SIZE(ctx->so->inputs));3682ctx->ninputs = ctx->s->num_inputs * 4;3683ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);3684} else {3685ctx->ninputs = 0;3686ctx->so->inputs_count = 0;3687}36883689if (uses_store_output(ctx->so)) {3690ctx->noutputs = ctx->s->num_outputs * 4;3691ctx->outputs =3692rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs);3693} else {3694ctx->noutputs = 0;3695}36963697ctx->ir = ir3_create(ctx->compiler, ctx->so);36983699/* Create inputs in first block: */3700ctx->block = get_block(ctx, nir_start_block(fxn));3701ctx->in_block = ctx->block;37023703/* for fragment shader, the vcoord input register is used as the3704* base for bary.f varying fetch instrs:3705*3706* TODO defer creating ctx->ij_pixel and corresponding sysvals3707* until emit_intrinsic when we know they are actually needed.3708* For now, we defer creating ctx->ij_centroid, etc, since we3709* only need ij_pixel for "old style" varying inputs (ie.3710* tgsi_to_nir)3711*/3712if (ctx->so->type == MESA_SHADER_FRAGMENT) {3713ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3);3714}37153716/* Defer add_sysval_input() stuff until after setup_inputs(),3717* because sysvals need to be appended after varyings:3718*/3719if (ctx->ij[IJ_PERSP_PIXEL]) {3720add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL, 0x3,3721ctx->ij[IJ_PERSP_PIXEL]);3722}37233724/* Tesselation shaders always need primitive ID for indexing the3725* BO. Geometry shaders don't always need it but when they do it has be3726* delivered and unclobbered in the VS. 
    * always make room for it in VS/DS.
    */
   bool has_tess = ctx->so->key.tessellation != IR3_TESS_NONE;
   bool has_gs = ctx->so->key.has_gs;
   switch (ctx->so->type) {
   case MESA_SHADER_VERTEX:
      if (has_tess) {
         ctx->tcs_header =
            create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
         ctx->primitive_id =
            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
      } else if (has_gs) {
         ctx->gs_header =
            create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
         ctx->primitive_id =
            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
      }
      break;
   case MESA_SHADER_TESS_CTRL:
      ctx->tcs_header =
         create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
      ctx->primitive_id =
         create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
      break;
   case MESA_SHADER_TESS_EVAL:
      if (has_gs)
         ctx->gs_header =
            create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
      ctx->primitive_id =
         create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
      break;
   case MESA_SHADER_GEOMETRY:
      ctx->gs_header =
         create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
      ctx->primitive_id =
         create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
      break;
   default:
      break;
   }

   /* Find # of samplers.  Just assume that we'll be reading from images.. if
    * it is write-only we don't have to count it, but after lowering derefs
    * is too late to compact indices for that.
    */
   ctx->so->num_samp =
      BITSET_LAST_BIT(ctx->s->info.textures_used) + ctx->s->info.num_images;

   /* Save off clip+cull information. */
   ctx->so->clip_mask = MASK(ctx->s->info.clip_distance_array_size);
   ctx->so->cull_mask = MASK(ctx->s->info.cull_distance_array_size)
                        << ctx->s->info.clip_distance_array_size;

   ctx->so->pvtmem_size = ctx->s->scratch_size;
   ctx->so->shared_size = ctx->s->info.shared_size;

   /* NOTE: need to do something more clever when we support >1 fxn */
   nir_foreach_register (reg, &fxn->registers) {
      ir3_declare_array(ctx, reg);
   }
   /* And emit the body: */
   ctx->impl = fxn;
   emit_function(ctx, fxn);
}
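
/* Worked example of the clip/cull packing in emit_instructions() above
 * (array sizes assumed): 2 clip distances plus 1 cull distance give
 * clip_mask = 0b011 and cull_mask = 0b100, i.e. cull bits packed
 * immediately after the clip bits.
 */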

/* Fixup tex sampler state for astc/srgb workaround instructions.  We
 * need to assign the tex state indexes for these after we know the
 * max tex index.
 */
static void
fixup_astc_srgb(struct ir3_context *ctx)
{
   struct ir3_shader_variant *so = ctx->so;
   /* indexed by original tex idx, value is newly assigned alpha sampler
    * state tex idx.  Zero is invalid since there is at least one sampler
    * if we get here.
    */
   unsigned alt_tex_state[16] = {0};
   unsigned tex_idx = ctx->max_texture_index + 1;
   unsigned idx = 0;

   so->astc_srgb.base = tex_idx;

   for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
      struct ir3_instruction *sam = ctx->ir->astc_srgb[i];

      compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));

      if (alt_tex_state[sam->cat5.tex] == 0) {
         /* assign new alternate/alpha tex state slot: */
         alt_tex_state[sam->cat5.tex] = tex_idx++;
         so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
         so->astc_srgb.count++;
      }

      sam->cat5.tex = alt_tex_state[sam->cat5.tex];
   }
}

static bool
output_slot_used_for_binning(gl_varying_slot slot)
{
   return slot == VARYING_SLOT_POS || slot == VARYING_SLOT_PSIZ ||
          slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1 ||
          slot == VARYING_SLOT_VIEWPORT;
}

static struct ir3_instruction *
find_end(struct ir3 *ir)
{
   foreach_block_rev (block, &ir->block_list) {
      foreach_instr_rev (instr, &block->instr_list) {
         if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
            return instr;
      }
   }
   unreachable("couldn't find end instruction");
}

static void
fixup_binning_pass(struct ir3_context *ctx, struct ir3_instruction *end)
{
   struct ir3_shader_variant *so = ctx->so;
   unsigned i, j;

   /* first pass, remove unused outputs from the IR level outputs: */
   for (i = 0, j = 0; i < end->srcs_count; i++) {
      unsigned outidx = end->end.outidxs[i];
      unsigned slot = so->outputs[outidx].slot;

      if (output_slot_used_for_binning(slot)) {
         end->srcs[j] = end->srcs[i];
         end->end.outidxs[j] = end->end.outidxs[i];
         j++;
      }
   }
   end->srcs_count = j;

   /* second pass, cleanup the unused slots in ir3_shader_variant::outputs
    * table:
    */
   for (i = 0, j = 0; i < so->outputs_count; i++) {
      unsigned slot = so->outputs[i].slot;

      if (output_slot_used_for_binning(slot)) {
         so->outputs[j] = so->outputs[i];

         /* fixup outidx to point to new output table entry: */
         for (unsigned k = 0; k < end->srcs_count; k++) {
            if (end->end.outidxs[k] == i) {
               end->end.outidxs[k] = j;
               break;
            }
         }

         j++;
      }
   }
   so->outputs_count = j;
}
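
/* Sketch of fixup_binning_pass() above (output set assumed): a VS
 * writing POS, COL0 and PSIZ keeps only POS and PSIZ in the binning
 * variant; COL0 is dropped from both the end/chmask sources and the
 * variant's output table, and the remaining outidxs are renumbered to
 * match the compacted table.
 */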

static bool
output_slot_used_for_binning(gl_varying_slot slot)
{
   return slot == VARYING_SLOT_POS || slot == VARYING_SLOT_PSIZ ||
          slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1 ||
          slot == VARYING_SLOT_VIEWPORT;
}

static struct ir3_instruction *
find_end(struct ir3 *ir)
{
   foreach_block_rev (block, &ir->block_list) {
      foreach_instr_rev (instr, &block->instr_list) {
         if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
            return instr;
      }
   }
   unreachable("couldn't find end instruction");
}

static void
fixup_binning_pass(struct ir3_context *ctx, struct ir3_instruction *end)
{
   struct ir3_shader_variant *so = ctx->so;
   unsigned i, j;

   /* first pass, remove unused outputs from the IR level outputs: */
   for (i = 0, j = 0; i < end->srcs_count; i++) {
      unsigned outidx = end->end.outidxs[i];
      unsigned slot = so->outputs[outidx].slot;

      if (output_slot_used_for_binning(slot)) {
         end->srcs[j] = end->srcs[i];
         end->end.outidxs[j] = end->end.outidxs[i];
         j++;
      }
   }
   end->srcs_count = j;

   /* second pass, cleanup the unused slots in ir3_shader_variant::outputs
    * table:
    */
   for (i = 0, j = 0; i < so->outputs_count; i++) {
      unsigned slot = so->outputs[i].slot;

      if (output_slot_used_for_binning(slot)) {
         so->outputs[j] = so->outputs[i];

         /* fixup outidx to point to new output table entry: */
         for (unsigned k = 0; k < end->srcs_count; k++) {
            if (end->end.outidxs[k] == i) {
               end->end.outidxs[k] = j;
               break;
            }
         }

         j++;
      }
   }
   so->outputs_count = j;
}

static void
collect_tex_prefetches(struct ir3_context *ctx, struct ir3 *ir)
{
   unsigned idx = 0;

   /* Collect sampling instructions eligible for pre-dispatch. */
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_META_TEX_PREFETCH) {
            assert(idx < ARRAY_SIZE(ctx->so->sampler_prefetch));
            struct ir3_sampler_prefetch *fetch =
               &ctx->so->sampler_prefetch[idx];
            idx++;

            if (instr->flags & IR3_INSTR_B) {
               fetch->cmd = IR3_SAMPLER_BINDLESS_PREFETCH_CMD;
               /* In bindless mode, the index is actually the base */
               fetch->tex_id = instr->prefetch.tex_base;
               fetch->samp_id = instr->prefetch.samp_base;
               fetch->tex_bindless_id = instr->prefetch.tex;
               fetch->samp_bindless_id = instr->prefetch.samp;
            } else {
               fetch->cmd = IR3_SAMPLER_PREFETCH_CMD;
               fetch->tex_id = instr->prefetch.tex;
               fetch->samp_id = instr->prefetch.samp;
            }
            fetch->wrmask = instr->dsts[0]->wrmask;
            fetch->dst = instr->dsts[0]->num;
            fetch->src = instr->prefetch.input_offset;

            /* These are the limits on a5xx/a6xx, we might need to
             * revisit if SP_FS_PREFETCH[n] changes on later gens:
             */
            assert(fetch->dst <= 0x3f);
            assert(fetch->tex_id <= 0x1f);
            assert(fetch->samp_id < 0xf);

            ctx->so->total_in =
               MAX2(ctx->so->total_in, instr->prefetch.input_offset + 2);

            fetch->half_precision = !!(instr->dsts[0]->flags & IR3_REG_HALF);

            /* Remove the prefetch placeholder instruction: */
            list_delinit(&instr->node);
         }
      }
   }
}
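
/* Sketch of a resulting record (hypothetical values): a non-bindless
 * prefetch of tex/samp #0 whose coordinates live at input offset 0 and
 * which writes hr0.xyzw would end up roughly as
 *   { .cmd = IR3_SAMPLER_PREFETCH_CMD, .tex_id = 0, .samp_id = 0,
 *     .dst = 0, .src = 0, .wrmask = 0xf, .half_precision = true }
 * which is later packed into the SP_FS_PREFETCH[n] registers mentioned
 * above.
 */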

int
ir3_compile_shader_nir(struct ir3_compiler *compiler,
                       struct ir3_shader_variant *so)
{
   struct ir3_context *ctx;
   struct ir3 *ir;
   int ret = 0, max_bary;
   bool progress;

   assert(!so->ir);

   ctx = ir3_context_init(compiler, so);
   if (!ctx) {
      DBG("INIT failed!");
      ret = -1;
      goto out;
   }

   emit_instructions(ctx);

   if (ctx->error) {
      DBG("EMIT failed!");
      ret = -1;
      goto out;
   }

   ir = so->ir = ctx->ir;

   /* Vertex shaders in a tessellation or geometry pipeline treat END as a
    * NOP and have an epilogue that writes the VS outputs to local storage,
    * to be read by the HS.  The epilogue then resets the execution mask
    * (chmask) and chains to the next shader (chsh).  There are also a few
    * output values which we must send to the next stage via registers, and
    * in order for both stages to agree on the register used we must force
    * these to be in specific registers.
    */
   if ((so->type == MESA_SHADER_VERTEX &&
        (so->key.has_gs || so->key.tessellation)) ||
       (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
      struct ir3_instruction *outputs[3];
      unsigned outidxs[3];
      unsigned regids[3];
      unsigned outputs_count = 0;

      if (ctx->primitive_id) {
         unsigned n = so->outputs_count++;
         so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;

         struct ir3_instruction *out = ir3_collect(ctx, ctx->primitive_id);
         outputs[outputs_count] = out;
         outidxs[outputs_count] = n;
         regids[outputs_count] = regid(0, 1);
         outputs_count++;
      }

      if (ctx->gs_header) {
         unsigned n = so->outputs_count++;
         so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
         struct ir3_instruction *out = ir3_collect(ctx, ctx->gs_header);
         outputs[outputs_count] = out;
         outidxs[outputs_count] = n;
         regids[outputs_count] = regid(0, 0);
         outputs_count++;
      }

      if (ctx->tcs_header) {
         unsigned n = so->outputs_count++;
         so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
         struct ir3_instruction *out = ir3_collect(ctx, ctx->tcs_header);
         outputs[outputs_count] = out;
         outidxs[outputs_count] = n;
         regids[outputs_count] = regid(0, 0);
         outputs_count++;
      }

      struct ir3_instruction *chmask =
         ir3_instr_create(ctx->block, OPC_CHMASK, 0, outputs_count);
      chmask->barrier_class = IR3_BARRIER_EVERYTHING;
      chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;

      for (unsigned i = 0; i < outputs_count; i++)
         __ssa_src(chmask, outputs[i], 0)->num = regids[i];

      chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
      memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);

      array_insert(ctx->block, ctx->block->keeps, chmask);

      struct ir3_instruction *chsh = ir3_CHSH(ctx->block);
      chsh->barrier_class = IR3_BARRIER_EVERYTHING;
      chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
   } else {
      assert((ctx->noutputs % 4) == 0);
      unsigned outidxs[ctx->noutputs / 4];
      struct ir3_instruction *outputs[ctx->noutputs / 4];
      unsigned outputs_count = 0;

      struct ir3_block *old_block = ctx->block;
      /* Insert these collect's in the block before the end-block if
       * possible, so that any moves they generate can be shuffled around to
       * reduce nop's:
       */
      if (ctx->block->predecessors_count == 1)
         ctx->block = ctx->block->predecessors[0];

      /* Setup IR level outputs, which are "collects" that gather
       * the scalar components of outputs.
       */
      for (unsigned i = 0; i < ctx->noutputs; i += 4) {
         unsigned ncomp = 0;
         /* figure out the # of components written:
          *
          * TODO do we need to handle holes, ie. if .x and .z
          * components written, but .y component not written?
          */
         for (unsigned j = 0; j < 4; j++) {
            if (!ctx->outputs[i + j])
               break;
            ncomp++;
         }

         /* Note that in some stages, like TCS, store_output is
          * lowered to memory writes, so no components of the
          * output are "written" from the PoV of traditional store-
          * output instructions:
          */
         if (!ncomp)
            continue;

         struct ir3_instruction *out =
            ir3_create_collect(ctx, &ctx->outputs[i], ncomp);

         int outidx = i / 4;
         assert(outidx < so->outputs_count);

         outidxs[outputs_count] = outidx;
         outputs[outputs_count] = out;
         outputs_count++;
      }

      /* for a6xx+, binning and draw pass VS use same VBO state, so we
       * need to make sure not to remove any inputs that are used by
       * the nonbinning VS.
       */
      if (ctx->compiler->gpu_id >= 600 && so->binning_pass &&
          so->type == MESA_SHADER_VERTEX) {
         for (int i = 0; i < ctx->ninputs; i++) {
            struct ir3_instruction *in = ctx->inputs[i];

            if (!in)
               continue;

            unsigned n = i / 4;
            unsigned c = i % 4;

            debug_assert(n < so->nonbinning->inputs_count);

            if (so->nonbinning->inputs[n].sysval)
               continue;

            /* be sure to keep inputs, even if only used in VS */
            if (so->nonbinning->inputs[n].compmask & (1 << c))
               array_insert(in->block, in->block->keeps, in);
         }
      }

      ctx->block = old_block;

      struct ir3_instruction *end =
         ir3_instr_create(ctx->block, OPC_END, 0, outputs_count);

      for (unsigned i = 0; i < outputs_count; i++) {
         __ssa_src(end, outputs[i], 0);
      }

      end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
      memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);

      array_insert(ctx->block, ctx->block->keeps, end);

      /* at this point, for binning pass, throw away unneeded outputs: */
      if (so->binning_pass && (ctx->compiler->gpu_id < 600))
         fixup_binning_pass(ctx, end);
   }
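
   /* Summary of the two paths above (descriptive only, no new behavior):
    * chained stages hand off via fixed registers, r0.x for the tcs/gs
    * header and r0.y for the primitive id per the regid(0, 0)/regid(0, 1)
    * assignments, while standalone stages end in a plain OPC_END whose
    * srcs mirror the variant's outputs table through end.outidxs.
    */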

   ir3_debug_print(ir, "AFTER: nir->ir3");
   ir3_validate(ir);

   IR3_PASS(ir, ir3_array_to_ssa);

   do {
      progress = false;

      progress |= IR3_PASS(ir, ir3_cf);
      progress |= IR3_PASS(ir, ir3_cp, so);
      progress |= IR3_PASS(ir, ir3_cse);
      progress |= IR3_PASS(ir, ir3_dce, so);
   } while (progress);

   /* at this point, for binning pass, throw away unneeded outputs:
    * Note that for a6xx and later, we do this after ir3_cp to ensure
    * that the uniform/constant layout for BS and VS matches, so that
    * we can re-use the same VS_CONST state group.
    */
   if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) {
      fixup_binning_pass(ctx, find_end(ctx->so->ir));
      /* cleanup the result of removing unneeded outputs: */
      while (IR3_PASS(ir, ir3_dce, so)) {
      }
   }

   IR3_PASS(ir, ir3_sched_add_deps);

   /* At this point, all the dead code should be long gone: */
   assert(!IR3_PASS(ir, ir3_dce, so));

   ret = ir3_sched(ir);
   if (ret) {
      DBG("SCHED failed!");
      goto out;
   }

   ir3_debug_print(ir, "AFTER: ir3_sched");

   if (IR3_PASS(ir, ir3_cp_postsched)) {
      /* cleanup the result of removing unneeded mov's: */
      while (IR3_PASS(ir, ir3_dce, so)) {
      }
   }

   /* Pre-assign VS inputs on a6xx+ binning pass shader, to align
    * with draw pass VS, so binning and draw pass can both use the
    * same VBO state.
    *
    * Note that VS inputs are expected to be full precision.
    */
   bool pre_assign_inputs = (ir->compiler->gpu_id >= 600) &&
                            (ir->type == MESA_SHADER_VERTEX) &&
                            so->binning_pass;
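
   /* Illustrative consequence (hypothetical register numbers): if RA gave
    * the draw-pass VS its position attribute in r0.z, the binning
    * variant's matching input is pinned to r0.z below, before its own RA
    * runs, so both variants can be programmed with the same VBO state.
    */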

   if (pre_assign_inputs) {
      foreach_input (in, ir) {
         assert(in->opc == OPC_META_INPUT);
         unsigned inidx = in->input.inidx;

         in->dsts[0]->num = so->nonbinning->inputs[inidx].regid;
      }
   } else if (ctx->tcs_header) {
      /* We need to have these values in the same registers between VS and
       * TCS since the VS chains to TCS and doesn't get the sysvals
       * redelivered.
       */

      ctx->tcs_header->dsts[0]->num = regid(0, 0);
      ctx->primitive_id->dsts[0]->num = regid(0, 1);
   } else if (ctx->gs_header) {
      /* We need to have these values in the same registers between producer
       * (VS or DS) and GS since the producer chains to GS and doesn't get
       * the sysvals redelivered.
       */

      ctx->gs_header->dsts[0]->num = regid(0, 0);
      ctx->primitive_id->dsts[0]->num = regid(0, 1);
   } else if (so->num_sampler_prefetch) {
      assert(so->type == MESA_SHADER_FRAGMENT);
      int idx = 0;

      foreach_input (instr, ir) {
         if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
            continue;

         assert(idx < 2);
         instr->dsts[0]->num = idx;
         idx++;
      }
   }

   ret = ir3_ra(so);

   if (ret) {
      mesa_loge("ir3_ra() failed!");
      goto out;
   }

   IR3_PASS(ir, ir3_postsched, so);

   IR3_PASS(ir, ir3_lower_subgroups);

   if (so->type == MESA_SHADER_FRAGMENT)
      pack_inlocs(ctx);

   /*
    * Fixup inputs/outputs to point to the actual registers assigned:
    *
    * 1) initialize to r63.x (invalid/unused)
    * 2) iterate IR level inputs/outputs and update the variant's
    *    inputs/outputs table based on the assigned registers for
    *    the remaining inputs/outputs.
    */

   for (unsigned i = 0; i < so->inputs_count; i++)
      so->inputs[i].regid = INVALID_REG;
   for (unsigned i = 0; i < so->outputs_count; i++)
      so->outputs[i].regid = INVALID_REG;

   struct ir3_instruction *end = find_end(so->ir);

   for (unsigned i = 0; i < end->srcs_count; i++) {
      unsigned outidx = end->end.outidxs[i];
      struct ir3_register *reg = end->srcs[i];

      so->outputs[outidx].regid = reg->num;
      so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
   }

   foreach_input (in, ir) {
      assert(in->opc == OPC_META_INPUT);
      unsigned inidx = in->input.inidx;

      if (pre_assign_inputs && !so->inputs[inidx].sysval) {
         if (VALIDREG(so->nonbinning->inputs[inidx].regid)) {
            compile_assert(
               ctx, in->dsts[0]->num == so->nonbinning->inputs[inidx].regid);
            compile_assert(ctx, !!(in->dsts[0]->flags & IR3_REG_HALF) ==
                                   so->nonbinning->inputs[inidx].half);
         }
         so->inputs[inidx].regid = so->nonbinning->inputs[inidx].regid;
         so->inputs[inidx].half = so->nonbinning->inputs[inidx].half;
      } else {
         so->inputs[inidx].regid = in->dsts[0]->num;
         so->inputs[inidx].half = !!(in->dsts[0]->flags & IR3_REG_HALF);
      }
   }
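
   /* For reference (assuming the usual regid(r, c) == ((r) << 2) | (c)
    * encoding, so INVALID_REG is the r63.x mentioned above): entries still
    * at INVALID_REG after these loops are simply never written at the
    * register level, eg. TCS outputs that were lowered to memory writes.
    */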

   if (ctx->astc_srgb)
      fixup_astc_srgb(ctx);

   /* We need to run legalize after the frag shader's "bary.f"
    * offsets (inloc) have been assigned.
    */
   IR3_PASS(ir, ir3_legalize, so, &max_bary);

   /* Set (ss)(sy) on first TCS and GEOMETRY instructions, since we don't
    * know what we might have to wait on when coming in from VS chsh.
    */
   if (so->type == MESA_SHADER_TESS_CTRL || so->type == MESA_SHADER_GEOMETRY) {
      foreach_block (block, &ir->block_list) {
         foreach_instr (instr, &block->instr_list) {
            instr->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
            break;
         }
      }
   }

   so->branchstack = ctx->max_stack;

   /* Note that actual_in counts inputs that are not bary.f'd for FS: */
   if (so->type == MESA_SHADER_FRAGMENT)
      so->total_in = max_bary + 1;

   /* Collect sampling instructions eligible for pre-dispatch. */
   collect_tex_prefetches(ctx, ir);

   if (so->type == MESA_SHADER_FRAGMENT &&
       ctx->s->info.fs.needs_quad_helper_invocations)
      so->need_pixlod = true;

   if (so->type == MESA_SHADER_COMPUTE) {
      so->local_size[0] = ctx->s->info.workgroup_size[0];
      so->local_size[1] = ctx->s->info.workgroup_size[1];
      so->local_size[2] = ctx->s->info.workgroup_size[2];
      so->local_size_variable = ctx->s->info.workgroup_size_variable;
   }

out:
   if (ret) {
      if (so->ir)
         ir3_destroy(so->ir);
      so->ir = NULL;
   }
   ir3_context_free(ctx);

   return ret;
}