/* Source: blob/21.2-virgl/src/gallium/drivers/freedreno/a2xx/ir2_nir.c */
/*1* Copyright (C) 2018 Jonathan Marek <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,19* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE20* SOFTWARE.21*22* Authors:23* Jonathan Marek <[email protected]>24*/2526#include "ir2_private.h"2728#include "fd2_program.h"29#include "freedreno_util.h"3031static const nir_shader_compiler_options options = {32.lower_fpow = true,33.lower_flrp32 = true,34.lower_fmod = true,35.lower_fdiv = true,36.lower_fceil = true,37.fuse_ffma16 = true,38.fuse_ffma32 = true,39.fuse_ffma64 = true,40/* .fdot_replicates = true, it is replicated, but it makes things worse */41.lower_all_io_to_temps = true,42.vertex_id_zero_based = true, /* its not implemented anyway */43.lower_bitops = true,44.lower_rotate = true,45.lower_vector_cmp = true,46.lower_fdph = true,47.has_fsub = true,48.has_isub = true,49.lower_insert_byte = true,50.lower_insert_word = true,51};5253const nir_shader_compiler_options *54ir2_get_compiler_options(void)55{56return 
&options;57}5859#define OPT(nir, pass, ...) \60({ \61bool this_progress = false; \62NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \63this_progress; \64})65#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)6667static void68ir2_optimize_loop(nir_shader *s)69{70bool progress;71do {72progress = false;7374OPT_V(s, nir_lower_vars_to_ssa);75progress |= OPT(s, nir_opt_copy_prop_vars);76progress |= OPT(s, nir_copy_prop);77progress |= OPT(s, nir_opt_dce);78progress |= OPT(s, nir_opt_cse);79/* progress |= OPT(s, nir_opt_gcm, true); */80progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);81progress |= OPT(s, nir_opt_intrinsics);82progress |= OPT(s, nir_opt_algebraic);83progress |= OPT(s, nir_opt_constant_folding);84progress |= OPT(s, nir_opt_dead_cf);85if (OPT(s, nir_opt_trivial_continues)) {86progress |= true;87/* If nir_opt_trivial_continues makes progress, then we need to clean88* things up if we want any hope of nir_opt_if or nir_opt_loop_unroll89* to make progress.90*/91OPT(s, nir_copy_prop);92OPT(s, nir_opt_dce);93}94progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);95progress |= OPT(s, nir_opt_if, false);96progress |= OPT(s, nir_opt_remove_phis);97progress |= OPT(s, nir_opt_undef);9899} while (progress);100}101102/* trig workarounds is the same as ir3.. 
but we don't want to include ir3 */103bool ir3_nir_apply_trig_workarounds(nir_shader *shader);104105int106ir2_optimize_nir(nir_shader *s, bool lower)107{108struct nir_lower_tex_options tex_options = {109.lower_txp = ~0u,110.lower_rect = 0,111};112113if (FD_DBG(DISASM)) {114debug_printf("----------------------\n");115nir_print_shader(s, stdout);116debug_printf("----------------------\n");117}118119OPT_V(s, nir_lower_regs_to_ssa);120OPT_V(s, nir_lower_vars_to_ssa);121OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out,122UINT32_MAX);123124if (lower) {125OPT_V(s, ir3_nir_apply_trig_workarounds);126OPT_V(s, nir_lower_tex, &tex_options);127}128129ir2_optimize_loop(s);130131OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);132OPT_V(s, nir_opt_sink, nir_move_const_undef);133134/* TODO we dont want to get shaders writing to depth for depth textures */135if (s->info.stage == MESA_SHADER_FRAGMENT) {136nir_foreach_shader_out_variable (var, s) {137if (var->data.location == FRAG_RESULT_DEPTH)138return -1;139}140}141142return 0;143}144145static struct ir2_src146load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)147{148struct fd2_shader_stateobj *so = ctx->so;149unsigned imm_ncomp, swiz, idx, i, j;150uint32_t *value = (uint32_t *)value_f;151152/* try to merge with existing immediate (TODO: try with neg) */153for (idx = 0; idx < so->num_immediates; idx++) {154swiz = 0;155imm_ncomp = so->immediates[idx].ncomp;156for (i = 0; i < ncomp; i++) {157for (j = 0; j < imm_ncomp; j++) {158if (value[i] == so->immediates[idx].val[j])159break;160}161if (j == imm_ncomp) {162if (j == 4)163break;164so->immediates[idx].val[imm_ncomp++] = value[i];165}166swiz |= swiz_set(j, i);167}168/* matched all components */169if (i == ncomp)170break;171}172173/* need to allocate new immediate */174if (idx == so->num_immediates) {175swiz = 0;176imm_ncomp = 0;177for (i = 0; i < ncomp; i++) {178for (j = 0; j < imm_ncomp; j++) {179if (value[i] == 
ctx->so->immediates[idx].val[j])180break;181}182if (j == imm_ncomp) {183so->immediates[idx].val[imm_ncomp++] = value[i];184}185swiz |= swiz_set(j, i);186}187so->num_immediates++;188}189so->immediates[idx].ncomp = imm_ncomp;190191if (ncomp == 1)192swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);193194return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);195}196197struct ir2_src198ir2_zero(struct ir2_context *ctx)199{200return load_const(ctx, (float[]){0.0f}, 1);201}202203static void204update_range(struct ir2_context *ctx, struct ir2_reg *reg)205{206if (!reg->initialized) {207reg->initialized = true;208reg->loop_depth = ctx->loop_depth;209}210211if (ctx->loop_depth > reg->loop_depth) {212reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];213} else {214reg->loop_depth = ctx->loop_depth;215reg->block_idx_free = -1;216}217218/* for regs we want to free at the end of the loop in any case219* XXX dont do this for ssa220*/221if (reg->loop_depth)222reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];223}224225static struct ir2_src226make_src(struct ir2_context *ctx, nir_src src)227{228struct ir2_src res = {};229struct ir2_reg *reg;230231nir_const_value *const_value = nir_src_as_const_value(src);232233if (const_value) {234assert(src.is_ssa);235float c[src.ssa->num_components];236nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);237return load_const(ctx, c, src.ssa->num_components);238}239240if (!src.is_ssa) {241res.num = src.reg.reg->index;242res.type = IR2_SRC_REG;243reg = &ctx->reg[res.num];244} else {245assert(ctx->ssa_map[src.ssa->index] >= 0);246res.num = ctx->ssa_map[src.ssa->index];247res.type = IR2_SRC_SSA;248reg = &ctx->instr[res.num].ssa;249}250251update_range(ctx, reg);252return res;253}254255static void256set_index(struct ir2_context *ctx, nir_dest *dst, struct ir2_instr *instr)257{258struct ir2_reg *reg = &instr->ssa;259260if (dst->is_ssa) {261ctx->ssa_map[dst->ssa.index] = instr->idx;262} else 
{263assert(instr->is_ssa);264reg = &ctx->reg[dst->reg.reg->index];265266instr->is_ssa = false;267instr->reg = reg;268}269update_range(ctx, reg);270}271272static struct ir2_instr *273ir2_instr_create(struct ir2_context *ctx, int type)274{275struct ir2_instr *instr;276277instr = &ctx->instr[ctx->instr_count++];278instr->idx = ctx->instr_count - 1;279instr->type = type;280instr->block_idx = ctx->block_idx;281instr->pred = ctx->pred;282instr->is_ssa = true;283return instr;284}285286static struct ir2_instr *287instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)288{289/* emit_alu will fixup instrs that don't map directly */290static const struct ir2_opc {291int8_t scalar, vector;292} nir_ir2_opc[nir_num_opcodes + 1] = {293[0 ... nir_num_opcodes - 1] = {-1, -1},294295[nir_op_mov] = {MAXs, MAXv},296[nir_op_fneg] = {MAXs, MAXv},297[nir_op_fabs] = {MAXs, MAXv},298[nir_op_fsat] = {MAXs, MAXv},299[nir_op_fsign] = {-1, CNDGTEv},300[nir_op_fadd] = {ADDs, ADDv},301[nir_op_fsub] = {ADDs, ADDv},302[nir_op_fmul] = {MULs, MULv},303[nir_op_ffma] = {-1, MULADDv},304[nir_op_fmax] = {MAXs, MAXv},305[nir_op_fmin] = {MINs, MINv},306[nir_op_ffloor] = {FLOORs, FLOORv},307[nir_op_ffract] = {FRACs, FRACv},308[nir_op_ftrunc] = {TRUNCs, TRUNCv},309[nir_op_fdot2] = {-1, DOT2ADDv},310[nir_op_fdot3] = {-1, DOT3v},311[nir_op_fdot4] = {-1, DOT4v},312[nir_op_sge] = {-1, SETGTEv},313[nir_op_slt] = {-1, SETGTv},314[nir_op_sne] = {-1, SETNEv},315[nir_op_seq] = {-1, SETEv},316[nir_op_fcsel] = {-1, CNDEv},317[nir_op_frsq] = {RECIPSQ_IEEE, -1},318[nir_op_frcp] = {RECIP_IEEE, -1},319[nir_op_flog2] = {LOG_IEEE, -1},320[nir_op_fexp2] = {EXP_IEEE, -1},321[nir_op_fsqrt] = {SQRT_IEEE, -1},322[nir_op_fcos] = {COS, -1},323[nir_op_fsin] = {SIN, -1},324/* no fsat, fneg, fabs since source mods deal with those */325326/* so we can use this function with non-nir op */327#define ir2_op_cube nir_num_opcodes328[ir2_op_cube] = {-1, CUBEv},329};330331struct ir2_opc op = 
nir_ir2_opc[opcode];332assert(op.vector >= 0 || op.scalar >= 0);333334struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);335instr->alu.vector_opc = op.vector;336instr->alu.scalar_opc = op.scalar;337instr->alu.export = -1;338instr->alu.write_mask = (1 << ncomp) - 1;339instr->src_count =340opcode == ir2_op_cube ? 2 : nir_op_infos[opcode].num_inputs;341instr->ssa.ncomp = ncomp;342return instr;343}344345static struct ir2_instr *346instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode, uint8_t write_mask,347struct ir2_instr *share_reg)348{349struct ir2_instr *instr;350struct ir2_reg *reg;351352reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];353reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);354355instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));356instr->alu.write_mask = write_mask;357instr->reg = reg;358instr->is_ssa = false;359return instr;360}361362static struct ir2_instr *363instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)364{365struct ir2_instr *instr;366instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));367set_index(ctx, dst, instr);368return instr;369}370371static struct ir2_instr *372ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,373instr_fetch_opc_t opc)374{375struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);376instr->fetch.opc = opc;377instr->src_count = 1;378instr->ssa.ncomp = nir_dest_num_components(*dst);379set_index(ctx, dst, instr);380return instr;381}382383static struct ir2_src384make_src_noconst(struct ir2_context *ctx, nir_src src)385{386struct ir2_instr *instr;387388if (nir_src_as_const_value(src)) {389assert(src.is_ssa);390instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);391instr->src[0] = make_src(ctx, src);392return ir2_src(instr->idx, 0, IR2_SRC_SSA);393}394395return make_src(ctx, src);396}397398static void399emit_alu(struct ir2_context *ctx, nir_alu_instr *alu)400{401const nir_op_info *info = 
&nir_op_infos[alu->op];402nir_dest *dst = &alu->dest.dest;403struct ir2_instr *instr;404struct ir2_src tmp;405unsigned ncomp;406407/* get the number of dst components */408if (dst->is_ssa) {409ncomp = dst->ssa.num_components;410} else {411ncomp = 0;412for (int i = 0; i < 4; i++)413ncomp += !!(alu->dest.write_mask & 1 << i);414}415416instr = instr_create_alu(ctx, alu->op, ncomp);417set_index(ctx, dst, instr);418instr->alu.saturate = alu->dest.saturate;419instr->alu.write_mask = alu->dest.write_mask;420421for (int i = 0; i < info->num_inputs; i++) {422nir_alu_src *src = &alu->src[i];423424/* compress swizzle with writemask when applicable */425unsigned swiz = 0, j = 0;426for (int i = 0; i < 4; i++) {427if (!(alu->dest.write_mask & 1 << i) && !info->output_size)428continue;429swiz |= swiz_set(src->swizzle[i], j++);430}431432instr->src[i] = make_src(ctx, src->src);433instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);434instr->src[i].negate = src->negate;435instr->src[i].abs = src->abs;436}437438/* workarounds for NIR ops that don't map directly to a2xx ops */439switch (alu->op) {440case nir_op_fneg:441instr->src[0].negate = 1;442break;443case nir_op_fabs:444instr->src[0].abs = 1;445break;446case nir_op_fsat:447instr->alu.saturate = 1;448break;449case nir_op_slt:450tmp = instr->src[0];451instr->src[0] = instr->src[1];452instr->src[1] = tmp;453break;454case nir_op_fcsel:455tmp = instr->src[1];456instr->src[1] = instr->src[2];457instr->src[2] = tmp;458break;459case nir_op_fsub:460instr->src[1].negate = !instr->src[1].negate;461break;462case nir_op_fdot2:463instr->src_count = 3;464instr->src[2] = ir2_zero(ctx);465break;466case nir_op_fsign: {467/* we need an extra instruction to deal with the zero case */468struct ir2_instr *tmp;469470/* tmp = x == 0 ? 0 : 1 */471tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);472tmp->src[0] = instr->src[0];473tmp->src[1] = ir2_zero(ctx);474tmp->src[2] = load_const(ctx, (float[]){1.0f}, 1);475476/* result = x >= 0 ? 
tmp : -tmp */477instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);478instr->src[2] = instr->src[1];479instr->src[2].negate = true;480instr->src_count = 3;481} break;482default:483break;484}485}486487static void488load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)489{490struct ir2_instr *instr;491int slot = -1;492493if (ctx->so->type == MESA_SHADER_VERTEX) {494instr = ir2_instr_create_fetch(ctx, dst, 0);495instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);496instr->fetch.vtx.const_idx = 20 + (idx / 3);497instr->fetch.vtx.const_idx_sel = idx % 3;498return;499}500501/* get slot from idx */502nir_foreach_shader_in_variable (var, ctx->nir) {503if (var->data.driver_location == idx) {504slot = var->data.location;505break;506}507}508assert(slot >= 0);509510switch (slot) {511case VARYING_SLOT_POS:512/* need to extract xy with abs and add tile offset on a20x513* zw from fragcoord input (w inverted in fragment shader)514* TODO: only components that are required by fragment shader515*/516instr = instr_create_alu_reg(517ctx, ctx->so->is_a20x ? 
nir_op_fadd : nir_op_mov, 3, NULL);518instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);519instr->src[0].abs = true;520/* on a20x, C64 contains the tile offset */521instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);522523instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);524instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);525526instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);527instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);528529unsigned reg_idx = instr->reg - ctx->reg; /* XXX */530instr = instr_create_alu_dest(ctx, nir_op_mov, dst);531instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);532break;533default:534instr = instr_create_alu_dest(ctx, nir_op_mov, dst);535instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);536break;537}538}539540static unsigned541output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)542{543int slot = -1;544unsigned idx = nir_intrinsic_base(intr);545nir_foreach_shader_out_variable (var, ctx->nir) {546if (var->data.driver_location == idx) {547slot = var->data.location;548break;549}550}551assert(slot != -1);552return slot;553}554555static void556store_output(struct ir2_context *ctx, nir_src src, unsigned slot,557unsigned ncomp)558{559struct ir2_instr *instr;560unsigned idx = 0;561562if (ctx->so->type == MESA_SHADER_VERTEX) {563switch (slot) {564case VARYING_SLOT_POS:565ctx->position = make_src(ctx, src);566idx = 62;567break;568case VARYING_SLOT_PSIZ:569ctx->so->writes_psize = true;570idx = 63;571break;572default:573/* find matching slot from fragment shader input */574for (idx = 0; idx < ctx->f->inputs_count; idx++)575if (ctx->f->inputs[idx].slot == slot)576break;577if (idx == ctx->f->inputs_count)578return;579}580} else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {581/* only color output is implemented */582return;583}584585instr = instr_create_alu(ctx, nir_op_mov, ncomp);586instr->src[0] = make_src(ctx, src);587instr->alu.export = idx;588}589590static 
void591emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)592{593struct ir2_instr *instr;594ASSERTED nir_const_value *const_offset;595unsigned idx;596597switch (intr->intrinsic) {598case nir_intrinsic_load_input:599load_input(ctx, &intr->dest, nir_intrinsic_base(intr));600break;601case nir_intrinsic_store_output:602store_output(ctx, intr->src[0], output_slot(ctx, intr),603intr->num_components);604break;605case nir_intrinsic_load_uniform:606const_offset = nir_src_as_const_value(intr->src[0]);607assert(const_offset); /* TODO can be false in ES2? */608idx = nir_intrinsic_base(intr);609idx += (uint32_t)const_offset[0].f32;610instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);611instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);612break;613case nir_intrinsic_discard:614case nir_intrinsic_discard_if:615instr = ir2_instr_create(ctx, IR2_ALU);616instr->alu.vector_opc = VECTOR_NONE;617if (intr->intrinsic == nir_intrinsic_discard_if) {618instr->alu.scalar_opc = KILLNEs;619instr->src[0] = make_src(ctx, intr->src[0]);620} else {621instr->alu.scalar_opc = KILLEs;622instr->src[0] = ir2_zero(ctx);623}624instr->alu.export = -1;625instr->src_count = 1;626ctx->so->has_kill = true;627break;628case nir_intrinsic_load_front_face:629/* gl_FrontFacing is in the sign of param.x630* rcp required because otherwise we can't differentiate -0.0 and +0.0631*/632ctx->so->need_param = true;633634struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);635tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);636637instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);638instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);639instr->src[1] = ir2_zero(ctx);640break;641case nir_intrinsic_load_point_coord:642/* param.zw (note: abs might be needed like fragcoord in param.xy?) 
*/643ctx->so->need_param = true;644645instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);646instr->src[0] =647ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);648break;649default:650compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);651break;652}653}654655static void656emit_tex(struct ir2_context *ctx, nir_tex_instr *tex)657{658bool is_rect = false, is_cube = false;659struct ir2_instr *instr;660nir_src *coord, *lod_bias;661662coord = lod_bias = NULL;663664for (unsigned i = 0; i < tex->num_srcs; i++) {665switch (tex->src[i].src_type) {666case nir_tex_src_coord:667coord = &tex->src[i].src;668break;669case nir_tex_src_bias:670case nir_tex_src_lod:671assert(!lod_bias);672lod_bias = &tex->src[i].src;673break;674default:675compile_error(ctx, "Unhandled NIR tex src type: %d\n",676tex->src[i].src_type);677return;678}679}680681switch (tex->op) {682case nir_texop_tex:683case nir_texop_txb:684case nir_texop_txl:685break;686default:687compile_error(ctx, "unimplemented texop %d\n", tex->op);688return;689}690691switch (tex->sampler_dim) {692case GLSL_SAMPLER_DIM_2D:693break;694case GLSL_SAMPLER_DIM_RECT:695is_rect = true;696break;697case GLSL_SAMPLER_DIM_CUBE:698is_cube = true;699break;700default:701compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);702return;703}704705struct ir2_src src_coord = make_src_noconst(ctx, *coord);706707/* for cube maps708* tmp = cube(coord)709* tmp.xy = tmp.xy / |tmp.z| + 1.5710* coord = tmp.xyw711*/712if (is_cube) {713struct ir2_instr *rcp, *coord_xy;714unsigned reg_idx;715716instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);717instr->src[0] = src_coord;718instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;719instr->src[1] = src_coord;720instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;721722reg_idx = instr->reg - ctx->reg; /* hacky */723724rcp = instr_create_alu(ctx, nir_op_frcp, 1);725rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);726rcp->src[0].abs = true;727728coord_xy = instr_create_alu_reg(ctx, 
nir_op_ffma, 3, instr);729coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);730coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);731coord_xy->src[2] = load_const(ctx, (float[]){1.5f}, 1);732733src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);734/* TODO: lod/bias transformed by src_coord.z ? */735}736737instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);738instr->src[0] = src_coord;739instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_YXW : 0;740instr->fetch.tex.is_cube = is_cube;741instr->fetch.tex.is_rect = is_rect;742instr->fetch.tex.samp_id = tex->sampler_index;743744/* for lod/bias, we insert an extra src for the backend to deal with */745if (lod_bias) {746instr->src[1] = make_src_noconst(ctx, *lod_bias);747/* backend will use 2-3 components so apply swizzle */748swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);749instr->src_count = 2;750}751}752753static void754setup_input(struct ir2_context *ctx, nir_variable *in)755{756struct fd2_shader_stateobj *so = ctx->so;757ASSERTED unsigned array_len = MAX2(glsl_get_length(in->type), 1);758unsigned n = in->data.driver_location;759unsigned slot = in->data.location;760761assert(array_len == 1);762763/* handle later */764if (ctx->so->type == MESA_SHADER_VERTEX)765return;766767if (ctx->so->type != MESA_SHADER_FRAGMENT)768compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);769770n = ctx->f->inputs_count++;771772/* half of fragcoord from param reg, half from a varying */773if (slot == VARYING_SLOT_POS) {774ctx->f->fragcoord = n;775so->need_param = true;776}777778ctx->f->inputs[n].slot = slot;779ctx->f->inputs[n].ncomp = glsl_get_components(in->type);780781/* in->data.interpolation?782* opengl ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD783*/784}785786static void787emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr *undef)788{789/* TODO we don't want to emit anything for undefs */790791struct ir2_instr *instr;792793instr = instr_create_alu_dest(794ctx, nir_op_mov, 
&(nir_dest){.ssa = undef->def, .is_ssa = true});795instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);796}797798static void799emit_instr(struct ir2_context *ctx, nir_instr *instr)800{801switch (instr->type) {802case nir_instr_type_alu:803emit_alu(ctx, nir_instr_as_alu(instr));804break;805case nir_instr_type_deref:806/* ignored, handled as part of the intrinsic they are src to */807break;808case nir_instr_type_intrinsic:809emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));810break;811case nir_instr_type_load_const:812/* dealt with when using nir_src */813break;814case nir_instr_type_tex:815emit_tex(ctx, nir_instr_as_tex(instr));816break;817case nir_instr_type_jump:818ctx->block_has_jump[ctx->block_idx] = true;819break;820case nir_instr_type_ssa_undef:821emit_undef(ctx, nir_instr_as_ssa_undef(instr));822break;823default:824break;825}826}827828/* fragcoord.zw and a20x hw binning outputs */829static void830extra_position_exports(struct ir2_context *ctx, bool binning)831{832struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;833834if (ctx->f->fragcoord < 0 && !binning)835return;836837instr = instr_create_alu(ctx, nir_op_fmax, 1);838instr->src[0] = ctx->position;839instr->src[0].swizzle = IR2_SWIZZLE_W;840instr->src[1] = ir2_zero(ctx);841842rcp = instr_create_alu(ctx, nir_op_frcp, 1);843rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);844845sc = instr_create_alu(ctx, nir_op_fmul, 4);846sc->src[0] = ctx->position;847sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);848849wincoord = instr_create_alu(ctx, nir_op_ffma, 4);850wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);851wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);852wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);853854/* fragcoord z/w */855if (ctx->f->fragcoord >= 0 && !binning) {856instr = instr_create_alu(ctx, nir_op_mov, 1);857instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);858instr->alu.export = ctx->f->fragcoord;859860instr = instr_create_alu(ctx, nir_op_mov, 
1);861instr->src[0] = ctx->position;862instr->src[0].swizzle = IR2_SWIZZLE_W;863instr->alu.export = ctx->f->fragcoord;864instr->alu.write_mask = 2;865}866867if (!binning)868return;869870off = instr_create_alu(ctx, nir_op_fadd, 1);871off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);872off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);873874/* 8 max set in freedreno_screen.. unneeded instrs patched out */875for (int i = 0; i < 8; i++) {876instr = instr_create_alu(ctx, nir_op_ffma, 4);877instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);878instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);879instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);880instr->alu.export = 32;881882instr = instr_create_alu(ctx, nir_op_ffma, 4);883instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);884instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);885instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);886instr->alu.export = 33;887}888}889890static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);891892static bool893emit_block(struct ir2_context *ctx, nir_block *block)894{895struct ir2_instr *instr;896nir_block *succs = block->successors[0];897898ctx->block_idx = block->index;899900nir_foreach_instr (instr, block)901emit_instr(ctx, instr);902903if (!succs || !succs->index)904return false;905906/* we want to be smart and always jump and have the backend cleanup907* but we are not, so there are two cases where jump is needed:908* loops (succs index lower)909* jumps (jump instruction seen in block)910*/911if (succs->index > block->index && !ctx->block_has_jump[block->index])912return false;913914assert(block->successors[1] == NULL);915916instr = ir2_instr_create(ctx, IR2_CF);917instr->cf.block_idx = succs->index;918/* XXX can't jump to a block with different predicate */919return true;920}921922static void923emit_if(struct ir2_context *ctx, nir_if *nif)924{925unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;926struct ir2_instr *instr;927928/* XXX: blob 
seems to always use same register for condition */929930instr = ir2_instr_create(ctx, IR2_ALU);931instr->src[0] = make_src(ctx, nif->condition);932instr->src_count = 1;933instr->ssa.ncomp = 1;934instr->alu.vector_opc = VECTOR_NONE;935instr->alu.scalar_opc = SCALAR_NONE;936instr->alu.export = -1;937instr->alu.write_mask = 1;938instr->pred = 0;939940/* if nested, use PRED_SETNE_PUSHv */941if (pred) {942instr->alu.vector_opc = PRED_SETNE_PUSHv;943instr->src[1] = instr->src[0];944instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);945instr->src[0].swizzle = IR2_SWIZZLE_XXXX;946instr->src[1].swizzle = IR2_SWIZZLE_XXXX;947instr->src_count = 2;948} else {949instr->alu.scalar_opc = PRED_SETNEs;950}951952ctx->pred_idx = instr->idx;953ctx->pred = 3;954955emit_cf_list(ctx, &nif->then_list);956957/* TODO: if these is no else branch we don't need this958* and if the else branch is simple, can just flip ctx->pred instead959*/960instr = ir2_instr_create(ctx, IR2_ALU);961instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);962instr->src_count = 1;963instr->ssa.ncomp = 1;964instr->alu.vector_opc = VECTOR_NONE;965instr->alu.scalar_opc = PRED_SET_INVs;966instr->alu.export = -1;967instr->alu.write_mask = 1;968instr->pred = 0;969ctx->pred_idx = instr->idx;970971emit_cf_list(ctx, &nif->else_list);972973/* restore predicate for nested predicates */974if (pred) {975instr = ir2_instr_create(ctx, IR2_ALU);976instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);977instr->src_count = 1;978instr->ssa.ncomp = 1;979instr->alu.vector_opc = VECTOR_NONE;980instr->alu.scalar_opc = PRED_SET_POPs;981instr->alu.export = -1;982instr->alu.write_mask = 1;983instr->pred = 0;984ctx->pred_idx = instr->idx;985}986987/* restore ctx->pred */988ctx->pred = pred;989}990991/* get the highest block idx in the loop, so we know when992* we can free registers that are allocated outside the loop993*/994static unsigned995loop_last_block(struct exec_list *list)996{997nir_cf_node *node =998exec_node_data(nir_cf_node, 
exec_list_get_tail(list), node);999switch (node->type) {1000case nir_cf_node_block:1001return nir_cf_node_as_block(node)->index;1002case nir_cf_node_if:1003assert(0); /* XXX could this ever happen? */1004return 0;1005case nir_cf_node_loop:1006return loop_last_block(&nir_cf_node_as_loop(node)->body);1007default:1008compile_error(ctx, "Not supported\n");1009return 0;1010}1011}10121013static void1014emit_loop(struct ir2_context *ctx, nir_loop *nloop)1015{1016ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);1017emit_cf_list(ctx, &nloop->body);1018ctx->loop_depth--;1019}10201021static bool1022emit_cf_list(struct ir2_context *ctx, struct exec_list *list)1023{1024bool ret = false;1025foreach_list_typed (nir_cf_node, node, node, list) {1026ret = false;1027switch (node->type) {1028case nir_cf_node_block:1029ret = emit_block(ctx, nir_cf_node_as_block(node));1030break;1031case nir_cf_node_if:1032emit_if(ctx, nir_cf_node_as_if(node));1033break;1034case nir_cf_node_loop:1035emit_loop(ctx, nir_cf_node_as_loop(node));1036break;1037case nir_cf_node_function:1038compile_error(ctx, "Not supported\n");1039break;1040}1041}1042return ret;1043}10441045static void1046cleanup_binning(struct ir2_context *ctx)1047{1048assert(ctx->so->type == MESA_SHADER_VERTEX);10491050/* kill non-position outputs for binning variant */1051nir_foreach_block (block, nir_shader_get_entrypoint(ctx->nir)) {1052nir_foreach_instr_safe (instr, block) {1053if (instr->type != nir_instr_type_intrinsic)1054continue;10551056nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);1057if (intr->intrinsic != nir_intrinsic_store_output)1058continue;10591060if (output_slot(ctx, intr) != VARYING_SLOT_POS)1061nir_instr_remove(instr);1062}1063}10641065ir2_optimize_nir(ctx->nir, false);1066}10671068static bool1069ir2_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)1070{1071if (instr->type != nir_instr_type_alu)1072return false;10731074nir_alu_instr *alu = 
nir_instr_as_alu(instr);1075switch (alu->op) {1076case nir_op_frsq:1077case nir_op_frcp:1078case nir_op_flog2:1079case nir_op_fexp2:1080case nir_op_fsqrt:1081case nir_op_fcos:1082case nir_op_fsin:1083return true;1084default:1085break;1086}10871088return false;1089}10901091void1092ir2_nir_compile(struct ir2_context *ctx, bool binning)1093{1094struct fd2_shader_stateobj *so = ctx->so;10951096memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));10971098ctx->nir = nir_shader_clone(NULL, so->nir);10991100if (binning)1101cleanup_binning(ctx);11021103OPT_V(ctx->nir, nir_copy_prop);1104OPT_V(ctx->nir, nir_opt_dce);1105OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons);11061107OPT_V(ctx->nir, nir_lower_int_to_float);1108OPT_V(ctx->nir, nir_lower_bool_to_float);1109while (OPT(ctx->nir, nir_opt_algebraic))1110;1111OPT_V(ctx->nir, nir_opt_algebraic_late);1112OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);11131114OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL);11151116OPT_V(ctx->nir, nir_lower_locals_to_regs);11171118OPT_V(ctx->nir, nir_convert_from_ssa, true);11191120OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);1121OPT_V(ctx->nir, nir_lower_vec_to_movs, NULL, NULL);11221123OPT_V(ctx->nir, nir_opt_dce);11241125nir_sweep(ctx->nir);11261127if (FD_DBG(DISASM)) {1128debug_printf("----------------------\n");1129nir_print_shader(ctx->nir, stdout);1130debug_printf("----------------------\n");1131}11321133/* fd2_shader_stateobj init */1134if (so->type == MESA_SHADER_FRAGMENT) {1135ctx->f->fragcoord = -1;1136ctx->f->inputs_count = 0;1137memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));1138}11391140/* Setup inputs: */1141nir_foreach_shader_in_variable (in, ctx->nir)1142setup_input(ctx, in);11431144if (so->type == MESA_SHADER_FRAGMENT) {1145unsigned idx;1146for (idx = 0; idx < ctx->f->inputs_count; idx++) {1147ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;1148update_range(ctx, &ctx->input[idx]);1149}1150/* assume we have param input 
and kill it later if not */1151ctx->input[idx].ncomp = 4;1152update_range(ctx, &ctx->input[idx]);1153} else {1154ctx->input[0].ncomp = 1;1155ctx->input[2].ncomp = 1;1156update_range(ctx, &ctx->input[0]);1157update_range(ctx, &ctx->input[2]);1158}11591160/* And emit the body: */1161nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);11621163nir_foreach_register (reg, &fxn->registers) {1164ctx->reg[reg->index].ncomp = reg->num_components;1165ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);1166}11671168nir_metadata_require(fxn, nir_metadata_block_index);1169emit_cf_list(ctx, &fxn->body);1170/* TODO emit_block(ctx, fxn->end_block); */11711172if (so->type == MESA_SHADER_VERTEX)1173extra_position_exports(ctx, binning);11741175ralloc_free(ctx->nir);11761177/* kill unused param input */1178if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)1179ctx->input[ctx->f->inputs_count].initialized = false;1180}118111821183