/* Path: src/gallium/drivers/freedreno/a2xx/ir2.c */
/*
 * Copyright (C) 2018 Jonathan Marek <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <[email protected]>
 */

#include "ir2_private.h"

/* Can this instruction be emitted in the scalar ALU slot?
 * It needs a scalar opcode and exactly one source component.
 */
static bool
scalar_possible(struct ir2_instr *instr)
{
   if (instr->alu.scalar_opc == SCALAR_NONE)
      return false;

   return src_ncomp(instr) == 1;
}

/* Can scalar instruction 'b' be co-issued with vector instruction 'a'?
 * 'a' may be NULL, meaning no vector instruction is paired yet.
 */
static bool
is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
{
   if (!a)
      return true;

   /* dont use same instruction twice */
   if (a == b)
      return false;

   /* PRED_SET must be alone */
   if (b->alu.scalar_opc >= PRED_SETEs &&
       b->alu.scalar_opc <= PRED_SET_RESTOREs)
      return false;

   /* must write to same export (issues otherwise?) */
   return a->alu.export == b->alu.export;
}

/* priority of vector instruction for scheduling (lower=higher prio) */
static unsigned
alu_vector_prio(struct ir2_instr *instr)
{
   /* no vector opcode: cannot go in the vector slot at all */
   if (instr->alu.vector_opc == VECTOR_NONE)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* TODO check src type and ncomps */
   if (instr->src_count == 3)
      return 0;

   /* vector-only instructions before ones that could also go scalar */
   if (!scalar_possible(instr))
      return 1;

   return instr->src_count == 2 ? 2 : 3;
}

/* priority of scalar instruction for scheduling (lower=higher prio) */
static unsigned
alu_scalar_prio(struct ir2_instr *instr)
{
   if (!scalar_possible(instr))
      return ~0u;

   /* this case is dealt with later */
   if (instr->src_count > 1)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* PRED to end of block */
   if (instr->alu.scalar_opc >= PRED_SETEs &&
       instr->alu.scalar_opc <= PRED_SET_RESTOREs)
      return 5;

   /* scalar only have highest priority */
   return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
}

/* this is a bit messy:
 * we want to find a slot where we can insert a scalar MOV with
 * a vector instruction that was already scheduled
 *
 * Walks backwards over already-scheduled slots (within block_idx, and not
 * past the definition of src1 when it is an SSA value) tracking which of
 * the 4 components of register reg_idx are still free; reg_state packs
 * 4 bits per register, 8 registers per word (reg_idx / 8 selects the word,
 * reg_idx % 8 * 4 the nibble). On success returns the chosen slot, writes
 * the free component index to *comp, and marks that component busy in all
 * later slots; returns NULL if no slot fits.
 */
static struct ir2_sched_instr *
insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
       struct ir2_src src1, unsigned *comp)
{
   struct ir2_sched_instr *sched = NULL, *s;
   unsigned i, mask = 0xf;

   /* go first earliest point where the mov can be inserted */
   for (i = ctx->instr_sched_count - 1; i > 0; i--) {
      s = &ctx->instr_sched[i - 1];

      /* can't move across a block boundary */
      if (s->instr && s->instr->block_idx != block_idx)
         break;
      if (s->instr_s && s->instr_s->block_idx != block_idx)
         break;

      /* can't move above the instruction that produces src1 */
      if (src1.type == IR2_SRC_SSA) {
         if ((s->instr && s->instr->idx == src1.num) ||
             (s->instr_s && s->instr_s->idx == src1.num))
            break;
      }

      /* mr = components of reg_idx NOT live at this slot */
      unsigned mr = ~(s->reg_state[reg_idx / 8] >> reg_idx % 8 * 4 & 0xf);
      if ((mask & mr) == 0)
         break;

      mask &= mr;
      /* slot must have a free scalar lane and room for the extra src */
      if (s->instr_s || s->instr->src_count == 3)
         continue;

      if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
         continue;

      sched = s;
   }
   /* lowest free component across all slots considered */
   *comp = ffs(mask) - 1;

   if (sched) {
      /* reserve the chosen component from the insertion point onwards */
      for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++)
         s->reg_state[reg_idx / 8] |= 1 << (*comp + reg_idx % 8 * 4);
   }

   return sched;
}

/* case1:
 * in this case, insert a mov to place the 2nd src into to same reg
 * (scalar sources come from the same register)
 *
 * this is a common case which works when one of the srcs is input/const
 * but for instrs which have 2 ssa/reg srcs, then its not ideal
 *
 * 'order' selects which of the two sources plays the src0 role; the caller
 * tries both orderings. Returns true when the MOV was inserted and 'instr'
 * was rewritten to a single-src scalar form.
 */
static bool
scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
{
   struct ir2_src src0 = instr->src[order];
   struct ir2_src src1 = instr->src[!order];
   struct ir2_sched_instr *sched;
   struct ir2_instr *ins;
   struct ir2_reg *reg;
   unsigned idx, comp;

   /* src0 must be something whose register we can overwrite */
   switch (src0.type) {
   case IR2_SRC_CONST:
   case IR2_SRC_INPUT:
      return false;
   default:
      break;
   }

   /* TODO, insert needs logic for this */
   if (src1.type == IR2_SRC_REG)
      return false;

   /* we could do something if they match src1.. */
   if (src0.negate || src0.abs)
      return false;

   reg = get_reg_src(ctx, &src0);

   /* result not used more since we will overwrite */
   for (int i = 0; i < 4; i++)
      if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
         return false;

   /* find a place to insert the mov */
   sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
   if (!sched)
      return false;

   /* build the scalar MOV (MAXs with a single src acts as a mov) that
    * copies src1 into component 'comp' of src0's register
    */
   ins = &ctx->instr[idx = ctx->instr_count++];
   ins->idx = idx;
   ins->type = IR2_ALU;
   ins->src[0] = src1;
   ins->src_count = 1;
   ins->is_ssa = true;
   ins->ssa.idx = reg->idx;
   ins->ssa.ncomp = 1;
   ins->ssa.comp[0].c = comp;
   ins->alu.scalar_opc = MAXs;
   ins->alu.export = -1;
   ins->alu.write_mask = 1;
   ins->pred = instr->pred;
   ins->block_idx = instr->block_idx;

   /* both sources now come from the same register */
   instr->src[0] = src0;
   instr->alu.src1_swizzle = comp;

   sched->instr_s = ins;
   return true;
}

/* fill sched with next fetch or (vector and/or scalar) alu instruction
 * Returns the block index of the scheduled instruction(s), or -1 when
 * nothing is left to emit.
 */
static int
sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
{
   struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
   unsigned avail_count = 0;

   instr_alloc_type_t export = ~0u;
   int block_idx = -1;

   /* XXX merge this loop with the other one somehow? */
   /* find the lowest-numbered export buffer still pending emission */
   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;
      if (is_export(instr))
         export = MIN2(export, export_buf(instr->alu.export));
   }

   /* collect every instruction that is ready to be scheduled */
   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;

      /* dont mix exports */
      if (is_export(instr) && export_buf(instr->alu.export) != export)
         continue;

      if (block_idx < 0)
         block_idx = instr->block_idx;
      else if (block_idx != instr->block_idx || /* must be same block */
               instr->type == IR2_CF ||         /* CF/MEM must be alone */
               (is_export(instr) && export == SQ_MEMORY))
         break;
      /* it works because IR2_CF is always at end of block
       * and somewhat same idea with MEM exports, which might not be alone
       * but will end up in-order at least
       */

      /* check if dependencies are satisfied */
      bool is_ok = true;
      ir2_foreach_src (src, instr) {
         if (src->type == IR2_SRC_REG) {
            /* need to check if all previous instructions in the block
             * which write the reg have been emitted
             * slow..
             * XXX: check components instead of whole register
             */
            struct ir2_reg *reg = get_reg_src(ctx, src);
            ir2_foreach_instr (p, ctx) {
               if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
                  is_ok &= !p->need_emit;
            }
         } else if (src->type == IR2_SRC_SSA) {
            /* in this case its easy, just check need_emit */
            is_ok &= !ctx->instr[src->num].need_emit;
         }
      }
      /* don't reorder non-ssa write before read */
      if (!instr->is_ssa) {
         ir2_foreach_instr (p, ctx) {
            if (!p->need_emit || p->idx >= instr->idx)
               continue;

            ir2_foreach_src (src, p) {
               if (get_reg_src(ctx, src) == instr->reg)
                  is_ok = false;
            }
         }
      }
      /* don't reorder across predicates */
      if (avail_count && instr->pred != avail[0]->pred)
         is_ok = false;

      if (!is_ok)
         continue;

      avail[avail_count++] = instr;
   }

   if (!avail_count) {
      assert(block_idx == -1);
      return -1;
   }

   /* priority to FETCH instructions */
   ir2_foreach_avail (instr) {
      if (instr->type == IR2_ALU)
         continue;

      ra_src_free(ctx, instr);
      ra_reg(ctx, get_reg(instr), -1, false, 0);

      instr->need_emit = false;
      sched->instr = instr;
      sched->instr_s = NULL;
      return block_idx;
   }

   /* TODO precompute priorities */

   /* pick the best vector-slot candidate */
   unsigned prio_v = ~0u, prio_s = ~0u, prio;
   ir2_foreach_avail (instr) {
      prio = alu_vector_prio(instr);
      if (prio < prio_v) {
         instr_v = instr;
         prio_v = prio;
      }
   }

   /* TODO can still insert scalar if src_count=3, if smart about it */
   /* pick a scalar-slot candidate; an incompatible scalar may still win the
    * whole slot (instr_v is dropped) when its priority beats the vector's
    */
   if (!instr_v || instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         bool compat = is_alu_compatible(instr_v, instr);

         prio = alu_scalar_prio(instr);
         if (prio >= prio_v && !compat)
            continue;

         if (prio < prio_s) {
            instr_s = instr;
            prio_s = prio;
            if (!compat)
               instr_v = NULL;
         }
      }
   }

   assert(instr_v || instr_s);

   /* now, we try more complex insertion of vector instruction as scalar
    * TODO: if we are smart we can still insert if instr_v->src_count==3
    */
   if (!instr_s && instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
            continue;

         /* at this point, src_count should always be 2 */
         assert(instr->src_count == 2);

         /* try both source orderings for the helper MOV */
         if (scalarize_case1(ctx, instr, 0)) {
            instr_s = instr;
            break;
         }
         if (scalarize_case1(ctx, instr, 1)) {
            instr_s = instr;
            break;
         }
      }
   }

   /* free src registers */
   if (instr_v) {
      instr_v->need_emit = false;
      ra_src_free(ctx, instr_v);
   }

   if (instr_s) {
      instr_s->need_emit = false;
      ra_src_free(ctx, instr_s);
   }

   /* allocate dst registers */
   if (instr_v)
      ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v),
             instr_v->alu.write_mask);

   if (instr_s)
      ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s),
             instr_s->alu.write_mask);

   sched->instr = instr_v;
   sched->instr_s = instr_s;
   return block_idx;
}

/* scheduling: determine order of instructions */
static void
schedule_instrs(struct ir2_context *ctx)
{
   struct ir2_sched_instr *sched;
   int block_idx;

   /* allocate input registers */
   for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
      if (ctx->input[idx].initialized)
         ra_reg(ctx, &ctx->input[idx], idx, false, 0);

   for (;;) {
      sched = &ctx->instr_sched[ctx->instr_sched_count++];
      block_idx = sched_next(ctx, sched);
      if (block_idx < 0)
         break;
      /* snapshot register state so insert() can rewind per-slot */
      memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));

      /* catch texture fetch after scheduling and insert the
       * SET_TEX_LOD right before it if necessary
       * TODO clean this up
       */
      struct ir2_instr *instr = sched->instr, *tex_lod;
      if (instr && instr->type == IR2_FETCH && instr->fetch.opc == TEX_FETCH &&
          instr->src_count == 2) {
         /* generate the SET_LOD instruction */
         tex_lod = &ctx->instr[ctx->instr_count++];
         tex_lod->type = IR2_FETCH;
         tex_lod->block_idx = instr->block_idx;
         tex_lod->pred = instr->pred;
         tex_lod->fetch.opc = TEX_SET_TEX_LOD;
         tex_lod->src[0] = instr->src[1];
         tex_lod->src_count = 1;

         /* shift the fetch down one slot; SET_TEX_LOD takes its place */
         sched[1] = sched[0];
         sched->instr = tex_lod;
         ctx->instr_sched_count++;
      }

      /* release the block's registers once nothing in it remains */
      bool free_block = true;
      ir2_foreach_instr (instr, ctx)
         free_block &= instr->block_idx != block_idx;
      if (free_block)
         ra_block_free(ctx, block_idx);
   };
   /* the loop always over-allocates one empty trailing slot */
   ctx->instr_sched_count--;
}

/* Compile one shader variant: NIR -> ir2 -> a2xx bitcode.
 * 'fp' is the paired fragment shader when compiling a vertex shader
 * (NULL selects the binning variant for vertex shaders).
 */
void
ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
            struct fd2_shader_stateobj *fp)
{
   struct ir2_context ctx = {};
   bool binning = !fp && so->type == MESA_SHADER_VERTEX;

   /* link against the fragment shader's input layout */
   if (fp)
      so->variant[variant].f = fp->variant[0].f;

   ctx.so = so;
   ctx.info = &so->variant[variant].info;
   ctx.f = &so->variant[variant].f;
   ctx.info->max_reg = -1;

   /* convert nir to internal representation */
   ir2_nir_compile(&ctx, binning);

   /* copy propagate srcs */
   cp_src(&ctx);

   /* get ref_counts and kill non-needed instructions */
   ra_count_refs(&ctx);

   /* remove movs used to write outputs */
   cp_export(&ctx);

   /* instruction order.. and vector->scalar conversions */
   schedule_instrs(&ctx);

   /* finally, assemble to bitcode */
   assemble(&ctx, binning);
}