/* Path: blob/21.2-virgl/src/gallium/drivers/freedreno/a2xx/ir2_assemble.c */
/*1* Copyright (C) 2018 Jonathan Marek <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,19* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE20* SOFTWARE.21*22* Authors:23* Jonathan Marek <[email protected]>24*/2526#include "ir2_private.h"2728static unsigned29src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)30{31struct ir2_reg_component *comps;32unsigned swiz = 0;3334switch (src->type) {35case IR2_SRC_SSA:36case IR2_SRC_REG:37break;38default:39return src->swizzle;40}41/* we need to take into account where the components were allocated */42comps = get_reg_src(ctx, src)->comp;43for (int i = 0; i < ncomp; i++) {44swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i);45}46return swiz;47}4849/* alu instr need to take into how the output components are allocated */5051/* scalar doesn't need to take into account dest swizzle */5253static unsigned54alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg)55{56/* hardware seems to take from W, but swizzle 
everywhere just in case */57return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX);58}5960static unsigned61alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr,62struct ir2_src *src)63{64struct ir2_reg_component *comp = get_reg(instr)->comp;65unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr));66unsigned swiz = 0;6768/* non per component special cases */69switch (instr->alu.vector_opc) {70case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv:71return alu_swizzle_scalar(ctx, src);72case DOT2ADDv:73case DOT3v:74case DOT4v:75case CUBEv:76return swiz0;77default:78break;79}8081for (int i = 0, j = 0; i < dst_ncomp(instr); j++) {82if (instr->alu.write_mask & 1 << j) {83if (comp[j].c != 7)84swiz |= swiz_set(i, comp[j].c);85i++;86}87}88return swiz_merge(swiz0, swiz);89}9091static unsigned92alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1)93{94/* hardware seems to take from ZW, but swizzle everywhere (ABAB) */95unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0);96return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY);97}9899/* write_mask needs to be transformed by allocation information */100101static unsigned102alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr)103{104struct ir2_reg_component *comp = get_reg(instr)->comp;105unsigned write_mask = 0;106107for (int i = 0; i < 4; i++) {108if (instr->alu.write_mask & 1 << i)109write_mask |= 1 << comp[i].c;110}111112return write_mask;113}114115/* fetch instructions can swizzle dest, but src swizzle needs conversion */116117static unsigned118fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)119{120unsigned alu_swiz = src_swizzle(ctx, src, ncomp);121unsigned swiz = 0;122for (int i = 0; i < ncomp; i++)123swiz |= swiz_get(alu_swiz, i) << i * 2;124return swiz;125}126127static unsigned128fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr)129{130struct ir2_reg_component *comp = get_reg(instr)->comp;131unsigned dst_swiz = 
/* register / export # for instr */
static unsigned
dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr)
{
   if (is_export(instr))
      return instr->alu.export;

   return get_reg(instr)->idx;
}

/* register # for src */
static unsigned
src_to_reg(struct ir2_context *ctx, struct ir2_src *src)
{
   return get_reg_src(ctx, src)->idx;
}

/* encode a source operand as the "register byte" used by ALU instrs:
 * the const # for const sources (no abs bit available there), otherwise
 * the register # with 0x80 as the abs bit */
static unsigned
src_reg_byte(struct ir2_context *ctx, struct ir2_src *src)
{
   if (src->type == IR2_SRC_CONST) {
      assert(!src->abs); /* no abs bit for const */
      return src->num;
   }
   return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0);
}

/* produce the 12 byte binary instruction for a given sched_instr:
 * either a single fetch instruction, or a co-scheduled vector (instr)
 * + scalar (instr_s) ALU pair.  *is_fetch tells the caller which. */
static void
fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched, instr_t *bc,
           bool *is_fetch)
{
   struct ir2_instr *instr = sched->instr, *instr_s, *instr_v;

   *bc = (instr_t){};

   if (instr && instr->type == IR2_FETCH) {
      *is_fetch = true;

      bc->fetch.opc = instr->fetch.opc;
      bc->fetch.pred_select = !!instr->pred;
      bc->fetch.pred_condition = instr->pred & 1;

      struct ir2_src *src = instr->src;

      if (instr->fetch.opc == VTX_FETCH) {
         instr_fetch_vtx_t *vtx = &bc->fetch.vtx;

         assert(instr->fetch.vtx.const_idx <= 0x1f);
         assert(instr->fetch.vtx.const_idx_sel <= 0x3);

         vtx->src_reg = src_to_reg(ctx, src);
         vtx->src_swiz = fetch_swizzle(ctx, src, 1);
         vtx->dst_reg = dst_to_reg(ctx, instr);
         vtx->dst_swiz = fetch_dst_swiz(ctx, instr);

         vtx->must_be_one = 1;
         vtx->const_index = instr->fetch.vtx.const_idx;
         vtx->const_index_sel = instr->fetch.vtx.const_idx_sel;

         /* other fields will be patched */

         /* XXX seems like every FETCH but the first has
          * this bit set:
          */
         vtx->reserved3 = instr->idx ? 0x1 : 0x0;
         vtx->reserved0 = instr->idx ? 0x2 : 0x3;
      } else if (instr->fetch.opc == TEX_FETCH) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 3);
         tex->dst_reg = dst_to_reg(ctx, instr);
         tex->dst_swiz = fetch_dst_swiz(ctx, instr);
         /* tex->const_idx = patch_fetches */
         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT;
         tex->use_reg_lod = instr->src_count == 2;
         tex->sample_location = SAMPLE_CENTER;
         tex->tx_coord_denorm = instr->fetch.tex.is_rect;
      } else if (instr->fetch.opc == TEX_SET_TEX_LOD) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 1);
         tex->dst_reg = 0;
         tex->dst_swiz = 0xfff; /* all dest components masked */

         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = 1;
         tex->use_reg_lod = 0;
         tex->sample_location = SAMPLE_CENTER;
      } else {
         assert(0);
      }
      return;
   }

   instr_v = sched->instr;
   instr_s = sched->instr_s;

   if (instr_v) {
      struct ir2_src src1, src2, *src3;

      /* when there is no second source, src2 reuses src[0] */
      src1 = instr_v->src[0];
      src2 = instr_v->src[instr_v->src_count > 1];
      src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL;

      bc->alu.vector_opc = instr_v->alu.vector_opc;
      bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v);
      bc->alu.vector_dest = dst_to_reg(ctx, instr_v);
      bc->alu.vector_clamp = instr_v->alu.saturate;
      bc->alu.export_data = instr_v->alu.export >= 0;

      /* single operand SETEv, use 0.0f as src2 */
      if (instr_v->src_count == 1 &&
          (bc->alu.vector_opc == SETEv || bc->alu.vector_opc == SETNEv ||
           bc->alu.vector_opc == SETGTv || bc->alu.vector_opc == SETGTEv))
         src2 = ir2_zero(ctx);

      /* export32 instr for a20x hw binning has this bit set..
       * it seems to do more than change the base address of constants
       * XXX this is a hack
       */
      bc->alu.relative_addr =
         (bc->alu.export_data && bc->alu.vector_dest == 32);

      bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1);
      bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1);
      bc->alu.src1_reg_negate = src1.negate;
      bc->alu.src1_sel = src1.type != IR2_SRC_CONST;

      bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2);
      bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2);
      bc->alu.src2_reg_negate = src2.negate;
      bc->alu.src2_sel = src2.type != IR2_SRC_CONST;

      if (src3) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src3);
         bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3);
         bc->alu.src3_reg_negate = src3->negate;
         bc->alu.src3_sel = src3->type != IR2_SRC_CONST;
      }

      bc->alu.pred_select = instr_v->pred;
   }

   if (instr_s) {
      struct ir2_src *src = instr_s->src;

      bc->alu.scalar_opc = instr_s->alu.scalar_opc;
      bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s);
      bc->alu.scalar_dest = dst_to_reg(ctx, instr_s);
      bc->alu.scalar_clamp = instr_s->alu.saturate;
      bc->alu.export_data = instr_s->alu.export >= 0;

      /* scalar operands are encoded in the src3 slot */
      if (instr_s->src_count == 1) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      } else {
         assert(instr_s->src_count == 2);

         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz =
            alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
         ; /* XXX stray empty statement (harmless) */
      }

      /* the paired vector instr must agree on predication */
      if (instr_v)
         assert(instr_s->pred == instr_v->pred);
      bc->alu.pred_select = instr_s->pred;
   }

   *is_fetch = false;
   return;
}

/* Append the pending CF pair (optional ALLOC followed by the accumulated
 * EXEC) to the cfs array, then reset the exec accumulator for the next
 * group.  Returns the updated cf index. */
static unsigned
write_cfs(struct ir2_context *ctx, instr_cf_t *cfs, unsigned cf_idx,
          instr_cf_alloc_t *alloc, instr_cf_exec_t *exec)
{
   assert(exec->count);

   if (alloc)
      cfs[cf_idx++].alloc = *alloc;

   /* for memory alloc offset for patching */
   if (alloc && alloc->buffer_select == SQ_MEMORY &&
       ctx->info->mem_export_ptr == -1)
      ctx->info->mem_export_ptr = cf_idx / 2 * 3;

   cfs[cf_idx++].exec = *exec;
   exec->address += exec->count;
   exec->serialize = 0;
   exec->count = 0;

   return cf_idx;
}

/* assemble the final shader: lay out CF (control flow) instructions
 * followed by the ALU/fetch bytecode, patch addresses, and store the
 * result in ctx->info.
 * NOTE(review): the 'binning' parameter is not referenced anywhere in
 * this function — confirm whether it is still needed by callers. */
void
assemble(struct ir2_context *ctx, bool binning)
{
   /* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384)
    * address is 9 bits so could it be 512 ?
    */
   instr_cf_t cfs[384];
   instr_t bytecode[384], bc;
   unsigned block_addr[128]; /* cf address of each block, for jmp patching */
   unsigned num_cf = 0;

   /* CF instr state */
   instr_cf_exec_t exec = {.opc = EXEC};
   instr_cf_alloc_t alloc = {.opc = ALLOC};

   /* sync ids: 0 = ALU, 1 = VTX_FETCH, 2 = TEX_FETCH; a change requires
    * a sync bit in the exec cf */
   int sync_id, sync_id_prev = -1;
   bool is_fetch = false;
   bool need_sync = true;
   bool need_alloc = false;
   unsigned block_idx = 0;

   ctx->info->mem_export_ptr = -1;
   ctx->info->num_fetch_instrs = 0;

   /* a vertex shader always needs to allocate at least one parameter
    * export, even if it will never write one */
   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) {
      alloc.buffer_select = SQ_PARAMETER_PIXEL;
      cfs[num_cf++].alloc = alloc;
   }

   block_addr[0] = 0;

   /* i counts emitted bytecode instrs, j walks the schedule */
   for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) {
      struct ir2_instr *instr = ctx->instr_sched[j].instr;

      /* catch IR2_CF since it isn't a regular instruction */
      if (instr && instr->type == IR2_CF) {
         assert(!need_alloc); /* XXX */

         /* flush any exec cf before inserting jmp */
         if (exec.count)
            num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec);

         cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t){
            .opc = COND_JMP,
            .address = instr->cf.block_idx, /* will be fixed later */
            .force_call = !instr->pred,
            .predicated_jmp = 1,
            .direction = instr->cf.block_idx > instr->block_idx,
            .condition = instr->pred & 1,
         };
         continue;
      }

      /* fill the 3 dwords for the instruction */
      fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch);

      /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */
      sync_id = 0;
      if (is_fetch)
         sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2;

      need_sync = sync_id != sync_id_prev;
      sync_id_prev = sync_id;

      /* block of this slot: from the vector instr if present, else from
       * the scalar instr */
      unsigned block;
      {
         if (ctx->instr_sched[j].instr)
            block = ctx->instr_sched[j].instr->block_idx;
         else
            block = ctx->instr_sched[j].instr_s->block_idx;

         assert(block_idx <= block);
      }

      /* info for patching */
      if (is_fetch) {
         struct ir2_fetch_info *info =
            &ctx->info->fetch_info[ctx->info->num_fetch_instrs++];
         info->offset = i * 3; /* add cf offset later */

         if (bc.fetch.opc == VTX_FETCH) {
            info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz;
         } else if (bc.fetch.opc == TEX_FETCH) {
            info->tex.samp_id = instr->fetch.tex.samp_id;
            info->tex.src_swiz = bc.fetch.tex.src_swiz;
         } else {
            /* TEX_SET_TEX_LOD etc. don't need patching */
            ctx->info->num_fetch_instrs--;
         }
      }

      /* exec cf after 6 instr or when switching between fetch / alu */
      if (exec.count == 6 ||
          (exec.count && (need_sync || block != block_idx))) {
         num_cf =
            write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
         need_alloc = false;
      }

      /* update block_addrs for jmp patching */
      while (block_idx < block)
         block_addr[++block_idx] = num_cf;

      /* export - fill alloc cf */
      if (!is_fetch && bc.alu.export_data) {
         /* get the export buffer from either vector/scalar dest */
         instr_alloc_type_t buffer = export_buf(bc.alu.vector_dest);
         if (bc.alu.scalar_write_mask) {
            if (bc.alu.vector_write_mask)
               assert(buffer == export_buf(bc.alu.scalar_dest));
            buffer = export_buf(bc.alu.scalar_dest);
         }

         /* flush previous alloc if the buffer changes */
         bool need_new_alloc = buffer != alloc.buffer_select;

         /* memory export always in 32/33 pair, new alloc on 32 */
         if (bc.alu.vector_dest == 32)
            need_new_alloc = true;

         if (need_new_alloc && exec.count) {
            num_cf =
               write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
            need_alloc = false;
         }

         need_alloc |= need_new_alloc;

         alloc.size = 0;
         alloc.buffer_select = buffer;

         if (buffer == SQ_PARAMETER_PIXEL &&
             ctx->so->type == MESA_SHADER_VERTEX)
            alloc.size = ctx->f->inputs_count - 1;

         if (buffer == SQ_POSITION)
            alloc.size = ctx->so->writes_psize;
      }

      /* 2 serialize bits per exec slot: 0x1 = fetch, 0x2 = sync */
      if (is_fetch)
         exec.serialize |= 0x1 << exec.count * 2;
      if (need_sync)
         exec.serialize |= 0x2 << exec.count * 2;

      need_sync = false;
      exec.count += 1;
      bytecode[i++] = bc;
   }

   /* final exec cf */
   exec.opc = EXEC_END;
   num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);

   /* insert nop to get an even # of CFs */
   if (num_cf % 2)
      cfs[num_cf++] = (instr_cf_t){.opc = NOP};

   /* patch cf addrs: exec addresses are relative to the end of the CF
    * section; jmp targets are translated from block # to cf address */
   for (int idx = 0; idx < num_cf; idx++) {
      switch (cfs[idx].opc) {
      case NOP:
      case ALLOC:
         break;
      case EXEC:
      case EXEC_END:
         cfs[idx].exec.address += num_cf / 2;
         break;
      case COND_JMP:
         cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address];
         break;
      default:
         assert(0);
      }
   }

   /* concatenate cfs and alu/fetch */
   uint32_t cfdwords = num_cf / 2 * 3;
   uint32_t alufetchdwords = exec.address * 3;
   uint32_t sizedwords = cfdwords + alufetchdwords;
   uint32_t *dwords = malloc(sizedwords * 4);
   assert(dwords);
   memcpy(dwords, cfs, cfdwords * 4);
   memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4);

   /* finalize ir2_shader_info (ownership of dwords passes to info) */
   ctx->info->dwords = dwords;
   ctx->info->sizedwords = sizedwords;
   for (int i = 0; i < ctx->info->num_fetch_instrs; i++)
      ctx->info->fetch_info[i].offset += cfdwords;

   if (FD_DBG(DISASM)) {
      DBG("disassemble: type=%d", ctx->so->type);
      disasm_a2xx(dwords, sizedwords, 0, ctx->so->type);
   }
}