Path: blob/21.2-virgl/src/panfrost/midgard/midgard_emit.c
4564 views
/*1* Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]>2* Copyright (C) 2019-2020 Collabora, Ltd.3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* the rights to use, copy, modify, merge, publish, distribute, sublicense,8* and/or sell copies of the Software, and to permit persons to whom the9* Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,20* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE21* SOFTWARE.22*/2324#include "compiler.h"25#include "midgard_ops.h"26#include "midgard_quirks.h"2728static midgard_int_mod29mir_get_imod(bool shift, nir_alu_type T, bool half, bool scalar)30{31if (!half) {32assert(!shift);33/* Doesn't matter, src mods are only used when expanding */34return midgard_int_sign_extend;35}3637if (shift)38return midgard_int_left_shift;3940if (nir_alu_type_get_base_type(T) == nir_type_int)41return midgard_int_sign_extend;42else43return midgard_int_zero_extend;44}4546void47midgard_pack_ubo_index_imm(midgard_load_store_word *word, unsigned index)48{49word->arg_comp = index & 0x3;50word->arg_reg = (index >> 2) & 0x7;51word->bitsize_toggle = (index >> 5) & 0x1;52word->index_format = (index >> 6) & 0x3;53}5455unsigned56midgard_unpack_ubo_index_imm(midgard_load_store_word word)57{58unsigned ubo = word.arg_comp |59(word.arg_reg << 2) |60(word.bitsize_toggle << 5) |61(word.index_format << 6);6263return ubo;64}6566void midgard_pack_varying_params(midgard_load_store_word *word, midgard_varying_params p)67{68/* Currently these parameters are not supported. */69assert(p.direct_sample_pos_x == 0 && p.direct_sample_pos_y == 0);7071unsigned u;72memcpy(&u, &p, sizeof(p));7374word->signed_offset |= u & 0x1FF;75}7677midgard_varying_params midgard_unpack_varying_params(midgard_load_store_word word)78{79unsigned params = word.signed_offset & 0x1FF;8081midgard_varying_params p;82memcpy(&p, ¶ms, sizeof(p));8384return p;85}8687unsigned88mir_pack_mod(midgard_instruction *ins, unsigned i, bool scalar)89{90bool integer = midgard_is_integer_op(ins->op);91unsigned base_size = max_bitsize_for_alu(ins);92unsigned sz = nir_alu_type_get_type_size(ins->src_types[i]);93bool half = (sz == (base_size >> 1));9495return integer ?96mir_get_imod(ins->src_shift[i], ins->src_types[i], half, scalar) :97((ins->src_abs[i] << 0) |98((ins->src_neg[i] << 1)));99}100101/* Midgard IR only knows vector ALU types, but we sometimes need to actually102* use scalar ALU instructions, for functional or performance reasons. To do103* this, we just demote vector ALU payloads to scalar. */104105static int106component_from_mask(unsigned mask)107{108for (int c = 0; c < 8; ++c) {109if (mask & (1 << c))110return c;111}112113assert(0);114return 0;115}116117static unsigned118mir_pack_scalar_source(unsigned mod, bool is_full, unsigned component)119{120midgard_scalar_alu_src s = {121.mod = mod,122.full = is_full,123.component = component << (is_full ? 1 : 0)124};125126unsigned o;127memcpy(&o, &s, sizeof(s));128129return o & ((1 << 6) - 1);130}131132static midgard_scalar_alu133vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins)134{135bool is_full = nir_alu_type_get_type_size(ins->dest_type) == 32;136137bool half_0 = nir_alu_type_get_type_size(ins->src_types[0]) == 16;138bool half_1 = nir_alu_type_get_type_size(ins->src_types[1]) == 16;139unsigned comp = component_from_mask(ins->mask);140141unsigned packed_src[2] = {142mir_pack_scalar_source(mir_pack_mod(ins, 0, true), !half_0, ins->swizzle[0][comp]),143mir_pack_scalar_source(mir_pack_mod(ins, 1, true), !half_1, ins->swizzle[1][comp])144};145146/* The output component is from the mask */147midgard_scalar_alu s = {148.op = v.op,149.src1 = packed_src[0],150.src2 = packed_src[1],151.unknown = 0,152.outmod = v.outmod,153.output_full = is_full,154.output_component = comp155};156157/* Full components are physically spaced out */158if (is_full) {159assert(s.output_component < 4);160s.output_component <<= 1;161}162163/* Inline constant is passed along rather than trying to extract it164* from v */165166if (ins->has_inline_constant) {167uint16_t imm = 0;168int lower_11 = ins->inline_constant & ((1 << 12) - 1);169imm |= (lower_11 >> 9) & 3;170imm |= (lower_11 >> 6) & 4;171imm |= (lower_11 >> 2) & 0x38;172imm |= (lower_11 & 63) << 6;173174s.src2 = imm;175}176177return s;178}179180/* 64-bit swizzles are super easy since there are 2 components of 2 components181* in an 8-bit field ... lots of duplication to go around!182*183* Swizzles of 32-bit vectors accessed from 64-bit instructions are a little184* funny -- pack them *as if* they were native 64-bit, using rep_* flags to185* flag upper. For instance, xy would become 64-bit XY but that's just xyzw186* native. Likewise, zz would become 64-bit XX with rep* so it would be xyxy187* with rep. Pretty nifty, huh? */188189static unsigned190mir_pack_swizzle_64(unsigned *swizzle, unsigned max_component)191{192unsigned packed = 0;193194for (unsigned i = 0; i < 2; ++i) {195assert(swizzle[i] <= max_component);196197unsigned a = (swizzle[i] & 1) ?198(COMPONENT_W << 2) | COMPONENT_Z :199(COMPONENT_Y << 2) | COMPONENT_X;200201packed |= a << (i * 4);202}203204return packed;205}206207static void208mir_pack_mask_alu(midgard_instruction *ins, midgard_vector_alu *alu)209{210unsigned effective = ins->mask;211212/* If we have a destination override, we need to figure out whether to213* override to the lower or upper half, shifting the effective mask in214* the latter, so AAAA.... becomes AAAA */215216unsigned inst_size = max_bitsize_for_alu(ins);217signed upper_shift = mir_upper_override(ins, inst_size);218219if (upper_shift >= 0) {220effective >>= upper_shift;221alu->shrink_mode = upper_shift ?222midgard_shrink_mode_upper :223midgard_shrink_mode_lower;224} else {225alu->shrink_mode = midgard_shrink_mode_none;226}227228if (inst_size == 32)229alu->mask = expand_writemask(effective, 2);230else if (inst_size == 64)231alu->mask = expand_writemask(effective, 1);232else233alu->mask = effective;234}235236static unsigned237mir_pack_swizzle(unsigned mask, unsigned *swizzle,238unsigned sz, unsigned base_size,239bool op_channeled, midgard_src_expand_mode *expand_mode)240{241unsigned packed = 0;242243*expand_mode = midgard_src_passthrough;244245midgard_reg_mode reg_mode = reg_mode_for_bitsize(base_size);246247if (reg_mode == midgard_reg_mode_64) {248assert(sz == 64 || sz == 32);249unsigned components = (sz == 32) ? 4 : 2;250251packed = mir_pack_swizzle_64(swizzle, components);252253if (sz == 32) {254bool lo = swizzle[0] >= COMPONENT_Z;255bool hi = swizzle[1] >= COMPONENT_Z;256257if (mask & 0x1) {258/* We can't mix halves... */259if (mask & 2)260assert(lo == hi);261262*expand_mode = lo ? midgard_src_expand_high :263midgard_src_expand_low;264} else {265*expand_mode = hi ? midgard_src_expand_high :266midgard_src_expand_low;267}268} else if (sz < 32) {269unreachable("Cannot encode 8/16 swizzle in 64-bit");270}271} else {272/* For 32-bit, swizzle packing is stupid-simple. For 16-bit,273* the strategy is to check whether the nibble we're on is274* upper or lower. We need all components to be on the same275* "side"; that much is enforced by the ISA and should have276* been lowered. TODO: 8-bit packing. TODO: vec8 */277278unsigned first = mask ? ffs(mask) - 1 : 0;279bool upper = swizzle[first] > 3;280281if (upper && mask)282assert(sz <= 16);283284bool dest_up = !op_channeled && (first >= 4);285286for (unsigned c = (dest_up ? 4 : 0); c < (dest_up ? 8 : 4); ++c) {287unsigned v = swizzle[c];288289ASSERTED bool t_upper = v > 3;290291/* Ensure we're doing something sane */292293if (mask & (1 << c)) {294assert(t_upper == upper);295assert(v <= 7);296}297298/* Use the non upper part */299v &= 0x3;300301packed |= v << (2 * (c % 4));302}303304305/* Replicate for now.. should really pick a side for306* dot products */307308if (reg_mode == midgard_reg_mode_16 && sz == 16) {309*expand_mode = upper ? midgard_src_rep_high :310midgard_src_rep_low;311} else if (reg_mode == midgard_reg_mode_16 && sz == 8) {312if (base_size == 16) {313*expand_mode = upper ? midgard_src_expand_high :314midgard_src_expand_low;315} else if (upper) {316*expand_mode = midgard_src_swap;317}318} else if (reg_mode == midgard_reg_mode_32 && sz == 16) {319*expand_mode = upper ? midgard_src_expand_high :320midgard_src_expand_low;321} else if (reg_mode == midgard_reg_mode_8) {322unreachable("Unhandled reg mode");323}324}325326return packed;327}328329static void330mir_pack_vector_srcs(midgard_instruction *ins, midgard_vector_alu *alu)331{332bool channeled = GET_CHANNEL_COUNT(alu_opcode_props[ins->op].props);333334unsigned base_size = max_bitsize_for_alu(ins);335336for (unsigned i = 0; i < 2; ++i) {337if (ins->has_inline_constant && (i == 1))338continue;339340if (ins->src[i] == ~0)341continue;342343unsigned sz = nir_alu_type_get_type_size(ins->src_types[i]);344assert((sz == base_size) || (sz == base_size / 2));345346midgard_src_expand_mode expand_mode = midgard_src_passthrough;347unsigned swizzle = mir_pack_swizzle(ins->mask, ins->swizzle[i],348sz, base_size, channeled,349&expand_mode);350351midgard_vector_alu_src pack = {352.mod = mir_pack_mod(ins, i, false),353.expand_mode = expand_mode,354.swizzle = swizzle355};356357unsigned p = vector_alu_srco_unsigned(pack);358359if (i == 0)360alu->src1 = p;361else362alu->src2 = p;363}364}365366static void367mir_pack_swizzle_ldst(midgard_instruction *ins)368{369/* TODO: non-32-bit, non-vec4 */370for (unsigned c = 0; c < 4; ++c) {371unsigned v = ins->swizzle[0][c];372373/* Check vec4 */374assert(v <= 3);375376ins->load_store.swizzle |= v << (2 * c);377}378379/* TODO: arg_1/2 */380}381382static void383mir_pack_swizzle_tex(midgard_instruction *ins)384{385for (unsigned i = 0; i < 2; ++i) {386unsigned packed = 0;387388for (unsigned c = 0; c < 4; ++c) {389unsigned v = ins->swizzle[i][c];390391/* Check vec4 */392assert(v <= 3);393394packed |= v << (2 * c);395}396397if (i == 0)398ins->texture.swizzle = packed;399else400ins->texture.in_reg_swizzle = packed;401}402403/* TODO: bias component */404}405406/* Up to 3 { ALU, LDST } bundles can execute in parallel with a texture op.407* Given a texture op, lookahead to see how many such bundles we can flag for408* OoO execution */409410static bool411mir_can_run_ooo(midgard_block *block, midgard_bundle *bundle,412unsigned dependency)413{414/* Don't read out of bounds */415if (bundle >= (midgard_bundle *) ((char *) block->bundles.data + block->bundles.size))416return false;417418/* Texture ops can't execute with other texture ops */419if (!IS_ALU(bundle->tag) && bundle->tag != TAG_LOAD_STORE_4)420return false;421422/* Ensure there is no read-after-write dependency */423424for (unsigned i = 0; i < bundle->instruction_count; ++i) {425midgard_instruction *ins = bundle->instructions[i];426427mir_foreach_src(ins, s) {428if (ins->src[s] == dependency)429return false;430}431}432433/* Otherwise, we're okay */434return true;435}436437static void438mir_pack_tex_ooo(midgard_block *block, midgard_bundle *bundle, midgard_instruction *ins)439{440unsigned count = 0;441442for (count = 0; count < 3; ++count) {443if (!mir_can_run_ooo(block, bundle + count + 1, ins->dest))444break;445}446447ins->texture.out_of_order = count;448}449450/* Load store masks are 4-bits. Load/store ops pack for that.451* For most operations, vec4 is the natural mask width; vec8 is constrained to452* be in pairs, vec2 is duplicated. TODO: 8-bit?453* For common stores (i.e. ST.*), each bit masks a single byte in the 32-bit454* case, 2 bytes in the 64-bit case and 4 bytes in the 128-bit case.455*/456457static unsigned458midgard_pack_common_store_mask(midgard_instruction *ins) {459unsigned comp_sz = nir_alu_type_get_type_size(ins->dest_type);460unsigned mask = ins->mask;461unsigned packed = 0;462unsigned nr_comp;463464switch (ins->op) {465case midgard_op_st_u8:466packed |= mask & 1;467break;468case midgard_op_st_u16:469nr_comp = 16 / comp_sz;470for (int i = 0; i < nr_comp; i++) {471if (mask & (1 << i)) {472if (comp_sz == 16)473packed |= 0x3;474else if (comp_sz == 8)475packed |= 1 << i;476}477}478break;479case midgard_op_st_32:480case midgard_op_st_64:481case midgard_op_st_128: {482unsigned total_sz = 32;483if (ins->op == midgard_op_st_128)484total_sz = 128;485else if (ins->op == midgard_op_st_64)486total_sz = 64;487488nr_comp = total_sz / comp_sz;489490/* Each writemask bit masks 1/4th of the value to be stored. */491assert(comp_sz >= total_sz / 4);492493for (int i = 0; i < nr_comp; i++) {494if (mask & (1 << i)) {495if (comp_sz == total_sz)496packed |= 0xF;497else if (comp_sz == total_sz / 2)498packed |= 0x3 << (i * 2);499else if (comp_sz == total_sz / 4)500packed |= 0x1 << i;501}502}503break;504}505default:506unreachable("unexpected ldst opcode");507}508509return packed;510}511512static void513mir_pack_ldst_mask(midgard_instruction *ins)514{515unsigned sz = nir_alu_type_get_type_size(ins->dest_type);516unsigned packed = ins->mask;517518if (OP_IS_COMMON_STORE(ins->op)) {519packed = midgard_pack_common_store_mask(ins);520} else {521if (sz == 64) {522packed = ((ins->mask & 0x2) ? (0x8 | 0x4) : 0) |523((ins->mask & 0x1) ? (0x2 | 0x1) : 0);524} else if (sz == 16) {525packed = 0;526527for (unsigned i = 0; i < 4; ++i) {528/* Make sure we're duplicated */529bool u = (ins->mask & (1 << (2*i + 0))) != 0;530ASSERTED bool v = (ins->mask & (1 << (2*i + 1))) != 0;531assert(u == v);532533packed |= (u << i);534}535} else {536assert(sz == 32);537}538}539540ins->load_store.mask = packed;541}542543static void544mir_lower_inverts(midgard_instruction *ins)545{546bool inv[3] = {547ins->src_invert[0],548ins->src_invert[1],549ins->src_invert[2]550};551552switch (ins->op) {553case midgard_alu_op_iand:554/* a & ~b = iandnot(a, b) */555/* ~a & ~b = ~(a | b) = inor(a, b) */556557if (inv[0] && inv[1])558ins->op = midgard_alu_op_inor;559else if (inv[1])560ins->op = midgard_alu_op_iandnot;561562break;563case midgard_alu_op_ior:564/* a | ~b = iornot(a, b) */565/* ~a | ~b = ~(a & b) = inand(a, b) */566567if (inv[0] && inv[1])568ins->op = midgard_alu_op_inand;569else if (inv[1])570ins->op = midgard_alu_op_iornot;571572break;573574case midgard_alu_op_ixor:575/* ~a ^ b = a ^ ~b = ~(a ^ b) = inxor(a, b) */576/* ~a ^ ~b = a ^ b */577578if (inv[0] ^ inv[1])579ins->op = midgard_alu_op_inxor;580581break;582583default:584break;585}586}587588/* Opcodes with ROUNDS are the base (rte/0) type so we can just add */589590static void591mir_lower_roundmode(midgard_instruction *ins)592{593if (alu_opcode_props[ins->op].props & MIDGARD_ROUNDS) {594assert(ins->roundmode <= 0x3);595ins->op += ins->roundmode;596}597}598599static midgard_load_store_word600load_store_from_instr(midgard_instruction *ins)601{602midgard_load_store_word ldst = ins->load_store;603ldst.op = ins->op;604605if (OP_IS_STORE(ldst.op)) {606ldst.reg = SSA_REG_FROM_FIXED(ins->src[0]) & 1;607} else {608ldst.reg = SSA_REG_FROM_FIXED(ins->dest);609}610611/* Atomic opcode swizzles have a special meaning:612* - The first two bits say which component of the implicit register should be used613* - The next two bits say if the implicit register is r26 or r27 */614if (OP_IS_ATOMIC(ins->op)) {615ldst.swizzle = 0;616ldst.swizzle |= ins->swizzle[3][0] & 3;617ldst.swizzle |= (SSA_REG_FROM_FIXED(ins->src[3]) & 1 ? 1 : 0) << 2;618}619620if (ins->src[1] != ~0) {621ldst.arg_reg = SSA_REG_FROM_FIXED(ins->src[1]) - REGISTER_LDST_BASE;622unsigned sz = nir_alu_type_get_type_size(ins->src_types[1]);623ldst.arg_comp = midgard_ldst_comp(ldst.arg_reg, ins->swizzle[1][0], sz);624}625626if (ins->src[2] != ~0) {627ldst.index_reg = SSA_REG_FROM_FIXED(ins->src[2]) - REGISTER_LDST_BASE;628unsigned sz = nir_alu_type_get_type_size(ins->src_types[2]);629ldst.index_comp = midgard_ldst_comp(ldst.index_reg, ins->swizzle[2][0], sz);630}631632return ldst;633}634635static midgard_texture_word636texture_word_from_instr(midgard_instruction *ins)637{638midgard_texture_word tex = ins->texture;639tex.op = ins->op;640641unsigned src1 = ins->src[1] == ~0 ? REGISTER_UNUSED : SSA_REG_FROM_FIXED(ins->src[1]);642tex.in_reg_select = src1 & 1;643644unsigned dest = ins->dest == ~0 ? REGISTER_UNUSED : SSA_REG_FROM_FIXED(ins->dest);645tex.out_reg_select = dest & 1;646647if (ins->src[2] != ~0) {648midgard_tex_register_select sel = {649.select = SSA_REG_FROM_FIXED(ins->src[2]) & 1,650.full = 1,651.component = ins->swizzle[2][0]652};653uint8_t packed;654memcpy(&packed, &sel, sizeof(packed));655tex.bias = packed;656}657658if (ins->src[3] != ~0) {659unsigned x = ins->swizzle[3][0];660unsigned y = x + 1;661unsigned z = x + 2;662663/* Check range, TODO: half-registers */664assert(z < 4);665666unsigned offset_reg = SSA_REG_FROM_FIXED(ins->src[3]);667tex.offset =668(1) | /* full */669(offset_reg & 1) << 1 | /* select */670(0 << 2) | /* upper */671(x << 3) | /* swizzle */672(y << 5) | /* swizzle */673(z << 7); /* swizzle */674}675676return tex;677}678679static midgard_vector_alu680vector_alu_from_instr(midgard_instruction *ins)681{682midgard_vector_alu alu = {683.op = ins->op,684.outmod = ins->outmod,685.reg_mode = reg_mode_for_bitsize(max_bitsize_for_alu(ins))686};687688if (ins->has_inline_constant) {689/* Encode inline 16-bit constant. See disassembler for690* where the algorithm is from */691692int lower_11 = ins->inline_constant & ((1 << 12) - 1);693uint16_t imm = ((lower_11 >> 8) & 0x7) |694((lower_11 & 0xFF) << 3);695696alu.src2 = imm << 2;697}698699return alu;700}701702static midgard_branch_extended703midgard_create_branch_extended( midgard_condition cond,704midgard_jmp_writeout_op op,705unsigned dest_tag,706signed quadword_offset)707{708/* The condition code is actually a LUT describing a function to709* combine multiple condition codes. However, we only support a single710* condition code at the moment, so we just duplicate over a bunch of711* times. */712713uint16_t duplicated_cond =714(cond << 14) |715(cond << 12) |716(cond << 10) |717(cond << 8) |718(cond << 6) |719(cond << 4) |720(cond << 2) |721(cond << 0);722723midgard_branch_extended branch = {724.op = op,725.dest_tag = dest_tag,726.offset = quadword_offset,727.cond = duplicated_cond728};729730return branch;731}732733static void734emit_branch(midgard_instruction *ins,735compiler_context *ctx,736midgard_block *block,737midgard_bundle *bundle,738struct util_dynarray *emission)739{740/* Parse some basic branch info */741bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT;742bool is_conditional = ins->branch.conditional;743bool is_inverted = ins->branch.invert_conditional;744bool is_discard = ins->branch.target_type == TARGET_DISCARD;745bool is_tilebuf_wait = ins->branch.target_type == TARGET_TILEBUF_WAIT;746bool is_special = is_discard || is_tilebuf_wait;747bool is_writeout = ins->writeout;748749/* Determine the block we're jumping to */750int target_number = ins->branch.target_block;751752/* Report the destination tag */753int dest_tag = is_discard ? 0 :754is_tilebuf_wait ? bundle->tag :755midgard_get_first_tag_from_block(ctx, target_number);756757/* Count up the number of quadwords we're758* jumping over = number of quadwords until759* (br_block_idx, target_number) */760761int quadword_offset = 0;762763if (is_discard) {764/* Fixed encoding, not actually an offset */765quadword_offset = 0x2;766} else if (is_tilebuf_wait) {767quadword_offset = -1;768} else if (target_number > block->base.name) {769/* Jump forward */770771for (int idx = block->base.name+1; idx < target_number; ++idx) {772midgard_block *blk = mir_get_block(ctx, idx);773assert(blk);774775quadword_offset += blk->quadword_count;776}777} else {778/* Jump backwards */779780for (int idx = block->base.name; idx >= target_number; --idx) {781midgard_block *blk = mir_get_block(ctx, idx);782assert(blk);783784quadword_offset -= blk->quadword_count;785}786}787788/* Unconditional extended branches (far jumps)789* have issues, so we always use a conditional790* branch, setting the condition to always for791* unconditional. For compact unconditional792* branches, cond isn't used so it doesn't793* matter what we pick. */794795midgard_condition cond =796!is_conditional ? midgard_condition_always :797is_inverted ? midgard_condition_false :798midgard_condition_true;799800midgard_jmp_writeout_op op =801is_discard ? midgard_jmp_writeout_op_discard :802is_tilebuf_wait ? midgard_jmp_writeout_op_tilebuffer_pending :803is_writeout ? midgard_jmp_writeout_op_writeout :804(is_compact && !is_conditional) ?805midgard_jmp_writeout_op_branch_uncond :806midgard_jmp_writeout_op_branch_cond;807808if (is_compact) {809unsigned size = sizeof(midgard_branch_cond);810811if (is_conditional || is_special) {812midgard_branch_cond branch = {813.op = op,814.dest_tag = dest_tag,815.offset = quadword_offset,816.cond = cond817};818memcpy(util_dynarray_grow_bytes(emission, size, 1), &branch, size);819} else {820assert(op == midgard_jmp_writeout_op_branch_uncond);821midgard_branch_uncond branch = {822.op = op,823.dest_tag = dest_tag,824.offset = quadword_offset,825.unknown = 1826};827assert(branch.offset == quadword_offset);828memcpy(util_dynarray_grow_bytes(emission, size, 1), &branch, size);829}830} else { /* `ins->compact_branch`, misnomer */831unsigned size = sizeof(midgard_branch_extended);832833midgard_branch_extended branch =834midgard_create_branch_extended(835cond, op,836dest_tag,837quadword_offset);838839memcpy(util_dynarray_grow_bytes(emission, size, 1), &branch, size);840}841}842843static void844emit_alu_bundle(compiler_context *ctx,845midgard_block *block,846midgard_bundle *bundle,847struct util_dynarray *emission,848unsigned lookahead)849{850/* Emit the control word */851util_dynarray_append(emission, uint32_t, bundle->control | lookahead);852853/* Next up, emit register words */854for (unsigned i = 0; i < bundle->instruction_count; ++i) {855midgard_instruction *ins = bundle->instructions[i];856857/* Check if this instruction has registers */858if (ins->compact_branch) continue;859860unsigned src2_reg = REGISTER_UNUSED;861if (ins->has_inline_constant)862src2_reg = ins->inline_constant >> 11;863else if (ins->src[1] != ~0)864src2_reg = SSA_REG_FROM_FIXED(ins->src[1]);865866/* Otherwise, just emit the registers */867uint16_t reg_word = 0;868midgard_reg_info registers = {869.src1_reg = (ins->src[0] == ~0 ?870REGISTER_UNUSED :871SSA_REG_FROM_FIXED(ins->src[0])),872.src2_reg = src2_reg,873.src2_imm = ins->has_inline_constant,874.out_reg = (ins->dest == ~0 ?875REGISTER_UNUSED :876SSA_REG_FROM_FIXED(ins->dest)),877};878memcpy(®_word, ®isters, sizeof(uint16_t));879util_dynarray_append(emission, uint16_t, reg_word);880}881882/* Now, we emit the body itself */883for (unsigned i = 0; i < bundle->instruction_count; ++i) {884midgard_instruction *ins = bundle->instructions[i];885886if (!ins->compact_branch) {887mir_lower_inverts(ins);888mir_lower_roundmode(ins);889}890891if (midgard_is_branch_unit(ins->unit)) {892emit_branch(ins, ctx, block, bundle, emission);893} else if (ins->unit & UNITS_ANY_VECTOR) {894midgard_vector_alu source = vector_alu_from_instr(ins);895mir_pack_mask_alu(ins, &source);896mir_pack_vector_srcs(ins, &source);897unsigned size = sizeof(source);898memcpy(util_dynarray_grow_bytes(emission, size, 1), &source, size);899} else {900midgard_scalar_alu source = vector_to_scalar_alu(vector_alu_from_instr(ins), ins);901unsigned size = sizeof(source);902memcpy(util_dynarray_grow_bytes(emission, size, 1), &source, size);903}904}905906/* Emit padding (all zero) */907if (bundle->padding) {908memset(util_dynarray_grow_bytes(emission, bundle->padding, 1),9090, bundle->padding);910}911912/* Tack on constants */913914if (bundle->has_embedded_constants)915util_dynarray_append(emission, midgard_constants, bundle->constants);916}917918/* Shift applied to the immediate used as an offset. Probably this is papering919* over some other semantic distinction else well, but it unifies things in the920* compiler so I don't mind. */921922static void923mir_ldst_pack_offset(midgard_instruction *ins, int offset)924{925/* These opcodes don't support offsets */926assert(!OP_IS_REG2REG_LDST(ins->op) ||927ins->op == midgard_op_lea ||928ins->op == midgard_op_lea_image);929930if (OP_IS_UBO_READ(ins->op))931ins->load_store.signed_offset |= PACK_LDST_UBO_OFS(offset);932else if (OP_IS_IMAGE(ins->op))933ins->load_store.signed_offset |= PACK_LDST_ATTRIB_OFS(offset);934else if (OP_IS_SPECIAL(ins->op))935ins->load_store.signed_offset |= PACK_LDST_SELECTOR_OFS(offset);936else937ins->load_store.signed_offset |= PACK_LDST_MEM_OFS(offset);938}939940static enum mali_sampler_type941midgard_sampler_type(nir_alu_type t) {942switch (nir_alu_type_get_base_type(t))943{944case nir_type_float:945return MALI_SAMPLER_FLOAT;946case nir_type_int:947return MALI_SAMPLER_SIGNED;948case nir_type_uint:949return MALI_SAMPLER_UNSIGNED;950default:951unreachable("Unknown sampler type");952}953}954955/* After everything is scheduled, emit whole bundles at a time */956957void958emit_binary_bundle(compiler_context *ctx,959midgard_block *block,960midgard_bundle *bundle,961struct util_dynarray *emission,962int next_tag)963{964int lookahead = next_tag << 4;965966switch (bundle->tag) {967case TAG_ALU_4:968case TAG_ALU_8:969case TAG_ALU_12:970case TAG_ALU_16:971case TAG_ALU_4 + 4:972case TAG_ALU_8 + 4:973case TAG_ALU_12 + 4:974case TAG_ALU_16 + 4:975emit_alu_bundle(ctx, block, bundle, emission, lookahead);976break;977978case TAG_LOAD_STORE_4: {979/* One or two composing instructions */980981uint64_t current64, next64 = LDST_NOP;982983/* Copy masks */984985for (unsigned i = 0; i < bundle->instruction_count; ++i) {986midgard_instruction *ins = bundle->instructions[i];987mir_pack_ldst_mask(ins);988989/* Atomic ops don't use this swizzle the same way as other ops */990if (!OP_IS_ATOMIC(ins->op))991mir_pack_swizzle_ldst(ins);992993/* Apply a constant offset */994unsigned offset = ins->constants.u32[0];995if (offset)996mir_ldst_pack_offset(ins, offset);997}998999midgard_load_store_word ldst0 =1000load_store_from_instr(bundle->instructions[0]);1001memcpy(¤t64, &ldst0, sizeof(current64));10021003if (bundle->instruction_count == 2) {1004midgard_load_store_word ldst1 =1005load_store_from_instr(bundle->instructions[1]);1006memcpy(&next64, &ldst1, sizeof(next64));1007}10081009midgard_load_store instruction = {1010.type = bundle->tag,1011.next_type = next_tag,1012.word1 = current64,1013.word2 = next641014};10151016util_dynarray_append(emission, midgard_load_store, instruction);10171018break;1019}10201021case TAG_TEXTURE_4:1022case TAG_TEXTURE_4_VTX:1023case TAG_TEXTURE_4_BARRIER: {1024/* Texture instructions are easy, since there is no pipelining1025* nor VLIW to worry about. We may need to set .cont/.last1026* flags. */10271028midgard_instruction *ins = bundle->instructions[0];10291030ins->texture.type = bundle->tag;1031ins->texture.next_type = next_tag;10321033/* Nothing else to pack for barriers */1034if (ins->op == midgard_tex_op_barrier) {1035ins->texture.cont = ins->texture.last = 1;1036ins->texture.op = ins->op;1037util_dynarray_append(emission, midgard_texture_word, ins->texture);1038return;1039}10401041signed override = mir_upper_override(ins, 32);10421043ins->texture.mask = override > 0 ?1044ins->mask >> override :1045ins->mask;10461047mir_pack_swizzle_tex(ins);10481049if (!(ctx->quirks & MIDGARD_NO_OOO))1050mir_pack_tex_ooo(block, bundle, ins);10511052unsigned osz = nir_alu_type_get_type_size(ins->dest_type);1053unsigned isz = nir_alu_type_get_type_size(ins->src_types[1]);10541055assert(osz == 32 || osz == 16);1056assert(isz == 32 || isz == 16);10571058ins->texture.out_full = (osz == 32);1059ins->texture.out_upper = override > 0;1060ins->texture.in_reg_full = (isz == 32);1061ins->texture.sampler_type = midgard_sampler_type(ins->dest_type);1062ins->texture.outmod = ins->outmod;10631064if (mir_op_computes_derivatives(ctx->stage, ins->op)) {1065ins->texture.cont = !ins->helper_terminate;1066ins->texture.last = ins->helper_terminate || ins->helper_execute;1067} else {1068ins->texture.cont = ins->texture.last = 1;1069}10701071midgard_texture_word texture = texture_word_from_instr(ins);1072util_dynarray_append(emission, midgard_texture_word, texture);1073break;1074}10751076default:1077unreachable("Unknown midgard instruction type\n");1078}1079}108010811082