Path: blob/21.2-virgl/src/panfrost/midgard/midgard_compile.c
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]>
 * Copyright (C) 2019-2020 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <err.h>

#include "main/mtypes.h"
#include "compiler/glsl/glsl_to_nir.h"
#include "compiler/nir_types.h"
#include "compiler/nir/nir_builder.h"
#include "util/half_float.h"
#include "util/u_math.h"
#include "util/u_debug.h"
#include "util/u_dynarray.h"
#include "util/list.h"
#include "main/mtypes.h"

#include "midgard.h"
#include "midgard_nir.h"
#include "midgard_compile.h"
#include "midgard_ops.h"
#include "helpers.h"
#include "compiler.h"
#include "midgard_quirks.h"
#include "panfrost-quirks.h"
#include "panfrost/util/pan_lower_framebuffer.h"

#include "disassemble.h"

static const struct debug_named_value midgard_debug_options[] = {
        {"msgs",     MIDGARD_DBG_MSGS,     "Print debug messages"},
        {"shaders",  MIDGARD_DBG_SHADERS,  "Dump shaders in NIR and MIR"},
        {"shaderdb", MIDGARD_DBG_SHADERDB, "Prints shader-db statistics"},
        {"inorder",  MIDGARD_DBG_INORDER,  "Disables out-of-order scheduling"},
        {"verbose",  MIDGARD_DBG_VERBOSE,  "Dump shaders verbosely"},
        {"internal", MIDGARD_DBG_INTERNAL, "Dump internal shaders"},
        DEBUG_NAMED_VALUE_END
};

DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", midgard_debug_options, 0)

int midgard_debug = 0;

#define DBG(fmt, ...) \
        do { if (midgard_debug & MIDGARD_DBG_MSGS) \
                fprintf(stderr, "%s:%d: "fmt, \
                        __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)

static midgard_block *
create_empty_block(compiler_context *ctx)
{
        midgard_block *blk = rzalloc(ctx, midgard_block);

        blk->base.predecessors = _mesa_set_create(blk,
                        _mesa_hash_pointer,
                        _mesa_key_pointer_equal);

        blk->base.name = ctx->block_source_count++;

        return blk;
}

static void
schedule_barrier(compiler_context *ctx)
{
        midgard_block *temp = ctx->after_block;
        ctx->after_block = create_empty_block(ctx);
        ctx->block_count++;
        list_addtail(&ctx->after_block->base.link, &ctx->blocks);
        list_inithead(&ctx->after_block->base.instructions);
        pan_block_add_successor(&ctx->current_block->base, &ctx->after_block->base);
        ctx->current_block = ctx->after_block;
        ctx->after_block = temp;
}

/* Helpers to generate midgard_instruction's using macro magic, since every
 * driver seems to do it that way */

#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__));

#define M_LOAD_STORE(name, store, T) \
        static midgard_instruction m_##name(unsigned ssa, unsigned address) { \
                midgard_instruction i = { \
                        .type = TAG_LOAD_STORE_4, \
                        .mask = 0xF, \
                        .dest = ~0, \
                        .src = { ~0, ~0, ~0, ~0 }, \
                        .swizzle = SWIZZLE_IDENTITY_4, \
                        .op = midgard_op_##name, \
                        .load_store = { \
                                .signed_offset = address \
                        } \
                }; \
                \
                if (store) { \
                        i.src[0] = ssa; \
                        i.src_types[0] = T; \
                        i.dest_type = T; \
                } else { \
                        i.dest = ssa; \
                        i.dest_type = T; \
                } \
                return i; \
        }

#define M_LOAD(name, T) M_LOAD_STORE(name, false, T)
#define M_STORE(name, T) M_LOAD_STORE(name, true, T)

M_LOAD(ld_attr_32, nir_type_uint32);
M_LOAD(ld_vary_32, nir_type_uint32);
M_LOAD(ld_ubo_32, nir_type_uint32);
M_LOAD(ld_ubo_64, nir_type_uint32);
M_LOAD(ld_ubo_128, nir_type_uint32);
M_LOAD(ld_32, nir_type_uint32);
M_LOAD(ld_64, nir_type_uint32);
M_LOAD(ld_128, nir_type_uint32);
M_STORE(st_32, nir_type_uint32);
M_STORE(st_64, nir_type_uint32);
M_STORE(st_128, nir_type_uint32);
M_LOAD(ld_tilebuffer_raw, nir_type_uint32);
M_LOAD(ld_tilebuffer_16f, nir_type_float16);
M_LOAD(ld_tilebuffer_32f, nir_type_float32);
M_STORE(st_vary_32, nir_type_uint32);
M_LOAD(ld_cubemap_coords, nir_type_uint32);
M_LOAD(ldst_mov, nir_type_uint32);
M_LOAD(ld_image_32f, nir_type_float32);
M_LOAD(ld_image_16f, nir_type_float16);
M_LOAD(ld_image_32u, nir_type_uint32);
M_LOAD(ld_image_32i, nir_type_int32);
M_STORE(st_image_32f, nir_type_float32);
M_STORE(st_image_16f, nir_type_float16);
M_STORE(st_image_32u, nir_type_uint32);
M_STORE(st_image_32i, nir_type_int32);
M_LOAD(lea_image, nir_type_uint64);

#define M_IMAGE(op) \
static midgard_instruction \
op ## _image(nir_alu_type type, unsigned val, unsigned address) \
{ \
        switch (type) { \
        case nir_type_float32: \
                 return m_ ## op ## _image_32f(val, address); \
        case nir_type_float16: \
                 return m_ ## op ## _image_16f(val, address); \
        case nir_type_uint32: \
                 return m_ ## op ## _image_32u(val, address); \
        case nir_type_int32: \
                 return m_ ## op ## _image_32i(val, address); \
        default: \
                 unreachable("Invalid image type"); \
        } \
}

M_IMAGE(ld);
M_IMAGE(st);

static midgard_instruction
v_branch(bool conditional, bool invert)
{
        midgard_instruction ins = {
                .type = TAG_ALU_4,
                .unit = ALU_ENAB_BRANCH,
                .compact_branch = true,
                .branch = {
                        .conditional = conditional,
                        .invert_conditional = invert
                },
                .dest = ~0,
                .src =
{ ~0, ~0, ~0, ~0 },195};196197return ins;198}199200static void201attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name)202{203ins->has_constants = true;204memcpy(&ins->constants, constants, 16);205}206207static int208glsl_type_size(const struct glsl_type *type, bool bindless)209{210return glsl_count_attribute_slots(type, false);211}212213/* Lower fdot2 to a vector multiplication followed by channel addition */214static bool215midgard_nir_lower_fdot2_instr(nir_builder *b, nir_instr *instr, void *data)216{217if (instr->type != nir_instr_type_alu)218return false;219220nir_alu_instr *alu = nir_instr_as_alu(instr);221if (alu->op != nir_op_fdot2)222return false;223224b->cursor = nir_before_instr(&alu->instr);225226nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0);227nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1);228229nir_ssa_def *product = nir_fmul(b, src0, src1);230231nir_ssa_def *sum = nir_fadd(b,232nir_channel(b, product, 0),233nir_channel(b, product, 1));234235/* Replace the fdot2 with this sum */236nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, sum);237238return true;239}240241static bool242midgard_nir_lower_fdot2(nir_shader *shader)243{244return nir_shader_instructions_pass(shader,245midgard_nir_lower_fdot2_instr,246nir_metadata_block_index | nir_metadata_dominance,247NULL);248}249250static bool251mdg_is_64(const nir_instr *instr, const void *_unused)252{253const nir_alu_instr *alu = nir_instr_as_alu(instr);254255if (nir_dest_bit_size(alu->dest.dest) == 64)256return true;257258switch (alu->op) {259case nir_op_umul_high:260case nir_op_imul_high:261return true;262default:263return false;264}265}266267/* Only vectorize int64 up to vec2 */268static bool269midgard_vectorize_filter(const nir_instr *instr, void *data)270{271if (instr->type != nir_instr_type_alu)272return true;273274const nir_alu_instr *alu = nir_instr_as_alu(instr);275276unsigned num_components = alu->dest.dest.ssa.num_components;277278int src_bit_size = nir_src_bit_size(alu->src[0].src);279int dst_bit_size = nir_dest_bit_size(alu->dest.dest);280281if (src_bit_size == 64 || dst_bit_size == 64) {282if (num_components > 1)283return false;284}285286return true;287}288289290/* Flushes undefined values to zero */291292static void293optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend)294{295bool progress;296unsigned lower_flrp =297(nir->options->lower_flrp16 ? 16 : 0) |298(nir->options->lower_flrp32 ? 32 : 0) |299(nir->options->lower_flrp64 ? 64 : 0);300301NIR_PASS(progress, nir, nir_lower_regs_to_ssa);302nir_lower_idiv_options idiv_options = {303.imprecise_32bit_lowering = true,304.allow_fp16 = true,305};306NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);307308nir_lower_tex_options lower_tex_options = {309.lower_txs_lod = true,310.lower_txp = ~0,311.lower_tg4_broadcom_swizzle = true,312/* TODO: we have native gradient.. */313.lower_txd = true,314};315316NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);317318/* Must lower fdot2 after tex is lowered */319NIR_PASS(progress, nir, midgard_nir_lower_fdot2);320321/* T720 is broken. 
*/322323if (quirks & MIDGARD_BROKEN_LOD)324NIR_PASS_V(nir, midgard_nir_lod_errata);325326/* Midgard image ops coordinates are 16-bit instead of 32-bit */327NIR_PASS(progress, nir, midgard_nir_lower_image_bitsize);328NIR_PASS(progress, nir, midgard_nir_lower_helper_writes);329NIR_PASS(progress, nir, pan_lower_helper_invocation);330NIR_PASS(progress, nir, pan_lower_sample_pos);331332NIR_PASS(progress, nir, midgard_nir_lower_algebraic_early);333334do {335progress = false;336337NIR_PASS(progress, nir, nir_lower_var_copies);338NIR_PASS(progress, nir, nir_lower_vars_to_ssa);339340NIR_PASS(progress, nir, nir_copy_prop);341NIR_PASS(progress, nir, nir_opt_remove_phis);342NIR_PASS(progress, nir, nir_opt_dce);343NIR_PASS(progress, nir, nir_opt_dead_cf);344NIR_PASS(progress, nir, nir_opt_cse);345NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);346NIR_PASS(progress, nir, nir_opt_algebraic);347NIR_PASS(progress, nir, nir_opt_constant_folding);348349if (lower_flrp != 0) {350bool lower_flrp_progress = false;351NIR_PASS(lower_flrp_progress,352nir,353nir_lower_flrp,354lower_flrp,355false /* always_precise */);356if (lower_flrp_progress) {357NIR_PASS(progress, nir,358nir_opt_constant_folding);359progress = true;360}361362/* Nothing should rematerialize any flrps, so we only363* need to do this lowering once.364*/365lower_flrp = 0;366}367368NIR_PASS(progress, nir, nir_opt_undef);369NIR_PASS(progress, nir, nir_lower_undef_to_zero);370371NIR_PASS(progress, nir, nir_opt_loop_unroll,372nir_var_shader_in |373nir_var_shader_out |374nir_var_function_temp);375376NIR_PASS(progress, nir, nir_opt_vectorize,377midgard_vectorize_filter, NULL);378} while (progress);379380NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_is_64, NULL);381382/* Run after opts so it can hit more */383if (!is_blend)384NIR_PASS(progress, nir, nir_fuse_io_16);385386/* Must be run at the end to prevent creation of fsin/fcos ops */387NIR_PASS(progress, nir, midgard_nir_scale_trig);388389do {390progress = false;391392NIR_PASS(progress, nir, nir_opt_dce);393NIR_PASS(progress, nir, nir_opt_algebraic);394NIR_PASS(progress, nir, nir_opt_constant_folding);395NIR_PASS(progress, nir, nir_copy_prop);396} while (progress);397398NIR_PASS(progress, nir, nir_opt_algebraic_late);399NIR_PASS(progress, nir, nir_opt_algebraic_distribute_src_mods);400401/* We implement booleans as 32-bit 0/~0 */402NIR_PASS(progress, nir, nir_lower_bool_to_int32);403404/* Now that booleans are lowered, we can run out late opts */405NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late);406NIR_PASS(progress, nir, midgard_nir_cancel_inot);407408NIR_PASS(progress, nir, nir_copy_prop);409NIR_PASS(progress, nir, nir_opt_dce);410411/* Backend scheduler is purely local, so do some global optimizations412* to reduce register pressure. 
*/413nir_move_options move_all =414nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |415nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;416417NIR_PASS_V(nir, nir_opt_sink, move_all);418NIR_PASS_V(nir, nir_opt_move, move_all);419420/* Take us out of SSA */421NIR_PASS(progress, nir, nir_lower_locals_to_regs);422NIR_PASS(progress, nir, nir_convert_from_ssa, true);423424/* We are a vector architecture; write combine where possible */425NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);426NIR_PASS(progress, nir, nir_lower_vec_to_movs, NULL, NULL);427428NIR_PASS(progress, nir, nir_opt_dce);429}430431/* Do not actually emit a load; instead, cache the constant for inlining */432433static void434emit_load_const(compiler_context *ctx, nir_load_const_instr *instr)435{436nir_ssa_def def = instr->def;437438midgard_constants *consts = rzalloc(ctx, midgard_constants);439440assert(instr->def.num_components * instr->def.bit_size <= sizeof(*consts) * 8);441442#define RAW_CONST_COPY(bits) \443nir_const_value_to_array(consts->u##bits, instr->value, \444instr->def.num_components, u##bits)445446switch (instr->def.bit_size) {447case 64:448RAW_CONST_COPY(64);449break;450case 32:451RAW_CONST_COPY(32);452break;453case 16:454RAW_CONST_COPY(16);455break;456case 8:457RAW_CONST_COPY(8);458break;459default:460unreachable("Invalid bit_size for load_const instruction\n");461}462463/* Shifted for SSA, +1 for off-by-one */464_mesa_hash_table_u64_insert(ctx->ssa_constants, (def.index << 1) + 1, consts);465}466467/* Normally constants are embedded implicitly, but for I/O and such we have to468* explicitly emit a move with the constant source */469470static void471emit_explicit_constant(compiler_context *ctx, unsigned node, unsigned to)472{473void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, node + 1);474475if (constant_value) {476midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), to);477attach_constants(ctx, &ins, constant_value, node + 1);478emit_mir_instruction(ctx, ins);479}480}481482static bool483nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components)484{485unsigned comp = src->swizzle[0];486487for (unsigned c = 1; c < nr_components; ++c) {488if (src->swizzle[c] != comp)489return true;490}491492return false;493}494495#define ATOMIC_CASE_IMPL(ctx, instr, nir, op, is_shared) \496case nir_intrinsic_##nir: \497emit_atomic(ctx, instr, is_shared, midgard_op_##op, ~0); \498break;499500#define ATOMIC_CASE(ctx, instr, nir, op) \501ATOMIC_CASE_IMPL(ctx, instr, shared_atomic_##nir, atomic_##op, true); \502ATOMIC_CASE_IMPL(ctx, instr, global_atomic_##nir, atomic_##op, false);503504#define IMAGE_ATOMIC_CASE(ctx, instr, nir, op) \505case nir_intrinsic_image_atomic_##nir: { \506midgard_instruction ins = emit_image_op(ctx, instr, true); \507emit_atomic(ctx, instr, false, midgard_op_atomic_##op, ins.dest); \508break; \509}510511#define ALU_CASE(nir, _op) \512case nir_op_##nir: \513op = midgard_alu_op_##_op; \514assert(src_bitsize == dst_bitsize); \515break;516517#define ALU_CASE_RTZ(nir, _op) \518case nir_op_##nir: \519op = midgard_alu_op_##_op; \520roundmode = MIDGARD_RTZ; \521break;522523#define ALU_CHECK_CMP() \524assert(src_bitsize == 16 || src_bitsize == 32 || src_bitsize == 64); \525assert(dst_bitsize == 16 || dst_bitsize == 32); \526527#define ALU_CASE_BCAST(nir, _op, count) \528case nir_op_##nir: \529op = midgard_alu_op_##_op; \530broadcast_swizzle = count; \531ALU_CHECK_CMP(); \532break;533534#define ALU_CASE_CMP(nir, _op) \535case nir_op_##nir: 
\536op = midgard_alu_op_##_op; \537ALU_CHECK_CMP(); \538break;539540/* Compare mir_lower_invert */541static bool542nir_accepts_inot(nir_op op, unsigned src)543{544switch (op) {545case nir_op_ior:546case nir_op_iand: /* TODO: b2f16 */547case nir_op_ixor:548return true;549case nir_op_b32csel:550/* Only the condition */551return (src == 0);552default:553return false;554}555}556557static bool558mir_accept_dest_mod(compiler_context *ctx, nir_dest **dest, nir_op op)559{560if (pan_has_dest_mod(dest, op)) {561assert((*dest)->is_ssa);562BITSET_SET(ctx->already_emitted, (*dest)->ssa.index);563return true;564}565566return false;567}568569/* Look for floating point mods. We have the mods clamp_m1_1, clamp_0_1,570* and clamp_0_inf. We also have the relations (note 3 * 2 = 6 cases):571*572* clamp_0_1(clamp_0_inf(x)) = clamp_m1_1(x)573* clamp_0_1(clamp_m1_1(x)) = clamp_m1_1(x)574* clamp_0_inf(clamp_0_1(x)) = clamp_m1_1(x)575* clamp_0_inf(clamp_m1_1(x)) = clamp_m1_1(x)576* clamp_m1_1(clamp_0_1(x)) = clamp_m1_1(x)577* clamp_m1_1(clamp_0_inf(x)) = clamp_m1_1(x)578*579* So by cases any composition of output modifiers is equivalent to580* clamp_m1_1 alone.581*/582static unsigned583mir_determine_float_outmod(compiler_context *ctx, nir_dest **dest, unsigned prior_outmod)584{585bool clamp_0_inf = mir_accept_dest_mod(ctx, dest, nir_op_fclamp_pos_mali);586bool clamp_0_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat);587bool clamp_m1_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat_signed_mali);588bool prior = (prior_outmod != midgard_outmod_none);589int count = (int) prior + (int) clamp_0_inf + (int) clamp_0_1 + (int) clamp_m1_1;590591return ((count > 1) || clamp_0_1) ? midgard_outmod_clamp_0_1 :592clamp_0_inf ? midgard_outmod_clamp_0_inf :593clamp_m1_1 ? midgard_outmod_clamp_m1_1 :594prior_outmod;595}596597static void598mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigned to, bool *abs, bool *neg, bool *not, enum midgard_roundmode *roundmode, bool is_int, unsigned bcast_count)599{600nir_alu_src src = instr->src[i];601602if (!is_int) {603if (pan_has_source_mod(&src, nir_op_fneg))604*neg = !(*neg);605606if (pan_has_source_mod(&src, nir_op_fabs))607*abs = true;608}609610if (nir_accepts_inot(instr->op, i) && pan_has_source_mod(&src, nir_op_inot))611*not = true;612613if (roundmode) {614if (pan_has_source_mod(&src, nir_op_fround_even))615*roundmode = MIDGARD_RTE;616617if (pan_has_source_mod(&src, nir_op_ftrunc))618*roundmode = MIDGARD_RTZ;619620if (pan_has_source_mod(&src, nir_op_ffloor))621*roundmode = MIDGARD_RTN;622623if (pan_has_source_mod(&src, nir_op_fceil))624*roundmode = MIDGARD_RTP;625}626627unsigned bits = nir_src_bit_size(src.src);628629ins->src[to] = nir_src_index(NULL, &src.src);630ins->src_types[to] = nir_op_infos[instr->op].input_types[i] | bits;631632for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) {633ins->swizzle[to][c] = src.swizzle[634(!bcast_count || c < bcast_count) ? c :635(bcast_count - 1)];636}637}638639/* Midgard features both fcsel and icsel, depending on whether you want int or640* float modifiers. NIR's csel is typeless, so we want a heuristic to guess if641* we should emit an int or float csel depending on what modifiers could be642* placed. In the absense of modifiers, this is probably arbitrary. 
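*
* As a rough illustration of the scoring below (hypothetical NIR, not
* taken from a real shader): b32csel(c, fabs(x), y) picks up +1 from the
* fabs source modifier and is emitted as fcsel, whereas
* b32csel(c, i2i32(x), y) picks up -1 from the integer conversion and is
* emitted as icsel; a zero score also falls back to icsel.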
*/643644static bool645mir_is_bcsel_float(nir_alu_instr *instr)646{647nir_op intmods[] = {648nir_op_i2i8, nir_op_i2i16,649nir_op_i2i32, nir_op_i2i64650};651652nir_op floatmods[] = {653nir_op_fabs, nir_op_fneg,654nir_op_f2f16, nir_op_f2f32,655nir_op_f2f64656};657658nir_op floatdestmods[] = {659nir_op_fsat, nir_op_fsat_signed_mali, nir_op_fclamp_pos_mali,660nir_op_f2f16, nir_op_f2f32661};662663signed score = 0;664665for (unsigned i = 1; i < 3; ++i) {666nir_alu_src s = instr->src[i];667for (unsigned q = 0; q < ARRAY_SIZE(intmods); ++q) {668if (pan_has_source_mod(&s, intmods[q]))669score--;670}671}672673for (unsigned i = 1; i < 3; ++i) {674nir_alu_src s = instr->src[i];675for (unsigned q = 0; q < ARRAY_SIZE(floatmods); ++q) {676if (pan_has_source_mod(&s, floatmods[q]))677score++;678}679}680681for (unsigned q = 0; q < ARRAY_SIZE(floatdestmods); ++q) {682nir_dest *dest = &instr->dest.dest;683if (pan_has_dest_mod(&dest, floatdestmods[q]))684score++;685}686687return (score > 0);688}689690static void691emit_alu(compiler_context *ctx, nir_alu_instr *instr)692{693nir_dest *dest = &instr->dest.dest;694695if (dest->is_ssa && BITSET_TEST(ctx->already_emitted, dest->ssa.index))696return;697698/* Derivatives end up emitted on the texture pipe, not the ALUs. This699* is handled elsewhere */700701if (instr->op == nir_op_fddx || instr->op == nir_op_fddy) {702midgard_emit_derivatives(ctx, instr);703return;704}705706bool is_ssa = dest->is_ssa;707708unsigned nr_components = nir_dest_num_components(*dest);709unsigned nr_inputs = nir_op_infos[instr->op].num_inputs;710unsigned op = 0;711712/* Number of components valid to check for the instruction (the rest713* will be forced to the last), or 0 to use as-is. Relevant as714* ball-type instructions have a channel count in NIR but are all vec4715* in Midgard */716717unsigned broadcast_swizzle = 0;718719/* Should we swap arguments? */720bool flip_src12 = false;721722ASSERTED unsigned src_bitsize = nir_src_bit_size(instr->src[0].src);723ASSERTED unsigned dst_bitsize = nir_dest_bit_size(*dest);724725enum midgard_roundmode roundmode = MIDGARD_RTE;726727switch (instr->op) {728ALU_CASE(fadd, fadd);729ALU_CASE(fmul, fmul);730ALU_CASE(fmin, fmin);731ALU_CASE(fmax, fmax);732ALU_CASE(imin, imin);733ALU_CASE(imax, imax);734ALU_CASE(umin, umin);735ALU_CASE(umax, umax);736ALU_CASE(ffloor, ffloor);737ALU_CASE(fround_even, froundeven);738ALU_CASE(ftrunc, ftrunc);739ALU_CASE(fceil, fceil);740ALU_CASE(fdot3, fdot3);741ALU_CASE(fdot4, fdot4);742ALU_CASE(iadd, iadd);743ALU_CASE(isub, isub);744ALU_CASE(iadd_sat, iaddsat);745ALU_CASE(isub_sat, isubsat);746ALU_CASE(uadd_sat, uaddsat);747ALU_CASE(usub_sat, usubsat);748ALU_CASE(imul, imul);749ALU_CASE(imul_high, imul);750ALU_CASE(umul_high, imul);751ALU_CASE(uclz, iclz);752753/* Zero shoved as second-arg */754ALU_CASE(iabs, iabsdiff);755756ALU_CASE(uabs_isub, iabsdiff);757ALU_CASE(uabs_usub, uabsdiff);758759ALU_CASE(mov, imov);760761ALU_CASE_CMP(feq32, feq);762ALU_CASE_CMP(fneu32, fne);763ALU_CASE_CMP(flt32, flt);764ALU_CASE_CMP(ieq32, ieq);765ALU_CASE_CMP(ine32, ine);766ALU_CASE_CMP(ilt32, ilt);767ALU_CASE_CMP(ult32, ult);768769/* We don't have a native b2f32 instruction. Instead, like many770* GPUs, we exploit booleans as 0/~0 for false/true, and771* correspondingly AND772* by 1.0 to do the type conversion. 
For the moment, prime us773* to emit:774*775* iand [whatever], #0776*777* At the end of emit_alu (as MIR), we'll fix-up the constant778*/779780ALU_CASE_CMP(b2f32, iand);781ALU_CASE_CMP(b2f16, iand);782ALU_CASE_CMP(b2i32, iand);783784/* Likewise, we don't have a dedicated f2b32 instruction, but785* we can do a "not equal to 0.0" test. */786787ALU_CASE_CMP(f2b32, fne);788ALU_CASE_CMP(i2b32, ine);789790ALU_CASE(frcp, frcp);791ALU_CASE(frsq, frsqrt);792ALU_CASE(fsqrt, fsqrt);793ALU_CASE(fexp2, fexp2);794ALU_CASE(flog2, flog2);795796ALU_CASE_RTZ(f2i64, f2i_rte);797ALU_CASE_RTZ(f2u64, f2u_rte);798ALU_CASE_RTZ(i2f64, i2f_rte);799ALU_CASE_RTZ(u2f64, u2f_rte);800801ALU_CASE_RTZ(f2i32, f2i_rte);802ALU_CASE_RTZ(f2u32, f2u_rte);803ALU_CASE_RTZ(i2f32, i2f_rte);804ALU_CASE_RTZ(u2f32, u2f_rte);805806ALU_CASE_RTZ(f2i8, f2i_rte);807ALU_CASE_RTZ(f2u8, f2u_rte);808809ALU_CASE_RTZ(f2i16, f2i_rte);810ALU_CASE_RTZ(f2u16, f2u_rte);811ALU_CASE_RTZ(i2f16, i2f_rte);812ALU_CASE_RTZ(u2f16, u2f_rte);813814ALU_CASE(fsin, fsinpi);815ALU_CASE(fcos, fcospi);816817/* We'll get 0 in the second arg, so:818* ~a = ~(a | 0) = nor(a, 0) */819ALU_CASE(inot, inor);820ALU_CASE(iand, iand);821ALU_CASE(ior, ior);822ALU_CASE(ixor, ixor);823ALU_CASE(ishl, ishl);824ALU_CASE(ishr, iasr);825ALU_CASE(ushr, ilsr);826827ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2);828ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3);829ALU_CASE_CMP(b32all_fequal4, fball_eq);830831ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2);832ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3);833ALU_CASE_CMP(b32any_fnequal4, fbany_neq);834835ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2);836ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3);837ALU_CASE_CMP(b32all_iequal4, iball_eq);838839ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2);840ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3);841ALU_CASE_CMP(b32any_inequal4, ibany_neq);842843/* Source mods will be shoved in later */844ALU_CASE(fabs, fmov);845ALU_CASE(fneg, fmov);846ALU_CASE(fsat, fmov);847ALU_CASE(fsat_signed_mali, fmov);848ALU_CASE(fclamp_pos_mali, fmov);849850/* For size conversion, we use a move. Ideally though we would squash851* these ops together; maybe that has to happen after in NIR as part of852* propagation...? An earlier algebraic pass ensured we step down by853* only / exactly one size. If stepping down, we use a dest override to854* reduce the size; if stepping up, we use a larger-sized move with a855* half source and a sign/zero-extension modifier */856857case nir_op_i2i8:858case nir_op_i2i16:859case nir_op_i2i32:860case nir_op_i2i64:861case nir_op_u2u8:862case nir_op_u2u16:863case nir_op_u2u32:864case nir_op_u2u64:865case nir_op_f2f16:866case nir_op_f2f32:867case nir_op_f2f64: {868if (instr->op == nir_op_f2f16 || instr->op == nir_op_f2f32 ||869instr->op == nir_op_f2f64)870op = midgard_alu_op_fmov;871else872op = midgard_alu_op_imov;873874break;875}876877/* For greater-or-equal, we lower to less-or-equal and flip the878* arguments */879880case nir_op_fge:881case nir_op_fge32:882case nir_op_ige32:883case nir_op_uge32: {884op =885instr->op == nir_op_fge ? midgard_alu_op_fle :886instr->op == nir_op_fge32 ? midgard_alu_op_fle :887instr->op == nir_op_ige32 ? midgard_alu_op_ile :888instr->op == nir_op_uge32 ? midgard_alu_op_ule :8890;890891flip_src12 = true;892ALU_CHECK_CMP();893break;894}895896case nir_op_b32csel: {897bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components);898bool is_float = mir_is_bcsel_float(instr);899op = is_float ?900(mixed ? midgard_alu_op_fcsel_v : midgard_alu_op_fcsel) :901(mixed ? 
midgard_alu_op_icsel_v : midgard_alu_op_icsel);902903break;904}905906case nir_op_unpack_32_2x16:907case nir_op_unpack_32_4x8:908case nir_op_pack_32_2x16:909case nir_op_pack_32_4x8: {910op = midgard_alu_op_imov;911break;912}913914default:915DBG("Unhandled ALU op %s\n", nir_op_infos[instr->op].name);916assert(0);917return;918}919920/* Promote imov to fmov if it might help inline a constant */921if (op == midgard_alu_op_imov && nir_src_is_const(instr->src[0].src)922&& nir_src_bit_size(instr->src[0].src) == 32923&& nir_is_same_comp_swizzle(instr->src[0].swizzle,924nir_src_num_components(instr->src[0].src))) {925op = midgard_alu_op_fmov;926}927928/* Midgard can perform certain modifiers on output of an ALU op */929930unsigned outmod = 0;931bool is_int = midgard_is_integer_op(op);932933if (instr->op == nir_op_umul_high || instr->op == nir_op_imul_high) {934outmod = midgard_outmod_keephi;935} else if (midgard_is_integer_out_op(op)) {936outmod = midgard_outmod_keeplo;937} else if (instr->op == nir_op_fsat) {938outmod = midgard_outmod_clamp_0_1;939} else if (instr->op == nir_op_fsat_signed_mali) {940outmod = midgard_outmod_clamp_m1_1;941} else if (instr->op == nir_op_fclamp_pos_mali) {942outmod = midgard_outmod_clamp_0_inf;943}944945/* Fetch unit, quirks, etc information */946unsigned opcode_props = alu_opcode_props[op].props;947bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24;948949if (!midgard_is_integer_out_op(op)) {950outmod = mir_determine_float_outmod(ctx, &dest, outmod);951}952953midgard_instruction ins = {954.type = TAG_ALU_4,955.dest = nir_dest_index(dest),956.dest_type = nir_op_infos[instr->op].output_type957| nir_dest_bit_size(*dest),958.roundmode = roundmode,959};960961enum midgard_roundmode *roundptr = (opcode_props & MIDGARD_ROUNDS) ?962&ins.roundmode : NULL;963964for (unsigned i = nr_inputs; i < ARRAY_SIZE(ins.src); ++i)965ins.src[i] = ~0;966967if (quirk_flipped_r24) {968ins.src[0] = ~0;969mir_copy_src(&ins, instr, 0, 1, &ins.src_abs[1], &ins.src_neg[1], &ins.src_invert[1], roundptr, is_int, broadcast_swizzle);970} else {971for (unsigned i = 0; i < nr_inputs; ++i) {972unsigned to = i;973974if (instr->op == nir_op_b32csel) {975/* The condition is the first argument; move976* the other arguments up one to be a binary977* instruction for Midgard with the condition978* last */979980if (i == 0)981to = 2;982else if (flip_src12)983to = 2 - i;984else985to = i - 1;986} else if (flip_src12) {987to = 1 - to;988}989990mir_copy_src(&ins, instr, i, to, &ins.src_abs[to], &ins.src_neg[to], &ins.src_invert[to], roundptr, is_int, broadcast_swizzle);991992/* (!c) ? a : b = c ? b : a */993if (instr->op == nir_op_b32csel && ins.src_invert[2]) {994ins.src_invert[2] = false;995flip_src12 ^= true;996}997}998}9991000if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) {1001/* Lowered to move */1002if (instr->op == nir_op_fneg)1003ins.src_neg[1] ^= true;10041005if (instr->op == nir_op_fabs)1006ins.src_abs[1] = true;1007}10081009ins.mask = mask_of(nr_components);10101011/* Apply writemask if non-SSA, keeping in mind that we can't write to1012* components that don't exist. 
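* (For instance, with made-up numbers: a vec4 ALU op writing a NIR
* register with write_mask 0x5 ends up with ins.mask = 0xF & 0x5 = 0x5,
* so only the x and z components are written.)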
Note modifier => SSA => !reg => no1013* writemask, so we don't have to worry about writemasks here.*/10141015if (!is_ssa)1016ins.mask &= instr->dest.write_mask;10171018ins.op = op;1019ins.outmod = outmod;10201021/* Late fixup for emulated instructions */10221023if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) {1024/* Presently, our second argument is an inline #0 constant.1025* Switch over to an embedded 1.0 constant (that can't fit1026* inline, since we're 32-bit, not 16-bit like the inline1027* constants) */10281029ins.has_inline_constant = false;1030ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);1031ins.src_types[1] = nir_type_float32;1032ins.has_constants = true;10331034if (instr->op == nir_op_b2f32)1035ins.constants.f32[0] = 1.0f;1036else1037ins.constants.i32[0] = 1;10381039for (unsigned c = 0; c < 16; ++c)1040ins.swizzle[1][c] = 0;1041} else if (instr->op == nir_op_b2f16) {1042ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);1043ins.src_types[1] = nir_type_float16;1044ins.has_constants = true;1045ins.constants.i16[0] = _mesa_float_to_half(1.0);10461047for (unsigned c = 0; c < 16; ++c)1048ins.swizzle[1][c] = 0;1049} else if (nr_inputs == 1 && !quirk_flipped_r24) {1050/* Lots of instructions need a 0 plonked in */1051ins.has_inline_constant = false;1052ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);1053ins.src_types[1] = ins.src_types[0];1054ins.has_constants = true;1055ins.constants.u32[0] = 0;10561057for (unsigned c = 0; c < 16; ++c)1058ins.swizzle[1][c] = 0;1059} else if (instr->op == nir_op_pack_32_2x16) {1060ins.dest_type = nir_type_uint16;1061ins.mask = mask_of(nr_components * 2);1062ins.is_pack = true;1063} else if (instr->op == nir_op_pack_32_4x8) {1064ins.dest_type = nir_type_uint8;1065ins.mask = mask_of(nr_components * 4);1066ins.is_pack = true;1067} else if (instr->op == nir_op_unpack_32_2x16) {1068ins.dest_type = nir_type_uint32;1069ins.mask = mask_of(nr_components >> 1);1070ins.is_pack = true;1071} else if (instr->op == nir_op_unpack_32_4x8) {1072ins.dest_type = nir_type_uint32;1073ins.mask = mask_of(nr_components >> 2);1074ins.is_pack = true;1075}10761077if ((opcode_props & UNITS_ALL) == UNIT_VLUT) {1078/* To avoid duplicating the lookup tables (probably), true LUT1079* instructions can only operate as if they were scalars. Lower1080* them here by changing the component. 
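*
* As a sketch (made-up shader, purely illustrative): a vec3 frcp whose
* source swizzle is .xxy splits into two instructions, one covering the
* two destination components that read .x (mask 0b011) and one covering
* the component that reads .y (mask 0b100), each with its source
* swizzle broadcast to that single component.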
*/10811082unsigned orig_mask = ins.mask;10831084unsigned swizzle_back[MIR_VEC_COMPONENTS];1085memcpy(&swizzle_back, ins.swizzle[0], sizeof(swizzle_back));10861087midgard_instruction ins_split[MIR_VEC_COMPONENTS];1088unsigned ins_count = 0;10891090for (int i = 0; i < nr_components; ++i) {1091/* Mask the associated component, dropping the1092* instruction if needed */10931094ins.mask = 1 << i;1095ins.mask &= orig_mask;10961097for (unsigned j = 0; j < ins_count; ++j) {1098if (swizzle_back[i] == ins_split[j].swizzle[0][0]) {1099ins_split[j].mask |= ins.mask;1100ins.mask = 0;1101break;1102}1103}11041105if (!ins.mask)1106continue;11071108for (unsigned j = 0; j < MIR_VEC_COMPONENTS; ++j)1109ins.swizzle[0][j] = swizzle_back[i]; /* Pull from the correct component */11101111ins_split[ins_count] = ins;11121113++ins_count;1114}11151116for (unsigned i = 0; i < ins_count; ++i) {1117emit_mir_instruction(ctx, ins_split[i]);1118}1119} else {1120emit_mir_instruction(ctx, ins);1121}1122}11231124#undef ALU_CASE11251126static void1127mir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read)1128{1129nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);1130unsigned nir_mask = 0;1131unsigned dsize = 0;11321133if (is_read) {1134nir_mask = mask_of(nir_intrinsic_dest_components(intr));1135dsize = nir_dest_bit_size(intr->dest);1136} else {1137nir_mask = nir_intrinsic_write_mask(intr);1138dsize = 32;1139}11401141/* Once we have the NIR mask, we need to normalize to work in 32-bit space */1142unsigned bytemask = pan_to_bytemask(dsize, nir_mask);1143ins->dest_type = nir_type_uint | dsize;1144mir_set_bytemask(ins, bytemask);1145}11461147/* Uniforms and UBOs use a shared code path, as uniforms are just (slightly1148* optimized) versions of UBO #0 */11491150static midgard_instruction *1151emit_ubo_read(1152compiler_context *ctx,1153nir_instr *instr,1154unsigned dest,1155unsigned offset,1156nir_src *indirect_offset,1157unsigned indirect_shift,1158unsigned index,1159unsigned nr_comps)1160{1161midgard_instruction ins;11621163unsigned dest_size = (instr->type == nir_instr_type_intrinsic) ?1164nir_dest_bit_size(nir_instr_as_intrinsic(instr)->dest) : 32;11651166unsigned bitsize = dest_size * nr_comps;11671168/* Pick the smallest intrinsic to avoid out-of-bounds reads */1169if (bitsize <= 32)1170ins = m_ld_ubo_32(dest, 0);1171else if (bitsize <= 64)1172ins = m_ld_ubo_64(dest, 0);1173else if (bitsize <= 128)1174ins = m_ld_ubo_128(dest, 0);1175else1176unreachable("Invalid UBO read size");11771178ins.constants.u32[0] = offset;11791180if (instr->type == nir_instr_type_intrinsic)1181mir_set_intr_mask(instr, &ins, true);11821183if (indirect_offset) {1184ins.src[2] = nir_src_index(ctx, indirect_offset);1185ins.src_types[2] = nir_type_uint32;1186ins.load_store.index_shift = indirect_shift;11871188/* X component for the whole swizzle to prevent register1189* pressure from ballooning from the extra components */1190for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[2]); ++i)1191ins.swizzle[2][i] = 0;1192} else {1193ins.load_store.index_reg = REGISTER_LDST_ZERO;1194}11951196if (indirect_offset && indirect_offset->is_ssa && !indirect_shift)1197mir_set_ubo_offset(&ins, indirect_offset, offset);11981199midgard_pack_ubo_index_imm(&ins.load_store, index);12001201return emit_mir_instruction(ctx, ins);1202}12031204/* Globals are like UBOs if you squint. 
And shared memory is like globals if1205* you squint even harder */12061207static void1208emit_global(1209compiler_context *ctx,1210nir_instr *instr,1211bool is_read,1212unsigned srcdest,1213nir_src *offset,1214unsigned seg)1215{1216midgard_instruction ins;12171218nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);1219if (is_read) {1220unsigned bitsize = nir_dest_bit_size(intr->dest) *1221nir_dest_num_components(intr->dest);12221223if (bitsize <= 32)1224ins = m_ld_32(srcdest, 0);1225else if (bitsize <= 64)1226ins = m_ld_64(srcdest, 0);1227else if (bitsize <= 128)1228ins = m_ld_128(srcdest, 0);1229else1230unreachable("Invalid global read size");1231} else {1232unsigned bitsize = nir_src_bit_size(intr->src[0]) *1233nir_src_num_components(intr->src[0]);12341235if (bitsize <= 32)1236ins = m_st_32(srcdest, 0);1237else if (bitsize <= 64)1238ins = m_st_64(srcdest, 0);1239else if (bitsize <= 128)1240ins = m_st_128(srcdest, 0);1241else1242unreachable("Invalid global store size");1243}12441245mir_set_offset(ctx, &ins, offset, seg);1246mir_set_intr_mask(instr, &ins, is_read);12471248/* Set a valid swizzle for masked out components */1249assert(ins.mask);1250unsigned first_component = __builtin_ffs(ins.mask) - 1;12511252for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) {1253if (!(ins.mask & (1 << i)))1254ins.swizzle[0][i] = first_component;1255}12561257emit_mir_instruction(ctx, ins);1258}12591260/* If is_shared is off, the only other possible value are globals, since1261* SSBO's are being lowered to globals through a NIR pass.1262* `image_direct_address` should be ~0 when instr is not an image_atomic1263* and the destination register of a lea_image op when it is an image_atomic. */1264static void1265emit_atomic(1266compiler_context *ctx,1267nir_intrinsic_instr *instr,1268bool is_shared,1269midgard_load_store_op op,1270unsigned image_direct_address)1271{1272nir_alu_type type =1273(op == midgard_op_atomic_imin || op == midgard_op_atomic_imax) ?1274nir_type_int : nir_type_uint;12751276bool is_image = image_direct_address != ~0;12771278unsigned dest = nir_dest_index(&instr->dest);1279unsigned val_src = is_image ? 3 : 1;1280unsigned val = nir_src_index(ctx, &instr->src[val_src]);1281unsigned bitsize = nir_src_bit_size(instr->src[val_src]);1282emit_explicit_constant(ctx, val, val);12831284midgard_instruction ins = {1285.type = TAG_LOAD_STORE_4,1286.mask = 0xF,1287.dest = dest,1288.src = { ~0, ~0, ~0, val },1289.src_types = { 0, 0, 0, type | bitsize },1290.op = op1291};12921293nir_src *src_offset = nir_get_io_offset_src(instr);12941295if (op == midgard_op_atomic_cmpxchg) {1296unsigned xchg_val_src = is_image ? 4 : 2;1297unsigned xchg_val = nir_src_index(ctx, &instr->src[xchg_val_src]);1298emit_explicit_constant(ctx, xchg_val, xchg_val);12991300ins.src[2] = val;1301ins.src_types[2] = type | bitsize;1302ins.src[3] = xchg_val;13031304if (is_shared) {1305ins.load_store.arg_reg = REGISTER_LDST_LOCAL_STORAGE_PTR;1306ins.load_store.arg_comp = COMPONENT_Z;1307ins.load_store.bitsize_toggle = true;1308} else {1309for(unsigned i = 0; i < 2; ++i)1310ins.swizzle[1][i] = i;13111312ins.src[1] = is_image ? 
image_direct_address :1313nir_src_index(ctx, src_offset);1314ins.src_types[1] = nir_type_uint64;1315}1316} else if (is_image) {1317for(unsigned i = 0; i < 2; ++i)1318ins.swizzle[2][i] = i;13191320ins.src[2] = image_direct_address;1321ins.src_types[2] = nir_type_uint64;13221323ins.load_store.arg_reg = REGISTER_LDST_ZERO;1324ins.load_store.bitsize_toggle = true;1325ins.load_store.index_format = midgard_index_address_u64;1326} else1327mir_set_offset(ctx, &ins, src_offset, is_shared ? LDST_SHARED : LDST_GLOBAL);13281329mir_set_intr_mask(&instr->instr, &ins, true);13301331emit_mir_instruction(ctx, ins);1332}13331334static void1335emit_varying_read(1336compiler_context *ctx,1337unsigned dest, unsigned offset,1338unsigned nr_comp, unsigned component,1339nir_src *indirect_offset, nir_alu_type type, bool flat)1340{1341/* XXX: Half-floats? */1342/* TODO: swizzle, mask */13431344midgard_instruction ins = m_ld_vary_32(dest, PACK_LDST_ATTRIB_OFS(offset));1345ins.mask = mask_of(nr_comp);1346ins.dest_type = type;13471348if (type == nir_type_float16) {1349/* Ensure we are aligned so we can pack it later */1350ins.mask = mask_of(ALIGN_POT(nr_comp, 2));1351}13521353for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i)1354ins.swizzle[0][i] = MIN2(i + component, COMPONENT_W);135513561357midgard_varying_params p = {1358.flat_shading = flat,1359.perspective_correction = 1,1360.interpolate_sample = true,1361};1362midgard_pack_varying_params(&ins.load_store, p);13631364if (indirect_offset) {1365ins.src[2] = nir_src_index(ctx, indirect_offset);1366ins.src_types[2] = nir_type_uint32;1367} else1368ins.load_store.index_reg = REGISTER_LDST_ZERO;13691370ins.load_store.arg_reg = REGISTER_LDST_ZERO;1371ins.load_store.index_format = midgard_index_address_u32;13721373/* Use the type appropriate load */1374switch (type) {1375case nir_type_uint32:1376case nir_type_bool32:1377ins.op = midgard_op_ld_vary_32u;1378break;1379case nir_type_int32:1380ins.op = midgard_op_ld_vary_32i;1381break;1382case nir_type_float32:1383ins.op = midgard_op_ld_vary_32;1384break;1385case nir_type_float16:1386ins.op = midgard_op_ld_vary_16;1387break;1388default:1389unreachable("Attempted to load unknown type");1390break;1391}13921393emit_mir_instruction(ctx, ins);1394}139513961397/* If `is_atomic` is true, we emit a `lea_image` since midgard doesn't not have special1398* image_atomic opcodes. The caller can then use that address to emit a normal atomic opcode. 
*/1399static midgard_instruction1400emit_image_op(compiler_context *ctx, nir_intrinsic_instr *instr, bool is_atomic)1401{1402enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);1403unsigned nr_attr = ctx->stage == MESA_SHADER_VERTEX ?1404util_bitcount64(ctx->nir->info.inputs_read) : 0;1405unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);1406bool is_array = nir_intrinsic_image_array(instr);1407bool is_store = instr->intrinsic == nir_intrinsic_image_store;14081409/* TODO: MSAA */1410assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");14111412unsigned coord_reg = nir_src_index(ctx, &instr->src[1]);1413emit_explicit_constant(ctx, coord_reg, coord_reg);14141415nir_src *index = &instr->src[0];1416bool is_direct = nir_src_is_const(*index);14171418/* For image opcodes, address is used as an index into the attribute descriptor */1419unsigned address = nr_attr;1420if (is_direct)1421address += nir_src_as_uint(*index);14221423midgard_instruction ins;1424if (is_store) { /* emit st_image_* */1425unsigned val = nir_src_index(ctx, &instr->src[3]);1426emit_explicit_constant(ctx, val, val);14271428nir_alu_type type = nir_intrinsic_src_type(instr);1429ins = st_image(type, val, PACK_LDST_ATTRIB_OFS(address));1430nir_alu_type base_type = nir_alu_type_get_base_type(type);1431ins.src_types[0] = base_type | nir_src_bit_size(instr->src[3]);1432} else if (is_atomic) { /* emit lea_image */1433unsigned dest = make_compiler_temp_reg(ctx);1434ins = m_lea_image(dest, PACK_LDST_ATTRIB_OFS(address));1435ins.mask = mask_of(2); /* 64-bit memory address */1436} else { /* emit ld_image_* */1437nir_alu_type type = nir_intrinsic_dest_type(instr);1438ins = ld_image(type, nir_dest_index(&instr->dest), PACK_LDST_ATTRIB_OFS(address));1439ins.mask = mask_of(nir_intrinsic_dest_components(instr));1440ins.dest_type = type;1441}14421443/* Coord reg */1444ins.src[1] = coord_reg;1445ins.src_types[1] = nir_type_uint16;1446if (nr_dim == 3 || is_array) {1447ins.load_store.bitsize_toggle = true;1448}14491450/* Image index reg */1451if (!is_direct) {1452ins.src[2] = nir_src_index(ctx, index);1453ins.src_types[2] = nir_type_uint32;1454} else1455ins.load_store.index_reg = REGISTER_LDST_ZERO;14561457emit_mir_instruction(ctx, ins);14581459return ins;1460}14611462static void1463emit_attr_read(1464compiler_context *ctx,1465unsigned dest, unsigned offset,1466unsigned nr_comp, nir_alu_type t)1467{1468midgard_instruction ins = m_ld_attr_32(dest, PACK_LDST_ATTRIB_OFS(offset));1469ins.load_store.arg_reg = REGISTER_LDST_ZERO;1470ins.load_store.index_reg = REGISTER_LDST_ZERO;1471ins.mask = mask_of(nr_comp);14721473/* Use the type appropriate load */1474switch (t) {1475case nir_type_uint:1476case nir_type_bool:1477ins.op = midgard_op_ld_attr_32u;1478break;1479case nir_type_int:1480ins.op = midgard_op_ld_attr_32i;1481break;1482case nir_type_float:1483ins.op = midgard_op_ld_attr_32;1484break;1485default:1486unreachable("Attempted to load unknown type");1487break;1488}14891490emit_mir_instruction(ctx, ins);1491}14921493static void1494emit_sysval_read(compiler_context *ctx, nir_instr *instr,1495unsigned nr_components, unsigned offset)1496{1497nir_dest nir_dest;14981499/* Figure out which uniform this is */1500unsigned sysval_ubo =1501MAX2(ctx->inputs->sysval_ubo, ctx->nir->info.num_ubos);1502int sysval = panfrost_sysval_for_instr(instr, &nir_dest);1503unsigned dest = nir_dest_index(&nir_dest);1504unsigned uniform =1505pan_lookup_sysval(ctx->sysval_to_id, &ctx->info->sysvals, sysval);15061507/* Emit the read itself -- this 
is never indirect */1508midgard_instruction *ins =1509emit_ubo_read(ctx, instr, dest, (uniform * 16) + offset, NULL, 0,1510sysval_ubo, nr_components);15111512ins->mask = mask_of(nr_components);1513}15141515static unsigned1516compute_builtin_arg(nir_intrinsic_op op)1517{1518switch (op) {1519case nir_intrinsic_load_workgroup_id:1520return REGISTER_LDST_GROUP_ID;1521case nir_intrinsic_load_local_invocation_id:1522return REGISTER_LDST_LOCAL_THREAD_ID;1523case nir_intrinsic_load_global_invocation_id:1524case nir_intrinsic_load_global_invocation_id_zero_base:1525return REGISTER_LDST_GLOBAL_THREAD_ID;1526default:1527unreachable("Invalid compute paramater loaded");1528}1529}15301531static void1532emit_fragment_store(compiler_context *ctx, unsigned src, unsigned src_z, unsigned src_s,1533enum midgard_rt_id rt, unsigned sample_iter)1534{1535assert(rt < ARRAY_SIZE(ctx->writeout_branch));1536assert(sample_iter < ARRAY_SIZE(ctx->writeout_branch[0]));15371538midgard_instruction *br = ctx->writeout_branch[rt][sample_iter];15391540assert(!br);15411542emit_explicit_constant(ctx, src, src);15431544struct midgard_instruction ins =1545v_branch(false, false);15461547bool depth_only = (rt == MIDGARD_ZS_RT);15481549ins.writeout = depth_only ? 0 : PAN_WRITEOUT_C;15501551/* Add dependencies */1552ins.src[0] = src;1553ins.src_types[0] = nir_type_uint32;15541555if (depth_only)1556ins.constants.u32[0] = 0xFF;1557else1558ins.constants.u32[0] = ((rt - MIDGARD_COLOR_RT0) << 8) | sample_iter;15591560for (int i = 0; i < 4; ++i)1561ins.swizzle[0][i] = i;15621563if (~src_z) {1564emit_explicit_constant(ctx, src_z, src_z);1565ins.src[2] = src_z;1566ins.src_types[2] = nir_type_uint32;1567ins.writeout |= PAN_WRITEOUT_Z;1568}1569if (~src_s) {1570emit_explicit_constant(ctx, src_s, src_s);1571ins.src[3] = src_s;1572ins.src_types[3] = nir_type_uint32;1573ins.writeout |= PAN_WRITEOUT_S;1574}15751576/* Emit the branch */1577br = emit_mir_instruction(ctx, ins);1578schedule_barrier(ctx);1579ctx->writeout_branch[rt][sample_iter] = br;15801581/* Push our current location = current block count - 1 = where we'll1582* jump to. 
Maybe a bit too clever for my own good */15831584br->branch.target_block = ctx->block_count - 1;1585}15861587static void1588emit_compute_builtin(compiler_context *ctx, nir_intrinsic_instr *instr)1589{1590unsigned reg = nir_dest_index(&instr->dest);1591midgard_instruction ins = m_ldst_mov(reg, 0);1592ins.mask = mask_of(3);1593ins.swizzle[0][3] = COMPONENT_X; /* xyzx */1594ins.load_store.arg_reg = compute_builtin_arg(instr->intrinsic);1595emit_mir_instruction(ctx, ins);1596}15971598static unsigned1599vertex_builtin_arg(nir_intrinsic_op op)1600{1601switch (op) {1602case nir_intrinsic_load_vertex_id_zero_base:1603return PAN_VERTEX_ID;1604case nir_intrinsic_load_instance_id:1605return PAN_INSTANCE_ID;1606default:1607unreachable("Invalid vertex builtin");1608}1609}16101611static void1612emit_vertex_builtin(compiler_context *ctx, nir_intrinsic_instr *instr)1613{1614unsigned reg = nir_dest_index(&instr->dest);1615emit_attr_read(ctx, reg, vertex_builtin_arg(instr->intrinsic), 1, nir_type_int);1616}16171618static void1619emit_special(compiler_context *ctx, nir_intrinsic_instr *instr, unsigned idx)1620{1621unsigned reg = nir_dest_index(&instr->dest);16221623midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0);1624ld.op = midgard_op_ld_special_32u;1625ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(idx);1626ld.load_store.index_reg = REGISTER_LDST_ZERO;16271628for (int i = 0; i < 4; ++i)1629ld.swizzle[0][i] = COMPONENT_X;16301631emit_mir_instruction(ctx, ld);1632}16331634static void1635emit_control_barrier(compiler_context *ctx)1636{1637midgard_instruction ins = {1638.type = TAG_TEXTURE_4,1639.dest = ~0,1640.src = { ~0, ~0, ~0, ~0 },1641.op = midgard_tex_op_barrier,1642};16431644emit_mir_instruction(ctx, ins);1645}16461647static unsigned1648mir_get_branch_cond(nir_src *src, bool *invert)1649{1650/* Wrap it. 
No swizzle since it's a scalar */16511652nir_alu_src alu = {1653.src = *src1654};16551656*invert = pan_has_source_mod(&alu, nir_op_inot);1657return nir_src_index(NULL, &alu.src);1658}16591660static uint8_t1661output_load_rt_addr(compiler_context *ctx, nir_intrinsic_instr *instr)1662{1663if (ctx->inputs->is_blend)1664return MIDGARD_COLOR_RT0 + ctx->inputs->blend.rt;16651666const nir_variable *var;1667var = nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out, nir_intrinsic_base(instr));1668assert(var);16691670unsigned loc = var->data.location;16711672if (loc >= FRAG_RESULT_DATA0)1673return loc - FRAG_RESULT_DATA0;16741675if (loc == FRAG_RESULT_DEPTH)1676return 0x1F;1677if (loc == FRAG_RESULT_STENCIL)1678return 0x1E;16791680unreachable("Invalid RT to load from");1681}16821683static void1684emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)1685{1686unsigned offset = 0, reg;16871688switch (instr->intrinsic) {1689case nir_intrinsic_discard_if:1690case nir_intrinsic_discard: {1691bool conditional = instr->intrinsic == nir_intrinsic_discard_if;1692struct midgard_instruction discard = v_branch(conditional, false);1693discard.branch.target_type = TARGET_DISCARD;16941695if (conditional) {1696discard.src[0] = mir_get_branch_cond(&instr->src[0],1697&discard.branch.invert_conditional);1698discard.src_types[0] = nir_type_uint32;1699}17001701emit_mir_instruction(ctx, discard);1702schedule_barrier(ctx);17031704break;1705}17061707case nir_intrinsic_image_load:1708case nir_intrinsic_image_store:1709emit_image_op(ctx, instr, false);1710break;17111712case nir_intrinsic_image_size: {1713unsigned nr_comp = nir_intrinsic_dest_components(instr);1714emit_sysval_read(ctx, &instr->instr, nr_comp, 0);1715break;1716}17171718case nir_intrinsic_load_ubo:1719case nir_intrinsic_load_global:1720case nir_intrinsic_load_global_constant:1721case nir_intrinsic_load_shared:1722case nir_intrinsic_load_scratch:1723case nir_intrinsic_load_input:1724case nir_intrinsic_load_kernel_input:1725case nir_intrinsic_load_interpolated_input: {1726bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;1727bool is_global = instr->intrinsic == nir_intrinsic_load_global ||1728instr->intrinsic == nir_intrinsic_load_global_constant;1729bool is_shared = instr->intrinsic == nir_intrinsic_load_shared;1730bool is_scratch = instr->intrinsic == nir_intrinsic_load_scratch;1731bool is_flat = instr->intrinsic == nir_intrinsic_load_input;1732bool is_kernel = instr->intrinsic == nir_intrinsic_load_kernel_input;1733bool is_interp = instr->intrinsic == nir_intrinsic_load_interpolated_input;17341735/* Get the base type of the intrinsic */1736/* TODO: Infer type? Does it matter? */1737nir_alu_type t =1738(is_interp) ? nir_type_float :1739(is_flat) ? nir_intrinsic_dest_type(instr) :1740nir_type_uint;17411742t = nir_alu_type_get_base_type(t);17431744if (!(is_ubo || is_global || is_scratch)) {1745offset = nir_intrinsic_base(instr);1746}17471748unsigned nr_comp = nir_intrinsic_dest_components(instr);17491750nir_src *src_offset = nir_get_io_offset_src(instr);17511752bool direct = nir_src_is_const(*src_offset);1753nir_src *indirect_offset = direct ? 
NULL : src_offset;17541755if (direct)1756offset += nir_src_as_uint(*src_offset);17571758/* We may need to apply a fractional offset */1759int component = (is_flat || is_interp) ?1760nir_intrinsic_component(instr) : 0;1761reg = nir_dest_index(&instr->dest);17621763if (is_kernel) {1764emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, 0, nr_comp);1765} else if (is_ubo) {1766nir_src index = instr->src[0];17671768/* TODO: Is indirect block number possible? */1769assert(nir_src_is_const(index));17701771uint32_t uindex = nir_src_as_uint(index);1772emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, uindex, nr_comp);1773} else if (is_global || is_shared || is_scratch) {1774unsigned seg = is_global ? LDST_GLOBAL : (is_shared ? LDST_SHARED : LDST_SCRATCH);1775emit_global(ctx, &instr->instr, true, reg, src_offset, seg);1776} else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend) {1777emit_varying_read(ctx, reg, offset, nr_comp, component, indirect_offset, t | nir_dest_bit_size(instr->dest), is_flat);1778} else if (ctx->inputs->is_blend) {1779/* ctx->blend_input will be precoloured to r0/r2, where1780* the input is preloaded */17811782unsigned *input = offset ? &ctx->blend_src1 : &ctx->blend_input;17831784if (*input == ~0)1785*input = reg;1786else1787emit_mir_instruction(ctx, v_mov(*input, reg));1788} else if (ctx->stage == MESA_SHADER_VERTEX) {1789emit_attr_read(ctx, reg, offset, nr_comp, t);1790} else {1791DBG("Unknown load\n");1792assert(0);1793}17941795break;1796}17971798/* Handled together with load_interpolated_input */1799case nir_intrinsic_load_barycentric_pixel:1800case nir_intrinsic_load_barycentric_centroid:1801case nir_intrinsic_load_barycentric_sample:1802break;18031804/* Reads 128-bit value raw off the tilebuffer during blending, tasty */18051806case nir_intrinsic_load_raw_output_pan: {1807reg = nir_dest_index(&instr->dest);18081809/* T720 and below use different blend opcodes with slightly1810* different semantics than T760 and up */18111812midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0);18131814unsigned target = output_load_rt_addr(ctx, instr);1815ld.load_store.index_comp = target & 0x3;1816ld.load_store.index_reg = target >> 2;18171818if (nir_src_is_const(instr->src[0])) {1819unsigned sample = nir_src_as_uint(instr->src[0]);1820ld.load_store.arg_comp = sample & 0x3;1821ld.load_store.arg_reg = sample >> 2;1822} else {1823/* Enable sample index via register. 
*/1824ld.load_store.signed_offset |= 1;1825ld.src[1] = nir_src_index(ctx, &instr->src[0]);1826ld.src_types[1] = nir_type_int32;1827}18281829if (ctx->quirks & MIDGARD_OLD_BLEND) {1830ld.op = midgard_op_ld_special_32u;1831ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(16);1832ld.load_store.index_reg = REGISTER_LDST_ZERO;1833}18341835emit_mir_instruction(ctx, ld);1836break;1837}18381839case nir_intrinsic_load_output: {1840reg = nir_dest_index(&instr->dest);18411842unsigned bits = nir_dest_bit_size(instr->dest);18431844midgard_instruction ld;1845if (bits == 16)1846ld = m_ld_tilebuffer_16f(reg, 0);1847else1848ld = m_ld_tilebuffer_32f(reg, 0);18491850unsigned index = output_load_rt_addr(ctx, instr);1851ld.load_store.index_comp = index & 0x3;1852ld.load_store.index_reg = index >> 2;18531854for (unsigned c = 4; c < 16; ++c)1855ld.swizzle[0][c] = 0;18561857if (ctx->quirks & MIDGARD_OLD_BLEND) {1858if (bits == 16)1859ld.op = midgard_op_ld_special_16f;1860else1861ld.op = midgard_op_ld_special_32f;1862ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(1);1863ld.load_store.index_reg = REGISTER_LDST_ZERO;1864}18651866emit_mir_instruction(ctx, ld);1867break;1868}18691870case nir_intrinsic_store_output:1871case nir_intrinsic_store_combined_output_pan:1872assert(nir_src_is_const(instr->src[1]) && "no indirect outputs");18731874offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]);18751876reg = nir_src_index(ctx, &instr->src[0]);18771878if (ctx->stage == MESA_SHADER_FRAGMENT) {1879bool combined = instr->intrinsic ==1880nir_intrinsic_store_combined_output_pan;18811882const nir_variable *var;1883var = nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out,1884nir_intrinsic_base(instr));1885assert(var);18861887/* Dual-source blend writeout is done by leaving the1888* value in r2 for the blend shader to use. */1889if (var->data.index) {1890if (instr->src[0].is_ssa) {1891emit_explicit_constant(ctx, reg, reg);18921893unsigned out = make_compiler_temp(ctx);18941895midgard_instruction ins = v_mov(reg, out);1896emit_mir_instruction(ctx, ins);18971898ctx->blend_src1 = out;1899} else {1900ctx->blend_src1 = reg;1901}19021903break;1904}19051906enum midgard_rt_id rt;1907if (var->data.location >= FRAG_RESULT_DATA0)1908rt = MIDGARD_COLOR_RT0 + var->data.location -1909FRAG_RESULT_DATA0;1910else if (combined)1911rt = MIDGARD_ZS_RT;1912else1913unreachable("bad rt");19141915unsigned reg_z = ~0, reg_s = ~0;1916if (combined) {1917unsigned writeout = nir_intrinsic_component(instr);1918if (writeout & PAN_WRITEOUT_Z)1919reg_z = nir_src_index(ctx, &instr->src[2]);1920if (writeout & PAN_WRITEOUT_S)1921reg_s = nir_src_index(ctx, &instr->src[3]);1922}19231924emit_fragment_store(ctx, reg, reg_z, reg_s, rt, 0);1925} else if (ctx->stage == MESA_SHADER_VERTEX) {1926assert(instr->intrinsic == nir_intrinsic_store_output);19271928/* We should have been vectorized, though we don't1929* currently check that st_vary is emitted only once1930* per slot (this is relevant, since there's not a mask1931* parameter available on the store [set to 0 by the1932* blob]). We do respect the component by adjusting the1933* swizzle. If this is a constant source, we'll need to1934* emit that explicitly. 
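*
* As a worked example of the swizzle adjustment below (hypothetical
* store): writing a vec2 source at destination component 2 yields the
* swizzle (x, x, x, y), so the source .xy pair lands in the .zw slots
* of the varying while the unwritten components merely repeat .x.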
*/19351936emit_explicit_constant(ctx, reg, reg);19371938unsigned dst_component = nir_intrinsic_component(instr);1939unsigned nr_comp = nir_src_num_components(instr->src[0]);19401941midgard_instruction st = m_st_vary_32(reg, PACK_LDST_ATTRIB_OFS(offset));1942st.load_store.arg_reg = REGISTER_LDST_ZERO;1943st.load_store.index_format = midgard_index_address_u32;1944st.load_store.index_reg = REGISTER_LDST_ZERO;19451946switch (nir_alu_type_get_base_type(nir_intrinsic_src_type(instr))) {1947case nir_type_uint:1948case nir_type_bool:1949st.op = midgard_op_st_vary_32u;1950break;1951case nir_type_int:1952st.op = midgard_op_st_vary_32i;1953break;1954case nir_type_float:1955st.op = midgard_op_st_vary_32;1956break;1957default:1958unreachable("Attempted to store unknown type");1959break;1960}19611962/* nir_intrinsic_component(store_intr) encodes the1963* destination component start. Source component offset1964* adjustment is taken care of in1965* install_registers_instr(), when offset_swizzle() is1966* called.1967*/1968unsigned src_component = COMPONENT_X;19691970assert(nr_comp > 0);1971for (unsigned i = 0; i < ARRAY_SIZE(st.swizzle); ++i) {1972st.swizzle[0][i] = src_component;1973if (i >= dst_component && i < dst_component + nr_comp - 1)1974src_component++;1975}19761977emit_mir_instruction(ctx, st);1978} else {1979DBG("Unknown store\n");1980assert(0);1981}19821983break;19841985/* Special case of store_output for lowered blend shaders */1986case nir_intrinsic_store_raw_output_pan:1987assert (ctx->stage == MESA_SHADER_FRAGMENT);1988reg = nir_src_index(ctx, &instr->src[0]);1989for (unsigned s = 0; s < ctx->blend_sample_iterations; s++)1990emit_fragment_store(ctx, reg, ~0, ~0,1991ctx->inputs->blend.rt + MIDGARD_COLOR_RT0,1992s);1993break;19941995case nir_intrinsic_store_global:1996case nir_intrinsic_store_shared:1997case nir_intrinsic_store_scratch:1998reg = nir_src_index(ctx, &instr->src[0]);1999emit_explicit_constant(ctx, reg, reg);20002001unsigned seg;2002if (instr->intrinsic == nir_intrinsic_store_global)2003seg = LDST_GLOBAL;2004else if (instr->intrinsic == nir_intrinsic_store_shared)2005seg = LDST_SHARED;2006else2007seg = LDST_SCRATCH;20082009emit_global(ctx, &instr->instr, false, reg, &instr->src[1], seg);2010break;20112012case nir_intrinsic_load_first_vertex:2013case nir_intrinsic_load_ssbo_address:2014case nir_intrinsic_load_work_dim:2015emit_sysval_read(ctx, &instr->instr, 1, 0);2016break;20172018case nir_intrinsic_load_base_vertex:2019emit_sysval_read(ctx, &instr->instr, 1, 4);2020break;20212022case nir_intrinsic_load_base_instance:2023emit_sysval_read(ctx, &instr->instr, 1, 8);2024break;20252026case nir_intrinsic_load_sample_positions_pan:2027emit_sysval_read(ctx, &instr->instr, 2, 0);2028break;20292030case nir_intrinsic_get_ssbo_size:2031emit_sysval_read(ctx, &instr->instr, 1, 8);2032break;20332034case nir_intrinsic_load_viewport_scale:2035case nir_intrinsic_load_viewport_offset:2036case nir_intrinsic_load_num_workgroups:2037case nir_intrinsic_load_sampler_lod_parameters_pan:2038case nir_intrinsic_load_workgroup_size:2039emit_sysval_read(ctx, &instr->instr, 3, 0);2040break;20412042case nir_intrinsic_load_workgroup_id:2043case nir_intrinsic_load_local_invocation_id:2044case nir_intrinsic_load_global_invocation_id:2045case nir_intrinsic_load_global_invocation_id_zero_base:2046emit_compute_builtin(ctx, instr);2047break;20482049case nir_intrinsic_load_vertex_id_zero_base:2050case nir_intrinsic_load_instance_id:2051emit_vertex_builtin(ctx, instr);2052break;20532054case 
nir_intrinsic_load_sample_mask_in:2055emit_special(ctx, instr, 96);2056break;20572058case nir_intrinsic_load_sample_id:2059emit_special(ctx, instr, 97);2060break;20612062/* Midgard doesn't seem to want special handling */2063case nir_intrinsic_memory_barrier:2064case nir_intrinsic_memory_barrier_buffer:2065case nir_intrinsic_memory_barrier_image:2066case nir_intrinsic_memory_barrier_shared:2067case nir_intrinsic_group_memory_barrier:2068break;20692070case nir_intrinsic_control_barrier:2071schedule_barrier(ctx);2072emit_control_barrier(ctx);2073schedule_barrier(ctx);2074break;20752076ATOMIC_CASE(ctx, instr, add, add);2077ATOMIC_CASE(ctx, instr, and, and);2078ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg);2079ATOMIC_CASE(ctx, instr, exchange, xchg);2080ATOMIC_CASE(ctx, instr, imax, imax);2081ATOMIC_CASE(ctx, instr, imin, imin);2082ATOMIC_CASE(ctx, instr, or, or);2083ATOMIC_CASE(ctx, instr, umax, umax);2084ATOMIC_CASE(ctx, instr, umin, umin);2085ATOMIC_CASE(ctx, instr, xor, xor);20862087IMAGE_ATOMIC_CASE(ctx, instr, add, add);2088IMAGE_ATOMIC_CASE(ctx, instr, and, and);2089IMAGE_ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg);2090IMAGE_ATOMIC_CASE(ctx, instr, exchange, xchg);2091IMAGE_ATOMIC_CASE(ctx, instr, imax, imax);2092IMAGE_ATOMIC_CASE(ctx, instr, imin, imin);2093IMAGE_ATOMIC_CASE(ctx, instr, or, or);2094IMAGE_ATOMIC_CASE(ctx, instr, umax, umax);2095IMAGE_ATOMIC_CASE(ctx, instr, umin, umin);2096IMAGE_ATOMIC_CASE(ctx, instr, xor, xor);20972098default:2099fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);2100assert(0);2101break;2102}2103}21042105/* Returns dimension with 0 special casing cubemaps */2106static unsigned2107midgard_tex_format(enum glsl_sampler_dim dim)2108{2109switch (dim) {2110case GLSL_SAMPLER_DIM_1D:2111case GLSL_SAMPLER_DIM_BUF:2112return 1;21132114case GLSL_SAMPLER_DIM_2D:2115case GLSL_SAMPLER_DIM_MS:2116case GLSL_SAMPLER_DIM_EXTERNAL:2117case GLSL_SAMPLER_DIM_RECT:2118return 2;21192120case GLSL_SAMPLER_DIM_3D:2121return 3;21222123case GLSL_SAMPLER_DIM_CUBE:2124return 0;21252126default:2127DBG("Unknown sampler dim type\n");2128assert(0);2129return 0;2130}2131}21322133/* Tries to attach an explicit LOD or bias as a constant. 
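The encoding splits the level into a
 * signed integer part and a positive fraction (stored as a ubyte), so
 * e.g. a constant LOD of -0.25 becomes bias_int = -1 with a fractional
 * bias of 0.75.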
Returns whether this2134* was successful */21352136static bool2137pan_attach_constant_bias(2138compiler_context *ctx,2139nir_src lod,2140midgard_texture_word *word)2141{2142/* To attach as constant, it has to *be* constant */21432144if (!nir_src_is_const(lod))2145return false;21462147float f = nir_src_as_float(lod);21482149/* Break into fixed-point */2150signed lod_int = f;2151float lod_frac = f - lod_int;21522153/* Carry over negative fractions */2154if (lod_frac < 0.0) {2155lod_int--;2156lod_frac += 1.0;2157}21582159/* Encode */2160word->bias = float_to_ubyte(lod_frac);2161word->bias_int = lod_int;21622163return true;2164}21652166static enum mali_texture_mode2167mdg_texture_mode(nir_tex_instr *instr)2168{2169if (instr->op == nir_texop_tg4 && instr->is_shadow)2170return TEXTURE_GATHER_SHADOW;2171else if (instr->op == nir_texop_tg4)2172return TEXTURE_GATHER_X + instr->component;2173else if (instr->is_shadow)2174return TEXTURE_SHADOW;2175else2176return TEXTURE_NORMAL;2177}21782179static void2180set_tex_coord(compiler_context *ctx, nir_tex_instr *instr,2181midgard_instruction *ins)2182{2183int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);21842185assert(coord_idx >= 0);21862187int comparator_idx = nir_tex_instr_src_index(instr, nir_tex_src_comparator);2188int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);2189assert(comparator_idx < 0 || ms_idx < 0);2190int ms_or_comparator_idx = ms_idx >= 0 ? ms_idx : comparator_idx;21912192unsigned coords = nir_src_index(ctx, &instr->src[coord_idx].src);21932194emit_explicit_constant(ctx, coords, coords);21952196ins->src_types[1] = nir_tex_instr_src_type(instr, coord_idx) |2197nir_src_bit_size(instr->src[coord_idx].src);21982199unsigned nr_comps = instr->coord_components;2200unsigned written_mask = 0, write_mask = 0;22012202/* Initialize all components to coord.x which is expected to always be2203* present. Swizzle is updated below based on the texture dimension2204* and extra attributes that are packed in the coordinate argument.2205*/2206for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++)2207ins->swizzle[1][c] = COMPONENT_X;22082209/* Shadow ref value is part of the coordinates if there's no comparator2210* source, in that case it's always placed in the last component.2211* Midgard wants the ref value in coord.z.2212*/2213if (instr->is_shadow && comparator_idx < 0) {2214ins->swizzle[1][COMPONENT_Z] = --nr_comps;2215write_mask |= 1 << COMPONENT_Z;2216}22172218/* The array index is the last component if there's no shadow ref value2219* or second last if there's one. 
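(A 2D shadow array without a separate
         * comparator source, for instance, arrives here as
         * (x, y, layer, ref).)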
We already decremented the number of
         * components to account for the shadow ref value above.
         * Midgard wants the array index in coord.w.
         */
        if (instr->is_array) {
                ins->swizzle[1][COMPONENT_W] = --nr_comps;
                write_mask |= 1 << COMPONENT_W;
        }

        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
                /* texelFetch is undefined on samplerCube */
                assert(ins->op != midgard_tex_op_fetch);

                ins->src[1] = make_compiler_temp_reg(ctx);

                /* For cubemaps, we use a special ld/st op to select the face
                 * and copy the xy into the texture register
                 */
                midgard_instruction ld = m_ld_cubemap_coords(ins->src[1], 0);
                ld.src[1] = coords;
                ld.src_types[1] = ins->src_types[1];
                ld.mask = 0x3; /* xy */
                ld.load_store.bitsize_toggle = true;
                ld.swizzle[1][3] = COMPONENT_X;
                emit_mir_instruction(ctx, ld);

                /* We packed cube coordinates (X,Y,Z) into (X,Y), update the
                 * written mask accordingly and decrement the number of
                 * components
                 */
                nr_comps--;
                written_mask |= 3;
        }

        /* Now flag tex coord components that have not been written yet */
        write_mask |= mask_of(nr_comps) & ~written_mask;
        for (unsigned c = 0; c < nr_comps; c++)
                ins->swizzle[1][c] = c;

        /* Sample index and shadow ref are expected in coord.z */
        if (ms_or_comparator_idx >= 0) {
                assert(!((write_mask | written_mask) & (1 << COMPONENT_Z)));

                unsigned sample_or_ref =
                        nir_src_index(ctx, &instr->src[ms_or_comparator_idx].src);

                emit_explicit_constant(ctx, sample_or_ref, sample_or_ref);

                if (ins->src[1] == ~0)
                        ins->src[1] = make_compiler_temp_reg(ctx);

                midgard_instruction mov = v_mov(sample_or_ref, ins->src[1]);

                for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++)
                        mov.swizzle[1][c] = COMPONENT_X;

                mov.mask = 1 << COMPONENT_Z;
                written_mask |= 1 << COMPONENT_Z;
                ins->swizzle[1][COMPONENT_Z] = COMPONENT_Z;
                emit_mir_instruction(ctx, mov);
        }

        /* Texel fetch coordinates use all four elements (xyz/index) regardless
         * of texture dimensionality, which means it's necessary to zero the
         * unused components to keep everything happy.
         */
        if (ins->op == midgard_tex_op_fetch &&
            (written_mask | write_mask) != 0xF) {
                if (ins->src[1] == ~0)
                        ins->src[1] = make_compiler_temp_reg(ctx);

                /* mov index.zw, #0, or generalized */
                midgard_instruction mov =
                        v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), ins->src[1]);
                mov.has_constants = true;
                mov.mask = (written_mask | write_mask) ^ 0xF;
                emit_mir_instruction(ctx, mov);
                for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) {
                        if (mov.mask & (1 << c))
                                ins->swizzle[1][c] = c;
                }
        }

        if (ins->src[1] == ~0) {
                /* No temporary reg created, use the src coords directly */
                ins->src[1] = coords;
        } else if (write_mask) {
                /* Move the remaining coordinates to the temporary reg */
                midgard_instruction mov = v_mov(coords, ins->src[1]);

                for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) {
                        if ((1 << c) & write_mask) {
                                mov.swizzle[1][c] = ins->swizzle[1][c];
                                ins->swizzle[1][c] = c;
                        } else {
                                mov.swizzle[1][c] = COMPONENT_X;
                        }
                }

                mov.mask = write_mask;
                emit_mir_instruction(ctx, mov);
        }
}

static void
emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
                  unsigned midgard_texop)
{
        /* TODO */
        //assert (!instr->sampler);

        nir_dest *dest = &instr->dest;

        int texture_index = instr->texture_index;
        int sampler_index =
instr->sampler_index;23342335nir_alu_type dest_base = nir_alu_type_get_base_type(instr->dest_type);23362337/* texture instructions support float outmods */2338unsigned outmod = midgard_outmod_none;2339if (dest_base == nir_type_float) {2340outmod = mir_determine_float_outmod(ctx, &dest, 0);2341}23422343midgard_instruction ins = {2344.type = TAG_TEXTURE_4,2345.mask = 0xF,2346.dest = nir_dest_index(dest),2347.src = { ~0, ~0, ~0, ~0 },2348.dest_type = instr->dest_type,2349.swizzle = SWIZZLE_IDENTITY_4,2350.outmod = outmod,2351.op = midgard_texop,2352.texture = {2353.format = midgard_tex_format(instr->sampler_dim),2354.texture_handle = texture_index,2355.sampler_handle = sampler_index,2356.mode = mdg_texture_mode(instr)2357}2358};23592360if (instr->is_shadow && !instr->is_new_style_shadow && instr->op != nir_texop_tg4)2361for (int i = 0; i < 4; ++i)2362ins.swizzle[0][i] = COMPONENT_X;23632364for (unsigned i = 0; i < instr->num_srcs; ++i) {2365int index = nir_src_index(ctx, &instr->src[i].src);2366unsigned sz = nir_src_bit_size(instr->src[i].src);2367nir_alu_type T = nir_tex_instr_src_type(instr, i) | sz;23682369switch (instr->src[i].src_type) {2370case nir_tex_src_coord:2371set_tex_coord(ctx, instr, &ins);2372break;23732374case nir_tex_src_bias:2375case nir_tex_src_lod: {2376/* Try as a constant if we can */23772378bool is_txf = midgard_texop == midgard_tex_op_fetch;2379if (!is_txf && pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture))2380break;23812382ins.texture.lod_register = true;2383ins.src[2] = index;2384ins.src_types[2] = T;23852386for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c)2387ins.swizzle[2][c] = COMPONENT_X;23882389emit_explicit_constant(ctx, index, index);23902391break;2392};23932394case nir_tex_src_offset: {2395ins.texture.offset_register = true;2396ins.src[3] = index;2397ins.src_types[3] = T;23982399for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c)2400ins.swizzle[3][c] = (c > COMPONENT_Z) ? 
0 : c;24012402emit_explicit_constant(ctx, index, index);2403break;2404};24052406case nir_tex_src_comparator:2407case nir_tex_src_ms_index:2408/* Nothing to do, handled in set_tex_coord() */2409break;24102411default: {2412fprintf(stderr, "Unknown texture source type: %d\n", instr->src[i].src_type);2413assert(0);2414}2415}2416}24172418emit_mir_instruction(ctx, ins);2419}24202421static void2422emit_tex(compiler_context *ctx, nir_tex_instr *instr)2423{2424switch (instr->op) {2425case nir_texop_tex:2426case nir_texop_txb:2427emit_texop_native(ctx, instr, midgard_tex_op_normal);2428break;2429case nir_texop_txl:2430case nir_texop_tg4:2431emit_texop_native(ctx, instr, midgard_tex_op_gradient);2432break;2433case nir_texop_txf:2434case nir_texop_txf_ms:2435emit_texop_native(ctx, instr, midgard_tex_op_fetch);2436break;2437case nir_texop_txs:2438emit_sysval_read(ctx, &instr->instr, 4, 0);2439break;2440default: {2441fprintf(stderr, "Unhandled texture op: %d\n", instr->op);2442assert(0);2443}2444}2445}24462447static void2448emit_jump(compiler_context *ctx, nir_jump_instr *instr)2449{2450switch (instr->type) {2451case nir_jump_break: {2452/* Emit a branch out of the loop */2453struct midgard_instruction br = v_branch(false, false);2454br.branch.target_type = TARGET_BREAK;2455br.branch.target_break = ctx->current_loop_depth;2456emit_mir_instruction(ctx, br);2457break;2458}24592460default:2461DBG("Unknown jump type %d\n", instr->type);2462break;2463}2464}24652466static void2467emit_instr(compiler_context *ctx, struct nir_instr *instr)2468{2469switch (instr->type) {2470case nir_instr_type_load_const:2471emit_load_const(ctx, nir_instr_as_load_const(instr));2472break;24732474case nir_instr_type_intrinsic:2475emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));2476break;24772478case nir_instr_type_alu:2479emit_alu(ctx, nir_instr_as_alu(instr));2480break;24812482case nir_instr_type_tex:2483emit_tex(ctx, nir_instr_as_tex(instr));2484break;24852486case nir_instr_type_jump:2487emit_jump(ctx, nir_instr_as_jump(instr));2488break;24892490case nir_instr_type_ssa_undef:2491/* Spurious */2492break;24932494default:2495DBG("Unhandled instruction type\n");2496break;2497}2498}249925002501/* ALU instructions can inline or embed constants, which decreases register2502* pressure and saves space. */25032504#define CONDITIONAL_ATTACH(idx) { \2505void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[idx] + 1); \2506\2507if (entry) { \2508attach_constants(ctx, alu, entry, alu->src[idx] + 1); \2509alu->src[idx] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \2510} \2511}25122513static void2514inline_alu_constants(compiler_context *ctx, midgard_block *block)2515{2516mir_foreach_instr_in_block(block, alu) {2517/* Other instructions cannot inline constants */2518if (alu->type != TAG_ALU_4) continue;2519if (alu->compact_branch) continue;25202521/* If there is already a constant here, we can do nothing */2522if (alu->has_constants) continue;25232524CONDITIONAL_ATTACH(0);25252526if (!alu->has_constants) {2527CONDITIONAL_ATTACH(1)2528} else if (!alu->inline_constant) {2529/* Corner case: _two_ vec4 constants, for instance with a2530* csel. For this case, we can only use a constant2531* register for one, we'll have to emit a move for the2532* other. 
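Schematically,
                 *
                 *    op dest, #A, #B
                 *
                 * becomes
                 *
                 *    mov temp, #B
                 *    op dest, #A, temp
                 *
                 * so only one of the two constants still needs the embedded
                 * slot.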
*/25332534void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[1] + 1);2535unsigned scratch = make_compiler_temp(ctx);25362537if (entry) {2538midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), scratch);2539attach_constants(ctx, &ins, entry, alu->src[1] + 1);25402541/* Set the source */2542alu->src[1] = scratch;25432544/* Inject us -before- the last instruction which set r31 */2545mir_insert_instruction_before(ctx, mir_prev_op(alu), ins);2546}2547}2548}2549}25502551unsigned2552max_bitsize_for_alu(midgard_instruction *ins)2553{2554unsigned max_bitsize = 0;2555for (int i = 0; i < MIR_SRC_COUNT; i++) {2556if (ins->src[i] == ~0) continue;2557unsigned src_bitsize = nir_alu_type_get_type_size(ins->src_types[i]);2558max_bitsize = MAX2(src_bitsize, max_bitsize);2559}2560unsigned dst_bitsize = nir_alu_type_get_type_size(ins->dest_type);2561max_bitsize = MAX2(dst_bitsize, max_bitsize);25622563/* We don't have fp16 LUTs, so we'll want to emit code like:2564*2565* vlut.fsinr hr0, hr02566*2567* where both input and output are 16-bit but the operation is carried2568* out in 32-bit2569*/25702571switch (ins->op) {2572case midgard_alu_op_fsqrt:2573case midgard_alu_op_frcp:2574case midgard_alu_op_frsqrt:2575case midgard_alu_op_fsinpi:2576case midgard_alu_op_fcospi:2577case midgard_alu_op_fexp2:2578case midgard_alu_op_flog2:2579max_bitsize = MAX2(max_bitsize, 32);2580break;25812582default:2583break;2584}25852586/* High implies computing at a higher bitsize, e.g umul_high of 32-bit2587* requires computing at 64-bit */2588if (midgard_is_integer_out_op(ins->op) && ins->outmod == midgard_outmod_keephi) {2589max_bitsize *= 2;2590assert(max_bitsize <= 64);2591}25922593return max_bitsize;2594}25952596midgard_reg_mode2597reg_mode_for_bitsize(unsigned bitsize)2598{2599switch (bitsize) {2600/* use 16 pipe for 8 since we don't support vec16 yet */2601case 8:2602case 16:2603return midgard_reg_mode_16;2604case 32:2605return midgard_reg_mode_32;2606case 64:2607return midgard_reg_mode_64;2608default:2609unreachable("invalid bit size");2610}2611}26122613/* Midgard supports two types of constants, embedded constants (128-bit) and2614* inline constants (16-bit). Sometimes, especially with scalar ops, embedded2615* constants can be demoted to inline constants, for space savings and2616* sometimes a performance boost */26172618static void2619embedded_to_inline_constant(compiler_context *ctx, midgard_block *block)2620{2621mir_foreach_instr_in_block(block, ins) {2622if (!ins->has_constants) continue;2623if (ins->has_inline_constant) continue;26242625unsigned max_bitsize = max_bitsize_for_alu(ins);26262627/* We can inline 32-bit (sometimes) or 16-bit (usually) */2628bool is_16 = max_bitsize == 16;2629bool is_32 = max_bitsize == 32;26302631if (!(is_16 || is_32))2632continue;26332634/* src1 cannot be an inline constant due to encoding2635* restrictions. So, if possible we try to flip the arguments2636* in that case */26372638int op = ins->op;26392640if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT) &&2641alu_opcode_props[op].props & OP_COMMUTES) {2642mir_flip(ins);2643}26442645if (ins->src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {2646/* Component is from the swizzle. 
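An inline constant is a single scalar
                 * immediate, so we have to read the one lane the op will
                 * actually use (with a writemask of 0x6, say, that is
                 * swizzle[1][COMPONENT_Y]).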
Take a nonzero component */2647assert(ins->mask);2648unsigned first_comp = ffs(ins->mask) - 1;2649unsigned component = ins->swizzle[1][first_comp];26502651/* Scale constant appropriately, if we can legally */2652int16_t scaled_constant = 0;26532654if (is_16) {2655scaled_constant = ins->constants.u16[component];2656} else if (midgard_is_integer_op(op)) {2657scaled_constant = ins->constants.u32[component];26582659/* Constant overflow after resize */2660if (scaled_constant != ins->constants.u32[component])2661continue;2662} else {2663float original = ins->constants.f32[component];2664scaled_constant = _mesa_float_to_half(original);26652666/* Check for loss of precision. If this is2667* mediump, we don't care, but for a highp2668* shader, we need to pay attention. NIR2669* doesn't yet tell us which mode we're in!2670* Practically this prevents most constants2671* from being inlined, sadly. */26722673float fp32 = _mesa_half_to_float(scaled_constant);26742675if (fp32 != original)2676continue;2677}26782679/* Should've been const folded */2680if (ins->src_abs[1] || ins->src_neg[1])2681continue;26822683/* Make sure that the constant is not itself a vector2684* by checking if all accessed values are the same. */26852686const midgard_constants *cons = &ins->constants;2687uint32_t value = is_16 ? cons->u16[component] : cons->u32[component];26882689bool is_vector = false;2690unsigned mask = effective_writemask(ins->op, ins->mask);26912692for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) {2693/* We only care if this component is actually used */2694if (!(mask & (1 << c)))2695continue;26962697uint32_t test = is_16 ?2698cons->u16[ins->swizzle[1][c]] :2699cons->u32[ins->swizzle[1][c]];27002701if (test != value) {2702is_vector = true;2703break;2704}2705}27062707if (is_vector)2708continue;27092710/* Get rid of the embedded constant */2711ins->has_constants = false;2712ins->src[1] = ~0;2713ins->has_inline_constant = true;2714ins->inline_constant = scaled_constant;2715}2716}2717}27182719/* Dead code elimination for branches at the end of a block - only one branch2720* per block is legal semantically */27212722static void2723midgard_cull_dead_branch(compiler_context *ctx, midgard_block *block)2724{2725bool branched = false;27262727mir_foreach_instr_in_block_safe(block, ins) {2728if (!midgard_is_branch_unit(ins->unit)) continue;27292730if (branched)2731mir_remove_instruction(ins);27322733branched = true;2734}2735}27362737/* We want to force the invert on AND/OR to the second slot to legalize into2738* iandnot/iornot. The relevant patterns are for AND (and OR respectively)2739*2740* ~a & #b = ~a & ~(#~b)2741* ~a & b = b & ~a2742*/27432744static void2745midgard_legalize_invert(compiler_context *ctx, midgard_block *block)2746{2747mir_foreach_instr_in_block(block, ins) {2748if (ins->type != TAG_ALU_4) continue;27492750if (ins->op != midgard_alu_op_iand &&2751ins->op != midgard_alu_op_ior) continue;27522753if (ins->src_invert[1] || !ins->src_invert[0]) continue;27542755if (ins->has_inline_constant) {2756/* ~(#~a) = ~(~#a) = a, so valid, and forces both2757* inverts on */2758ins->inline_constant = ~ins->inline_constant;2759ins->src_invert[1] = true;2760} else {2761/* Flip to the right invert order. Note2762* has_inline_constant false by assumption on the2763* branch, so flipping makes sense. 
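After the flip, ~a & b reads as b & ~a
                 * with the invert on the second source, which is the form
                 * iandnot/iornot can encode.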
*/2764mir_flip(ins);2765}2766}2767}27682769static unsigned2770emit_fragment_epilogue(compiler_context *ctx, unsigned rt, unsigned sample_iter)2771{2772/* Loop to ourselves */2773midgard_instruction *br = ctx->writeout_branch[rt][sample_iter];2774struct midgard_instruction ins = v_branch(false, false);2775ins.writeout = br->writeout;2776ins.branch.target_block = ctx->block_count - 1;2777ins.constants.u32[0] = br->constants.u32[0];2778memcpy(&ins.src_types, &br->src_types, sizeof(ins.src_types));2779emit_mir_instruction(ctx, ins);27802781ctx->current_block->epilogue = true;2782schedule_barrier(ctx);2783return ins.branch.target_block;2784}27852786static midgard_block *2787emit_block_init(compiler_context *ctx)2788{2789midgard_block *this_block = ctx->after_block;2790ctx->after_block = NULL;27912792if (!this_block)2793this_block = create_empty_block(ctx);27942795list_addtail(&this_block->base.link, &ctx->blocks);27962797this_block->scheduled = false;2798++ctx->block_count;27992800/* Set up current block */2801list_inithead(&this_block->base.instructions);2802ctx->current_block = this_block;28032804return this_block;2805}28062807static midgard_block *2808emit_block(compiler_context *ctx, nir_block *block)2809{2810midgard_block *this_block = emit_block_init(ctx);28112812nir_foreach_instr(instr, block) {2813emit_instr(ctx, instr);2814++ctx->instruction_count;2815}28162817return this_block;2818}28192820static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list);28212822static void2823emit_if(struct compiler_context *ctx, nir_if *nif)2824{2825midgard_block *before_block = ctx->current_block;28262827/* Speculatively emit the branch, but we can't fill it in until later */2828bool inv = false;2829EMIT(branch, true, true);2830midgard_instruction *then_branch = mir_last_in_block(ctx->current_block);2831then_branch->src[0] = mir_get_branch_cond(&nif->condition, &inv);2832then_branch->src_types[0] = nir_type_uint32;2833then_branch->branch.invert_conditional = !inv;28342835/* Emit the two subblocks. 
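Their block indices aren't known until both
         * have been emitted, so the conditional branch above and the
         * then-exit jump get patched afterwards.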
*/2836midgard_block *then_block = emit_cf_list(ctx, &nif->then_list);2837midgard_block *end_then_block = ctx->current_block;28382839/* Emit a jump from the end of the then block to the end of the else */2840EMIT(branch, false, false);2841midgard_instruction *then_exit = mir_last_in_block(ctx->current_block);28422843/* Emit second block, and check if it's empty */28442845int else_idx = ctx->block_count;2846int count_in = ctx->instruction_count;2847midgard_block *else_block = emit_cf_list(ctx, &nif->else_list);2848midgard_block *end_else_block = ctx->current_block;2849int after_else_idx = ctx->block_count;28502851/* Now that we have the subblocks emitted, fix up the branches */28522853assert(then_block);2854assert(else_block);28552856if (ctx->instruction_count == count_in) {2857/* The else block is empty, so don't emit an exit jump */2858mir_remove_instruction(then_exit);2859then_branch->branch.target_block = after_else_idx;2860} else {2861then_branch->branch.target_block = else_idx;2862then_exit->branch.target_block = after_else_idx;2863}28642865/* Wire up the successors */28662867ctx->after_block = create_empty_block(ctx);28682869pan_block_add_successor(&before_block->base, &then_block->base);2870pan_block_add_successor(&before_block->base, &else_block->base);28712872pan_block_add_successor(&end_then_block->base, &ctx->after_block->base);2873pan_block_add_successor(&end_else_block->base, &ctx->after_block->base);2874}28752876static void2877emit_loop(struct compiler_context *ctx, nir_loop *nloop)2878{2879/* Remember where we are */2880midgard_block *start_block = ctx->current_block;28812882/* Allocate a loop number, growing the current inner loop depth */2883int loop_idx = ++ctx->current_loop_depth;28842885/* Get index from before the body so we can loop back later */2886int start_idx = ctx->block_count;28872888/* Emit the body itself */2889midgard_block *loop_block = emit_cf_list(ctx, &nloop->body);28902891/* Branch back to loop back */2892struct midgard_instruction br_back = v_branch(false, false);2893br_back.branch.target_block = start_idx;2894emit_mir_instruction(ctx, br_back);28952896/* Mark down that branch in the graph. */2897pan_block_add_successor(&start_block->base, &loop_block->base);2898pan_block_add_successor(&ctx->current_block->base, &loop_block->base);28992900/* Find the index of the block about to follow us (note: we don't add2901* one; blocks are 0-indexed so we get a fencepost problem) */2902int break_block_idx = ctx->block_count;29032904/* Fix up the break statements we emitted to point to the right place,2905* now that we can allocate a block number for them */2906ctx->after_block = create_empty_block(ctx);29072908mir_foreach_block_from(ctx, start_block, _block) {2909mir_foreach_instr_in_block(((midgard_block *) _block), ins) {2910if (ins->type != TAG_ALU_4) continue;2911if (!ins->compact_branch) continue;29122913/* We found a branch -- check the type to see if we need to do anything */2914if (ins->branch.target_type != TARGET_BREAK) continue;29152916/* It's a break! 
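target_break records the loop
                         * depth the jump wants to escape, so nested loops
                         * only patch their own breaks.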
Check if it's our break */2917if (ins->branch.target_break != loop_idx) continue;29182919/* Okay, cool, we're breaking out of this loop.2920* Rewrite from a break to a goto */29212922ins->branch.target_type = TARGET_GOTO;2923ins->branch.target_block = break_block_idx;29242925pan_block_add_successor(_block, &ctx->after_block->base);2926}2927}29282929/* Now that we've finished emitting the loop, free up the depth again2930* so we play nice with recursion amid nested loops */2931--ctx->current_loop_depth;29322933/* Dump loop stats */2934++ctx->loop_count;2935}29362937static midgard_block *2938emit_cf_list(struct compiler_context *ctx, struct exec_list *list)2939{2940midgard_block *start_block = NULL;29412942foreach_list_typed(nir_cf_node, node, node, list) {2943switch (node->type) {2944case nir_cf_node_block: {2945midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node));29462947if (!start_block)2948start_block = block;29492950break;2951}29522953case nir_cf_node_if:2954emit_if(ctx, nir_cf_node_as_if(node));2955break;29562957case nir_cf_node_loop:2958emit_loop(ctx, nir_cf_node_as_loop(node));2959break;29602961case nir_cf_node_function:2962assert(0);2963break;2964}2965}29662967return start_block;2968}29692970/* Due to lookahead, we need to report the first tag executed in the command2971* stream and in branch targets. An initial block might be empty, so iterate2972* until we find one that 'works' */29732974unsigned2975midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx)2976{2977midgard_block *initial_block = mir_get_block(ctx, block_idx);29782979mir_foreach_block_from(ctx, initial_block, _v) {2980midgard_block *v = (midgard_block *) _v;2981if (v->quadword_count) {2982midgard_bundle *initial_bundle =2983util_dynarray_element(&v->bundles, midgard_bundle, 0);29842985return initial_bundle->tag;2986}2987}29882989/* Default to a tag 1 which will break from the shader, in case we jump2990* to the exit block (i.e. 
`return` in a compute shader) */29912992return 1;2993}29942995/* For each fragment writeout instruction, generate a writeout loop to2996* associate with it */29972998static void2999mir_add_writeout_loops(compiler_context *ctx)3000{3001for (unsigned rt = 0; rt < ARRAY_SIZE(ctx->writeout_branch); ++rt) {3002for (unsigned s = 0; s < MIDGARD_MAX_SAMPLE_ITER; ++s) {3003midgard_instruction *br = ctx->writeout_branch[rt][s];3004if (!br) continue;30053006unsigned popped = br->branch.target_block;3007pan_block_add_successor(&(mir_get_block(ctx, popped - 1)->base),3008&ctx->current_block->base);3009br->branch.target_block = emit_fragment_epilogue(ctx, rt, s);3010br->branch.target_type = TARGET_GOTO;30113012/* If we have more RTs, we'll need to restore back after our3013* loop terminates */3014midgard_instruction *next_br = NULL;30153016if ((s + 1) < MIDGARD_MAX_SAMPLE_ITER)3017next_br = ctx->writeout_branch[rt][s + 1];30183019if (!next_br && (rt + 1) < ARRAY_SIZE(ctx->writeout_branch))3020next_br = ctx->writeout_branch[rt + 1][0];30213022if (next_br) {3023midgard_instruction uncond = v_branch(false, false);3024uncond.branch.target_block = popped;3025uncond.branch.target_type = TARGET_GOTO;3026emit_mir_instruction(ctx, uncond);3027pan_block_add_successor(&ctx->current_block->base,3028&(mir_get_block(ctx, popped)->base));3029schedule_barrier(ctx);3030} else {3031/* We're last, so we can terminate here */3032br->last_writeout = true;3033}3034}3035}3036}30373038void3039midgard_compile_shader_nir(nir_shader *nir,3040const struct panfrost_compile_inputs *inputs,3041struct util_dynarray *binary,3042struct pan_shader_info *info)3043{3044midgard_debug = debug_get_option_midgard_debug();30453046/* TODO: Bound against what? */3047compiler_context *ctx = rzalloc(NULL, compiler_context);3048ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx);30493050ctx->inputs = inputs;3051ctx->nir = nir;3052ctx->info = info;3053ctx->stage = nir->info.stage;30543055if (inputs->is_blend) {3056unsigned nr_samples = MAX2(inputs->blend.nr_samples, 1);3057const struct util_format_description *desc =3058util_format_description(inputs->rt_formats[inputs->blend.rt]);30593060/* We have to split writeout in 128 bit chunks */3061ctx->blend_sample_iterations =3062DIV_ROUND_UP(desc->block.bits * nr_samples, 128);3063}3064ctx->blend_input = ~0;3065ctx->blend_src1 = ~0;3066ctx->quirks = midgard_get_quirks(inputs->gpu_id);30673068/* Initialize at a global (not block) level hash tables */30693070ctx->ssa_constants = _mesa_hash_table_u64_create(ctx);30713072/* Lower gl_Position pre-optimisation, but after lowering vars to ssa3073* (so we don't accidentally duplicate the epilogue since mesa/st has3074* messed with our I/O quite a bit already) */30753076NIR_PASS_V(nir, nir_lower_vars_to_ssa);30773078if (ctx->stage == MESA_SHADER_VERTEX) {3079NIR_PASS_V(nir, nir_lower_viewport_transform);3080NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0);3081}30823083NIR_PASS_V(nir, nir_lower_var_copies);3084NIR_PASS_V(nir, nir_lower_vars_to_ssa);3085NIR_PASS_V(nir, nir_split_var_copies);3086NIR_PASS_V(nir, nir_lower_var_copies);3087NIR_PASS_V(nir, nir_lower_global_vars_to_local);3088NIR_PASS_V(nir, nir_lower_var_copies);3089NIR_PASS_V(nir, nir_lower_vars_to_ssa);30903091unsigned pan_quirks = panfrost_get_quirks(inputs->gpu_id, 0);3092NIR_PASS_V(nir, pan_lower_framebuffer,3093inputs->rt_formats, inputs->is_blend, pan_quirks);30943095NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,3096glsl_type_size, 0);3097NIR_PASS_V(nir, 
nir_lower_ssbo);3098NIR_PASS_V(nir, pan_nir_lower_zs_store);30993100NIR_PASS_V(nir, pan_nir_lower_64bit_intrin);31013102/* Optimisation passes */31033104optimise_nir(nir, ctx->quirks, inputs->is_blend);31053106NIR_PASS_V(nir, pan_nir_reorder_writeout);31073108if ((midgard_debug & MIDGARD_DBG_SHADERS) &&3109((midgard_debug & MIDGARD_DBG_INTERNAL) || !nir->info.internal)) {3110nir_print_shader(nir, stdout);3111}31123113info->tls_size = nir->scratch_size;31143115nir_foreach_function(func, nir) {3116if (!func->impl)3117continue;31183119list_inithead(&ctx->blocks);3120ctx->block_count = 0;3121ctx->func = func;3122ctx->already_emitted = calloc(BITSET_WORDS(func->impl->ssa_alloc), sizeof(BITSET_WORD));31233124if (nir->info.outputs_read && !inputs->is_blend) {3125emit_block_init(ctx);31263127struct midgard_instruction wait = v_branch(false, false);3128wait.branch.target_type = TARGET_TILEBUF_WAIT;31293130emit_mir_instruction(ctx, wait);31313132++ctx->instruction_count;3133}31343135emit_cf_list(ctx, &func->impl->body);3136free(ctx->already_emitted);3137break; /* TODO: Multi-function shaders */3138}31393140/* Per-block lowering before opts */31413142mir_foreach_block(ctx, _block) {3143midgard_block *block = (midgard_block *) _block;3144inline_alu_constants(ctx, block);3145embedded_to_inline_constant(ctx, block);3146}3147/* MIR-level optimizations */31483149bool progress = false;31503151do {3152progress = false;3153progress |= midgard_opt_dead_code_eliminate(ctx);31543155mir_foreach_block(ctx, _block) {3156midgard_block *block = (midgard_block *) _block;3157progress |= midgard_opt_copy_prop(ctx, block);3158progress |= midgard_opt_combine_projection(ctx, block);3159progress |= midgard_opt_varying_projection(ctx, block);3160}3161} while (progress);31623163mir_foreach_block(ctx, _block) {3164midgard_block *block = (midgard_block *) _block;3165midgard_lower_derivatives(ctx, block);3166midgard_legalize_invert(ctx, block);3167midgard_cull_dead_branch(ctx, block);3168}31693170if (ctx->stage == MESA_SHADER_FRAGMENT)3171mir_add_writeout_loops(ctx);31723173/* Analyze now that the code is known but before scheduling creates3174* pipeline registers which are harder to track */3175mir_analyze_helper_requirements(ctx);31763177/* Schedule! */3178midgard_schedule_program(ctx);3179mir_ra(ctx);31803181/* Analyze after scheduling since this is order-dependent */3182mir_analyze_helper_terminate(ctx);31833184/* Emit flat binary from the instruction arrays. Iterate each block in3185* sequence. Save instruction boundaries such that lookahead tags can3186* be assigned easily */31873188/* Cache _all_ bundles in source order for lookahead across failed branches */31893190int bundle_count = 0;3191mir_foreach_block(ctx, _block) {3192midgard_block *block = (midgard_block *) _block;3193bundle_count += block->bundles.size / sizeof(midgard_bundle);3194}3195midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count);3196int bundle_idx = 0;3197mir_foreach_block(ctx, _block) {3198midgard_block *block = (midgard_block *) _block;3199util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {3200source_order_bundles[bundle_idx++] = bundle;3201}3202}32033204int current_bundle = 0;32053206/* Midgard prefetches instruction types, so during emission we3207* need to lookahead. Unless this is the last instruction, in3208* which we return 1. 
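(Tag 1 is the same break-out tag used as the
         * default in midgard_get_first_tag_from_block, so execution stops
         * after the final bundle.)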
*/32093210mir_foreach_block(ctx, _block) {3211midgard_block *block = (midgard_block *) _block;3212mir_foreach_bundle_in_block(block, bundle) {3213int lookahead = 1;32143215if (!bundle->last_writeout && (current_bundle + 1 < bundle_count))3216lookahead = source_order_bundles[current_bundle + 1]->tag;32173218emit_binary_bundle(ctx, block, bundle, binary, lookahead);3219++current_bundle;3220}32213222/* TODO: Free deeper */3223//util_dynarray_fini(&block->instructions);3224}32253226free(source_order_bundles);32273228/* Report the very first tag executed */3229info->midgard.first_tag = midgard_get_first_tag_from_block(ctx, 0);32303231info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos);32323233if ((midgard_debug & MIDGARD_DBG_SHADERS) &&3234((midgard_debug & MIDGARD_DBG_INTERNAL) || !nir->info.internal)) {3235disassemble_midgard(stdout, binary->data,3236binary->size, inputs->gpu_id,3237midgard_debug & MIDGARD_DBG_VERBOSE);3238fflush(stdout);3239}32403241/* A shader ending on a 16MB boundary causes INSTR_INVALID_PC faults,3242* workaround by adding some padding to the end of the shader. (The3243* kernel makes sure shader BOs can't cross 16MB boundaries.) */3244if (binary->size)3245memset(util_dynarray_grow(binary, uint8_t, 16), 0, 16);32463247if ((midgard_debug & MIDGARD_DBG_SHADERDB || inputs->shaderdb) &&3248!nir->info.internal) {3249unsigned nr_bundles = 0, nr_ins = 0;32503251/* Count instructions and bundles */32523253mir_foreach_block(ctx, _block) {3254midgard_block *block = (midgard_block *) _block;3255nr_bundles += util_dynarray_num_elements(3256&block->bundles, midgard_bundle);32573258mir_foreach_bundle_in_block(block, bun)3259nr_ins += bun->instruction_count;3260}32613262/* Calculate thread count. There are certain cutoffs by3263* register count for thread count */32643265unsigned nr_registers = info->work_reg_count;32663267unsigned nr_threads =3268(nr_registers <= 4) ? 4 :3269(nr_registers <= 8) ? 2 :32701;32713272/* Dump stats */32733274fprintf(stderr, "%s - %s shader: "3275"%u inst, %u bundles, %u quadwords, "3276"%u registers, %u threads, %u loops, "3277"%u:%u spills:fills\n",3278ctx->nir->info.label ?: "",3279ctx->inputs->is_blend ? "PAN_SHADER_BLEND" :3280gl_shader_stage_name(ctx->stage),3281nr_ins, nr_bundles, ctx->quadword_count,3282nr_registers, nr_threads,3283ctx->loop_count,3284ctx->spills, ctx->fills);3285}32863287ralloc_free(ctx);3288}328932903291