Path: blob/21.2-virgl/src/asahi/compiler/agx_compile.c
4564 views
/*1* Copyright (C) 2021 Alyssa Rosenzweig <[email protected]>2* Copyright (C) 2020 Collabora Ltd.3* Copyright © 2016 Broadcom4*5* Permission is hereby granted, free of charge, to any person obtaining a6* copy of this software and associated documentation files (the "Software"),7* to deal in the Software without restriction, including without limitation8* the rights to use, copy, modify, merge, publish, distribute, sublicense,9* and/or sell copies of the Software, and to permit persons to whom the10* Software is furnished to do so, subject to the following conditions:11*12* The above copyright notice and this permission notice (including the next13* paragraph) shall be included in all copies or substantial portions of the14* Software.15*16* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR17* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,18* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL19* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER20* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,21* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE22* SOFTWARE.23*/2425#include "main/mtypes.h"26#include "compiler/nir_types.h"27#include "compiler/nir/nir_builder.h"28#include "util/u_debug.h"29#include "agx_compile.h"30#include "agx_compiler.h"31#include "agx_builder.h"3233static const struct debug_named_value agx_debug_options[] = {34{"msgs", AGX_DBG_MSGS, "Print debug messages"},35{"shaders", AGX_DBG_SHADERS, "Dump shaders in NIR and AIR"},36{"shaderdb", AGX_DBG_SHADERDB, "Print statistics"},37{"verbose", AGX_DBG_VERBOSE, "Disassemble verbosely"},38{"internal", AGX_DBG_INTERNAL, "Dump even internal shaders"},39DEBUG_NAMED_VALUE_END40};4142DEBUG_GET_ONCE_FLAGS_OPTION(agx_debug, "AGX_MESA_DEBUG", agx_debug_options, 0)4344int agx_debug = 0;4546#define DBG(fmt, ...) \47do { if (agx_debug & AGX_DBG_MSGS) \48fprintf(stderr, "%s:%d: "fmt, \49__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)5051static void52agx_block_add_successor(agx_block *block, agx_block *successor)53{54assert(block != NULL && successor != NULL);5556/* Cull impossible edges */57if (block->unconditional_jumps)58return;5960for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {61if (block->successors[i]) {62if (block->successors[i] == successor)63return;64else65continue;66}6768block->successors[i] = successor;69_mesa_set_add(successor->predecessors, block);70return;71}7273unreachable("Too many successors");74}7576static void77agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr)78{79/* Ensure we've been scalarized and bit size lowered */80unsigned bit_size = instr->def.bit_size;81assert(instr->def.num_components == 1);82assert(bit_size == 1 || bit_size == 16 || bit_size == 32);8384/* Emit move, later passes can inline/push if useful */85agx_mov_imm_to(b,86agx_get_index(instr->def.index, agx_size_for_bits(bit_size)),87nir_const_value_as_uint(instr->value[0], bit_size));88}8990/* AGX appears to lack support for vertex attributes. Lower to global loads. */91static agx_instr *92agx_emit_load_attr(agx_builder *b, nir_intrinsic_instr *instr)93{94nir_src *offset_src = nir_get_io_offset_src(instr);95assert(nir_src_is_const(*offset_src) && "no attribute indirects");96unsigned index = nir_intrinsic_base(instr) +97nir_src_as_uint(*offset_src);9899struct agx_shader_key *key = b->shader->key;100struct agx_attribute attrib = key->vs.attributes[index];101102/* address = base + (stride * vertex_id) + src_offset */103unsigned buf = attrib.buf;104agx_index stride = agx_mov_imm(b, 32, key->vs.vbuf_strides[buf]);105agx_index src_offset = agx_mov_imm(b, 32, attrib.src_offset);106agx_index vertex_id = agx_register(10, AGX_SIZE_32); // TODO: RA107agx_index offset = agx_imad(b, vertex_id, stride, src_offset, 0);108109/* Each VBO has a 64-bit = 4 x 16-bit address, lookup the base address as a sysval */110unsigned num_vbos = key->vs.num_vbufs;111unsigned base_length = (num_vbos * 4);112agx_index base = agx_indexed_sysval(b->shader,113AGX_PUSH_VBO_BASES, AGX_SIZE_64, buf * 4, base_length);114115/* Load the data */116assert(instr->num_components <= 4);117118bool pad = ((attrib.nr_comps_minus_1 + 1) < instr->num_components);119agx_index real_dest = agx_dest_index(&instr->dest);120agx_index dest = pad ? agx_temp(b->shader, AGX_SIZE_32) : real_dest;121122agx_device_load_to(b, dest, base, offset, attrib.format,123BITFIELD_MASK(attrib.nr_comps_minus_1 + 1), 0);124125agx_wait(b, 0);126127if (pad) {128agx_index one = agx_mov_imm(b, 32, fui(1.0));129agx_index zero = agx_mov_imm(b, 32, 0);130agx_index channels[4] = { zero, zero, zero, one };131for (unsigned i = 0; i < (attrib.nr_comps_minus_1 + 1); ++i)132channels[i] = agx_p_extract(b, dest, i);133for (unsigned i = instr->num_components; i < 4; ++i)134channels[i] = agx_null();135agx_p_combine_to(b, real_dest, channels[0], channels[1], channels[2], channels[3]);136}137138return NULL;139}140141static agx_instr *142agx_emit_load_vary_flat(agx_builder *b, nir_intrinsic_instr *instr)143{144unsigned components = instr->num_components;145assert(components >= 1 && components <= 4);146147nir_src *offset = nir_get_io_offset_src(instr);148assert(nir_src_is_const(*offset) && "no indirects");149unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];150imm_index += nir_src_as_uint(*offset);151152agx_index chan[4] = { agx_null() };153154for (unsigned i = 0; i < components; ++i) {155/* vec3 for each vertex, unknown what first 2 channels are for */156agx_index values = agx_ld_vary_flat(b, agx_immediate(imm_index + i), 1);157chan[i] = agx_p_extract(b, values, 2);158}159160return agx_p_combine_to(b, agx_dest_index(&instr->dest),161chan[0], chan[1], chan[2], chan[3]);162}163164static agx_instr *165agx_emit_load_vary(agx_builder *b, nir_intrinsic_instr *instr)166{167ASSERTED unsigned components = instr->num_components;168ASSERTED nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);169170assert(components >= 1 && components <= 4);171assert(parent);172173/* TODO: Interpolation modes */174assert(parent->intrinsic == nir_intrinsic_load_barycentric_pixel);175176nir_src *offset = nir_get_io_offset_src(instr);177assert(nir_src_is_const(*offset) && "no indirects");178unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];179imm_index += nir_src_as_uint(*offset) * 4;180181return agx_ld_vary_to(b, agx_dest_index(&instr->dest),182agx_immediate(imm_index), components, true);183}184185static agx_instr *186agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr)187{188nir_src *offset = nir_get_io_offset_src(instr);189assert(nir_src_is_const(*offset) && "todo: indirects");190unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];191imm_index += nir_intrinsic_component(instr);192imm_index += nir_src_as_uint(*offset);193194/* nir_lower_io_to_scalar */195assert(nir_intrinsic_write_mask(instr) == 0x1);196197return agx_st_vary(b,198agx_immediate(imm_index),199agx_src_index(&instr->src[0]));200}201202static agx_instr *203agx_emit_fragment_out(agx_builder *b, nir_intrinsic_instr *instr)204{205const nir_variable *var =206nir_find_variable_with_driver_location(b->shader->nir,207nir_var_shader_out, nir_intrinsic_base(instr));208assert(var);209210unsigned loc = var->data.location;211assert(var->data.index == 0 && "todo: dual-source blending");212assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");213unsigned rt = (loc - FRAG_RESULT_DATA0);214215/* TODO: Reverse-engineer interactions with MRT */216if (b->shader->nir->info.internal) {217/* clear */218} else if (b->shader->did_writeout) {219agx_writeout(b, 0x0004);220} else {221agx_writeout(b, 0xC200);222agx_writeout(b, 0x000C);223}224225b->shader->did_writeout = true;226return agx_st_tile(b, agx_src_index(&instr->src[0]),227b->shader->key->fs.tib_formats[rt]);228}229230static agx_instr *231agx_emit_load_tile(agx_builder *b, nir_intrinsic_instr *instr)232{233const nir_variable *var =234nir_find_variable_with_driver_location(b->shader->nir,235nir_var_shader_out, nir_intrinsic_base(instr));236assert(var);237238unsigned loc = var->data.location;239assert(var->data.index == 0 && "todo: dual-source blending");240assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");241unsigned rt = (loc - FRAG_RESULT_DATA0);242243/* TODO: Reverse-engineer interactions with MRT */244agx_writeout(b, 0xC200);245agx_writeout(b, 0x0008);246b->shader->did_writeout = true;247b->shader->out->reads_tib = true;248249return agx_ld_tile_to(b, agx_dest_index(&instr->dest),250b->shader->key->fs.tib_formats[rt]);251}252253static enum agx_format254agx_format_for_bits(unsigned bits)255{256switch (bits) {257case 8: return AGX_FORMAT_I8;258case 16: return AGX_FORMAT_I16;259case 32: return AGX_FORMAT_I32;260default: unreachable("Invalid bit size for load/store");261}262}263264static agx_instr *265agx_emit_load_ubo(agx_builder *b, nir_intrinsic_instr *instr)266{267bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);268nir_src *offset = nir_get_io_offset_src(instr);269270if (!kernel_input && !nir_src_is_const(instr->src[0]))271unreachable("todo: indirect UBO access");272273/* Constant offsets for device_load are 16-bit */274bool offset_is_const = nir_src_is_const(*offset);275assert(offset_is_const && "todo: indirect UBO access");276int32_t const_offset = offset_is_const ? nir_src_as_int(*offset) : 0;277278/* Offsets are shifted by the type size, so divide that out */279unsigned bytes = nir_dest_bit_size(instr->dest) / 8;280assert((const_offset & (bytes - 1)) == 0);281const_offset = const_offset / bytes;282int16_t const_as_16 = const_offset;283284/* UBO blocks are specified (kernel inputs are always 0) */285uint32_t block = kernel_input ? 0 : nir_src_as_uint(instr->src[0]);286287/* Each UBO has a 64-bit = 4 x 16-bit address */288unsigned num_ubos = b->shader->nir->info.num_ubos;289unsigned base_length = (num_ubos * 4);290unsigned index = block * 4; /* 16 bit units */291292/* Lookup the base address (TODO: indirection) */293agx_index base = agx_indexed_sysval(b->shader,294AGX_PUSH_UBO_BASES, AGX_SIZE_64,295index, base_length);296297/* Load the data */298assert(instr->num_components <= 4);299300agx_device_load_to(b, agx_dest_index(&instr->dest),301base,302(offset_is_const && (const_offset == const_as_16)) ?303agx_immediate(const_as_16) : agx_mov_imm(b, 32, const_offset),304agx_format_for_bits(nir_dest_bit_size(instr->dest)),305BITFIELD_MASK(instr->num_components), 0);306307return agx_wait(b, 0);308}309310static agx_instr *311agx_emit_load_frag_coord(agx_builder *b, nir_intrinsic_instr *instr)312{313agx_index xy[2];314315for (unsigned i = 0; i < 2; ++i) {316xy[i] = agx_fadd(b, agx_convert(b, agx_immediate(AGX_CONVERT_U32_TO_F),317agx_get_sr(b, 32, AGX_SR_THREAD_POSITION_IN_GRID_X + i),318AGX_ROUND_RTE), agx_immediate_f(0.5f));319}320321/* Ordering by the ABI */322agx_index z = agx_ld_vary(b, agx_immediate(1), 1, false);323agx_index w = agx_ld_vary(b, agx_immediate(0), 1, false);324325return agx_p_combine_to(b, agx_dest_index(&instr->dest),326xy[0], xy[1], z, w);327}328329static agx_instr *330agx_blend_const(agx_builder *b, agx_index dst, unsigned comp)331{332agx_index val = agx_indexed_sysval(b->shader,333AGX_PUSH_BLEND_CONST, AGX_SIZE_32, comp * 2, 4 * 2);334335return agx_mov_to(b, dst, val);336}337338static agx_instr *339agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)340{341agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?342agx_dest_index(&instr->dest) : agx_null();343gl_shader_stage stage = b->shader->stage;344345switch (instr->intrinsic) {346case nir_intrinsic_load_barycentric_pixel:347case nir_intrinsic_load_barycentric_centroid:348case nir_intrinsic_load_barycentric_sample:349case nir_intrinsic_load_barycentric_at_sample:350case nir_intrinsic_load_barycentric_at_offset:351/* handled later via load_vary */352return NULL;353case nir_intrinsic_load_interpolated_input:354assert(stage == MESA_SHADER_FRAGMENT);355return agx_emit_load_vary(b, instr);356357case nir_intrinsic_load_input:358if (stage == MESA_SHADER_FRAGMENT)359return agx_emit_load_vary_flat(b, instr);360else if (stage == MESA_SHADER_VERTEX)361return agx_emit_load_attr(b, instr);362else363unreachable("Unsupported shader stage");364365case nir_intrinsic_store_output:366if (stage == MESA_SHADER_FRAGMENT)367return agx_emit_fragment_out(b, instr);368else if (stage == MESA_SHADER_VERTEX)369return agx_emit_store_vary(b, instr);370else371unreachable("Unsupported shader stage");372373case nir_intrinsic_load_output:374assert(stage == MESA_SHADER_FRAGMENT);375return agx_emit_load_tile(b, instr);376377case nir_intrinsic_load_ubo:378case nir_intrinsic_load_kernel_input:379return agx_emit_load_ubo(b, instr);380381case nir_intrinsic_load_frag_coord:382return agx_emit_load_frag_coord(b, instr);383384case nir_intrinsic_load_back_face_agx:385return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);386387case nir_intrinsic_load_vertex_id:388return agx_mov_to(b, dst, agx_abs(agx_register(10, AGX_SIZE_32))); /* TODO: RA */389390case nir_intrinsic_load_blend_const_color_r_float: return agx_blend_const(b, dst, 0);391case nir_intrinsic_load_blend_const_color_g_float: return agx_blend_const(b, dst, 1);392case nir_intrinsic_load_blend_const_color_b_float: return agx_blend_const(b, dst, 2);393case nir_intrinsic_load_blend_const_color_a_float: return agx_blend_const(b, dst, 3);394395default:396fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);397unreachable("Unhandled intrinsic");398}399}400401static agx_index402agx_alu_src_index(agx_builder *b, nir_alu_src src)403{404/* Check well-formedness of the input NIR */405ASSERTED unsigned bitsize = nir_src_bit_size(src.src);406unsigned comps = nir_src_num_components(src.src);407unsigned channel = src.swizzle[0];408409assert(bitsize == 1 || bitsize == 16 || bitsize == 32 || bitsize == 64);410assert(!(src.negate || src.abs));411assert(channel < comps);412413agx_index idx = agx_src_index(&src.src);414415/* We only deal with scalars, emit p_extract if needed */416if (comps > 1)417return agx_p_extract(b, idx, channel);418else419return idx;420}421422static agx_instr *423agx_emit_alu_bool(agx_builder *b, nir_op op,424agx_index dst, agx_index s0, agx_index s1, agx_index s2)425{426/* Handle 1-bit bools as zero/nonzero rather than specifically 0/1 or 0/~0.427* This will give the optimizer flexibility. */428agx_index f = agx_immediate(0);429agx_index t = agx_immediate(0x1);430431switch (op) {432case nir_op_feq: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_EQ);433case nir_op_flt: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_LT);434case nir_op_fge: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_GE);435case nir_op_fneu: return agx_fcmpsel_to(b, dst, s0, s1, f, t, AGX_FCOND_EQ);436437case nir_op_ieq: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_UEQ);438case nir_op_ine: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_UEQ);439case nir_op_ilt: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_SLT);440case nir_op_ige: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_SLT);441case nir_op_ult: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_ULT);442case nir_op_uge: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_ULT);443444case nir_op_mov: return agx_mov_to(b, dst, s0);445case nir_op_iand: return agx_and_to(b, dst, s0, s1);446case nir_op_ior: return agx_or_to(b, dst, s0, s1);447case nir_op_ixor: return agx_xor_to(b, dst, s0, s1);448case nir_op_inot: return agx_xor_to(b, dst, s0, t);449450case nir_op_f2b1: return agx_fcmpsel_to(b, dst, s0, f, f, t, AGX_FCOND_EQ);451case nir_op_i2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);452case nir_op_b2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);453454case nir_op_bcsel:455return agx_icmpsel_to(b, dst, s0, f, s2, s1, AGX_ICOND_UEQ);456457default:458fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);459unreachable("Unhandled boolean ALU instruction");460}461}462463static agx_instr *464agx_emit_alu(agx_builder *b, nir_alu_instr *instr)465{466unsigned srcs = nir_op_infos[instr->op].num_inputs;467unsigned sz = nir_dest_bit_size(instr->dest.dest);468unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;469ASSERTED unsigned comps = nir_dest_num_components(instr->dest.dest);470471assert(comps == 1 || nir_op_is_vec(instr->op));472assert(sz == 1 || sz == 16 || sz == 32 || sz == 64);473474agx_index dst = agx_dest_index(&instr->dest.dest);475agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();476agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();477agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();478agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();479480/* 1-bit bools are a bit special, only handle with select ops */481if (sz == 1)482return agx_emit_alu_bool(b, instr->op, dst, s0, s1, s2);483484#define UNOP(nop, aop) \485case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0);486#define BINOP(nop, aop) \487case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1);488#define TRIOP(nop, aop) \489case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1, s2);490491switch (instr->op) {492BINOP(fadd, fadd);493BINOP(fmul, fmul);494TRIOP(ffma, fma);495496UNOP(f2f16, fmov);497UNOP(f2f32, fmov);498UNOP(fround_even, roundeven);499UNOP(ftrunc, trunc);500UNOP(ffloor, floor);501UNOP(fceil, ceil);502UNOP(frcp, rcp);503UNOP(frsq, rsqrt);504UNOP(flog2, log2);505UNOP(fexp2, exp2);506507UNOP(fddx, dfdx);508UNOP(fddx_coarse, dfdx);509UNOP(fddx_fine, dfdx);510511UNOP(fddy, dfdy);512UNOP(fddy_coarse, dfdy);513UNOP(fddy_fine, dfdy);514515UNOP(mov, mov);516UNOP(u2u16, mov);517UNOP(u2u32, mov);518UNOP(inot, not);519BINOP(iand, and);520BINOP(ior, or);521BINOP(ixor, xor);522523case nir_op_fsqrt: return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));524case nir_op_fsub: return agx_fadd_to(b, dst, s0, agx_neg(s1));525case nir_op_fabs: return agx_fmov_to(b, dst, agx_abs(s0));526case nir_op_fneg: return agx_fmov_to(b, dst, agx_neg(s0));527528case nir_op_fmin: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_LTN);529case nir_op_fmax: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_GTN);530case nir_op_imin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);531case nir_op_imax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);532case nir_op_umin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);533case nir_op_umax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);534535case nir_op_iadd: return agx_iadd_to(b, dst, s0, s1, 0);536case nir_op_isub: return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);537case nir_op_ineg: return agx_iadd_to(b, dst, agx_zero(), agx_neg(s0), 0);538case nir_op_imul: return agx_imad_to(b, dst, s0, s1, agx_zero(), 0);539540case nir_op_ishl: return agx_bfi_to(b, dst, agx_zero(), s0, s1, 0);541case nir_op_ushr: return agx_bfeil_to(b, dst, agx_zero(), s0, s1, 0);542case nir_op_ishr: return agx_asr_to(b, dst, s0, s1);543544case nir_op_bcsel:545return agx_icmpsel_to(b, dst, s0, agx_zero(), s2, s1, AGX_ICOND_UEQ);546547case nir_op_b2i32:548case nir_op_b2i16:549return agx_icmpsel_to(b, dst, s0, agx_zero(), agx_zero(), agx_immediate(1), AGX_ICOND_UEQ);550551case nir_op_b2f16:552case nir_op_b2f32:553{554/* At this point, boolean is just zero/nonzero, so compare with zero */555agx_index one = (sz == 16) ?556agx_mov_imm(b, 16, _mesa_float_to_half(1.0)) :557agx_mov_imm(b, 32, fui(1.0));558559agx_index zero = agx_zero();560561return agx_fcmpsel_to(b, dst, s0, zero, zero, one, AGX_FCOND_EQ);562}563564case nir_op_i2i32:565{566if (s0.size != AGX_SIZE_16)567unreachable("todo: more conversions");568569return agx_iadd_to(b, dst, s0, agx_zero(), 0);570}571572case nir_op_i2i16:573{574if (s0.size != AGX_SIZE_32)575unreachable("todo: more conversions");576577return agx_iadd_to(b, dst, s0, agx_zero(), 0);578}579580case nir_op_iadd_sat:581{582agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);583I->saturate = true;584return I;585}586587case nir_op_isub_sat:588{589agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);590I->saturate = true;591return I;592}593594case nir_op_uadd_sat:595{596agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);597I->saturate = true;598return I;599}600601case nir_op_usub_sat:602{603agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);604I->saturate = true;605return I;606}607608case nir_op_fsat:609{610agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());611I->saturate = true;612return I;613}614615case nir_op_fsin_agx:616{617agx_index fixup = agx_sin_pt_1(b, s0);618agx_index sinc = agx_sin_pt_2(b, fixup);619return agx_fmul_to(b, dst, sinc, fixup);620}621622case nir_op_f2i16:623return agx_convert_to(b, dst,624agx_immediate(AGX_CONVERT_F_TO_S16), s0, AGX_ROUND_RTZ);625626case nir_op_f2i32:627return agx_convert_to(b, dst,628agx_immediate(AGX_CONVERT_F_TO_S32), s0, AGX_ROUND_RTZ);629630case nir_op_f2u16:631return agx_convert_to(b, dst,632agx_immediate(AGX_CONVERT_F_TO_U16), s0, AGX_ROUND_RTZ);633634case nir_op_f2u32:635return agx_convert_to(b, dst,636agx_immediate(AGX_CONVERT_F_TO_U32), s0, AGX_ROUND_RTZ);637638case nir_op_u2f16:639case nir_op_u2f32:640{641if (src_sz == 64)642unreachable("64-bit conversions unimplemented");643644enum agx_convert mode =645(src_sz == 32) ? AGX_CONVERT_U32_TO_F :646(src_sz == 16) ? AGX_CONVERT_U16_TO_F :647AGX_CONVERT_U8_TO_F;648649return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);650}651652case nir_op_i2f16:653case nir_op_i2f32:654{655if (src_sz == 64)656unreachable("64-bit conversions unimplemented");657658enum agx_convert mode =659(src_sz == 32) ? AGX_CONVERT_S32_TO_F :660(src_sz == 16) ? AGX_CONVERT_S16_TO_F :661AGX_CONVERT_S8_TO_F;662663return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);664}665666case nir_op_vec2:667case nir_op_vec3:668case nir_op_vec4:669return agx_p_combine_to(b, dst, s0, s1, s2, s3);670671case nir_op_vec8:672case nir_op_vec16:673unreachable("should've been lowered");674675default:676fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);677unreachable("Unhandled ALU instruction");678}679}680681static enum agx_dim682agx_tex_dim(enum glsl_sampler_dim dim, bool array)683{684switch (dim) {685case GLSL_SAMPLER_DIM_1D:686case GLSL_SAMPLER_DIM_BUF:687return array ? AGX_DIM_TEX_1D_ARRAY : AGX_DIM_TEX_1D;688689case GLSL_SAMPLER_DIM_2D:690case GLSL_SAMPLER_DIM_RECT:691case GLSL_SAMPLER_DIM_EXTERNAL:692return array ? AGX_DIM_TEX_2D_ARRAY : AGX_DIM_TEX_2D;693694case GLSL_SAMPLER_DIM_MS:695assert(!array && "multisampled arrays unsupported");696return AGX_DIM_TEX_2D_MS;697698case GLSL_SAMPLER_DIM_3D:699assert(!array && "3D arrays unsupported");700return AGX_DIM_TEX_3D;701702case GLSL_SAMPLER_DIM_CUBE:703return array ? AGX_DIM_TEX_CUBE_ARRAY : AGX_DIM_TEX_CUBE;704705default:706unreachable("Invalid sampler dim\n");707}708}709710static void711agx_emit_tex(agx_builder *b, nir_tex_instr *instr)712{713switch (instr->op) {714case nir_texop_tex:715case nir_texop_txl:716break;717default:718unreachable("Unhandled texture op");719}720721enum agx_lod_mode lod_mode = (instr->op == nir_texop_tex) ?722AGX_LOD_MODE_AUTO_LOD : AGX_LOD_MODE_LOD_MIN;723724agx_index coords = agx_null(),725texture = agx_immediate(instr->texture_index),726sampler = agx_immediate(instr->sampler_index),727lod = agx_immediate(0),728offset = agx_null();729730for (unsigned i = 0; i < instr->num_srcs; ++i) {731agx_index index = agx_src_index(&instr->src[i].src);732733switch (instr->src[i].src_type) {734case nir_tex_src_coord:735coords = index;736break;737738case nir_tex_src_lod:739lod = index;740break;741742case nir_tex_src_bias:743case nir_tex_src_ms_index:744case nir_tex_src_offset:745case nir_tex_src_comparator:746case nir_tex_src_texture_offset:747case nir_tex_src_sampler_offset:748default:749unreachable("todo");750}751}752753agx_texture_sample_to(b, agx_dest_index(&instr->dest),754coords, lod, texture, sampler, offset,755agx_tex_dim(instr->sampler_dim, instr->is_array),756lod_mode,7570xF, /* TODO: wrmask */7580);759760agx_wait(b, 0);761}762763/* NIR loops are treated as a pair of AGX loops:764*765* do {766* do {767* ...768* } while (0);769* } while (cond);770*771* By manipulating the nesting counter (r0l), we may break out of nested loops,772* so under the model, both break and continue may be implemented as breaks,773* where break breaks out of the outer loop (2 layers) and continue breaks out774* of the inner loop (1 layer).775*776* After manipulating the nesting counter directly, pop_exec #0 must be used to777* flush the update to the execution mask.778*/779780static void781agx_emit_jump(agx_builder *b, nir_jump_instr *instr)782{783agx_context *ctx = b->shader;784assert (instr->type == nir_jump_break || instr->type == nir_jump_continue);785786/* Break out of either one or two loops */787unsigned nestings = b->shader->loop_nesting;788789if (instr->type == nir_jump_continue) {790nestings += 1;791agx_block_add_successor(ctx->current_block, ctx->continue_block);792} else if (instr->type == nir_jump_break) {793nestings += 2;794agx_block_add_successor(ctx->current_block, ctx->break_block);795}796797/* Update the counter and flush */798agx_index r0l = agx_register(0, false);799agx_mov_to(b, r0l, agx_immediate(nestings));800agx_pop_exec(b, 0);801802ctx->current_block->unconditional_jumps = true;803}804805static void806agx_emit_instr(agx_builder *b, struct nir_instr *instr)807{808switch (instr->type) {809case nir_instr_type_load_const:810agx_emit_load_const(b, nir_instr_as_load_const(instr));811break;812813case nir_instr_type_intrinsic:814agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr));815break;816817case nir_instr_type_alu:818agx_emit_alu(b, nir_instr_as_alu(instr));819break;820821case nir_instr_type_tex:822agx_emit_tex(b, nir_instr_as_tex(instr));823break;824825case nir_instr_type_jump:826agx_emit_jump(b, nir_instr_as_jump(instr));827break;828829default:830unreachable("should've been lowered");831}832}833834static agx_block *835agx_create_block(agx_context *ctx)836{837agx_block *blk = rzalloc(ctx, agx_block);838839blk->predecessors = _mesa_set_create(blk,840_mesa_hash_pointer, _mesa_key_pointer_equal);841842return blk;843}844845static agx_block *846emit_block(agx_context *ctx, nir_block *block)847{848if (ctx->after_block) {849ctx->current_block = ctx->after_block;850ctx->after_block = NULL;851} else {852ctx->current_block = agx_create_block(ctx);853}854855agx_block *blk = ctx->current_block;856list_addtail(&blk->link, &ctx->blocks);857list_inithead(&blk->instructions);858859agx_builder _b = agx_init_builder(ctx, agx_after_block(blk));860861nir_foreach_instr(instr, block) {862agx_emit_instr(&_b, instr);863}864865return blk;866}867868static agx_block *869emit_cf_list(agx_context *ctx, struct exec_list *list);870871/* Emit if-else as872*873* if_icmp cond != 0874* ...875* else_icmp cond == 0876* ...877* pop_exec878*879* If the else is empty, we can omit the else_icmp. This is not usually880* optimal, but it's a start.881*/882883static void884emit_if(agx_context *ctx, nir_if *nif)885{886nir_block *nir_else_block = nir_if_first_else_block(nif);887bool empty_else_block =888(nir_else_block == nir_if_last_else_block(nif) &&889exec_list_is_empty(&nir_else_block->instr_list));890891agx_block *first_block = ctx->current_block;892agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block));893agx_index cond = agx_src_index(&nif->condition);894895agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true);896ctx->loop_nesting++;897898/* Emit the two subblocks. */899agx_block *if_block = emit_cf_list(ctx, &nif->then_list);900agx_block *end_then = ctx->current_block;901902if (!empty_else_block) {903_b.cursor = agx_after_block(ctx->current_block);904agx_else_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, false);905}906907agx_block *else_block = emit_cf_list(ctx, &nif->else_list);908agx_block *end_else = ctx->current_block;909910ctx->after_block = agx_create_block(ctx);911912agx_block_add_successor(first_block, if_block);913agx_block_add_successor(first_block, else_block);914agx_block_add_successor(end_then, ctx->after_block);915agx_block_add_successor(end_else, ctx->after_block);916917_b.cursor = agx_after_block(ctx->current_block);918agx_pop_exec(&_b, 1);919ctx->loop_nesting--;920}921922static void923emit_loop(agx_context *ctx, nir_loop *nloop)924{925/* We only track nesting within the innermost loop, so reset */926ctx->loop_nesting = 0;927928agx_block *popped_break = ctx->break_block;929agx_block *popped_continue = ctx->continue_block;930931ctx->break_block = agx_create_block(ctx);932ctx->continue_block = agx_create_block(ctx);933934/* Make room for break/continue nesting (TODO: skip if no divergent CF) */935agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));936agx_push_exec(&_b, 2);937938/* Fallthrough to body */939agx_block_add_successor(ctx->current_block, ctx->continue_block);940941/* Emit the body */942ctx->after_block = ctx->continue_block;943agx_block *start_block = emit_cf_list(ctx, &nloop->body);944945/* Fix up the nesting counter via an always true while_icmp, and branch back946* to start of loop if any lanes are active */947_b.cursor = agx_after_block(ctx->current_block);948agx_while_icmp(&_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false);949agx_jmp_exec_any(&_b, start_block);950agx_pop_exec(&_b, 2);951agx_block_add_successor(ctx->current_block, ctx->continue_block);952953/* Pop off */954ctx->after_block = ctx->break_block;955ctx->break_block = popped_break;956ctx->continue_block = popped_continue;957958/* Update shader-db stats */959++ctx->loop_count;960961/* All nested control flow must have finished */962assert(ctx->loop_nesting == 0);963}964965/* Before the first control flow structure, the nesting counter (r0l) needs to966* be zeroed for correct operation. This only happens at most once, since by967* definition this occurs at the end of the first block, which dominates the968* rest of the program. */969970static void971emit_first_cf(agx_context *ctx)972{973if (ctx->any_cf)974return;975976agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));977agx_index r0l = agx_register(0, false);978979agx_mov_to(&_b, r0l, agx_immediate(0));980ctx->any_cf = true;981}982983static agx_block *984emit_cf_list(agx_context *ctx, struct exec_list *list)985{986agx_block *start_block = NULL;987988foreach_list_typed(nir_cf_node, node, node, list) {989switch (node->type) {990case nir_cf_node_block: {991agx_block *block = emit_block(ctx, nir_cf_node_as_block(node));992993if (!start_block)994start_block = block;995996break;997}998999case nir_cf_node_if:1000emit_first_cf(ctx);1001emit_if(ctx, nir_cf_node_as_if(node));1002break;10031004case nir_cf_node_loop:1005emit_first_cf(ctx);1006emit_loop(ctx, nir_cf_node_as_loop(node));1007break;10081009default:1010unreachable("Unknown control flow");1011}1012}10131014return start_block;1015}10161017static void1018agx_set_st_vary_final(agx_context *ctx)1019{1020agx_foreach_instr_global_rev(ctx, I) {1021if (I->op == AGX_OPCODE_ST_VARY) {1022I->last = true;1023return;1024}1025}1026}10271028static void1029agx_print_stats(agx_context *ctx, unsigned size, FILE *fp)1030{1031unsigned nr_ins = 0, nr_bytes = 0, nr_threads = 1;10321033/* TODO */1034fprintf(stderr, "%s shader: %u inst, %u bytes, %u threads, %u loops,"1035"%u:%u spills:fills\n",1036ctx->nir->info.label ?: "",1037nr_ins, nr_bytes, nr_threads, ctx->loop_count,1038ctx->spills, ctx->fills);1039}10401041static int1042glsl_type_size(const struct glsl_type *type, bool bindless)1043{1044return glsl_count_attribute_slots(type, false);1045}10461047static bool1048agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_)1049{1050if (instr->type != nir_instr_type_alu)1051return false;10521053nir_alu_instr *alu = nir_instr_as_alu(instr);1054return alu->op == nir_op_fsin || alu->op == nir_op_fcos;1055}10561057/* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for1058* heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in1059* turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset1060* fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode1061* fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just1062* need to change units from radians to quadrants modulo turns. Cosine is1063* implemented by shifting by one quadrant: cos(x) = sin(x + tau/4).1064*/10651066static nir_ssa_def *1067agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_)1068{1069nir_alu_instr *alu = nir_instr_as_alu(instr);1070nir_ssa_def *x = nir_mov_alu(b, alu->src[0], 1);1071nir_ssa_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f);10721073if (alu->op == nir_op_fcos)1074turns = nir_fadd_imm(b, turns, 0.25f);10751076nir_ssa_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0);1077return nir_fsin_agx(b, quadrants);1078}10791080static bool1081agx_lower_sincos(nir_shader *shader)1082{1083return nir_shader_lower_instructions(shader,1084agx_lower_sincos_filter, agx_lower_sincos_impl, NULL);1085}10861087static bool1088agx_lower_front_face(struct nir_builder *b,1089nir_instr *instr, UNUSED void *data)1090{1091if (instr->type != nir_instr_type_intrinsic)1092return false;10931094nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);1095if (intr->intrinsic != nir_intrinsic_load_front_face)1096return false;10971098assert(intr->dest.is_ssa);1099nir_ssa_def *def = &intr->dest.ssa;1100assert(def->bit_size == 1);11011102b->cursor = nir_before_instr(&intr->instr);1103nir_ssa_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1)));1104return true;1105}11061107static bool1108agx_lower_point_coord(struct nir_builder *b,1109nir_instr *instr, UNUSED void *data)1110{1111if (instr->type != nir_instr_type_intrinsic)1112return false;11131114nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);11151116if (intr->intrinsic != nir_intrinsic_load_deref)1117return false;11181119nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);1120nir_variable *var = nir_deref_instr_get_variable(deref);11211122if (var->data.mode != nir_var_shader_in)1123return false;11241125if (var->data.location != VARYING_SLOT_PNTC)1126return false;11271128assert(intr->dest.is_ssa);1129assert(intr->dest.ssa.num_components == 2);11301131b->cursor = nir_after_instr(&intr->instr);1132nir_ssa_def *def = nir_load_deref(b, deref);1133nir_ssa_def *y = nir_channel(b, def, 1);1134nir_ssa_def *flipped_y = nir_fadd_imm(b, nir_fneg(b, y), 1.0);1135nir_ssa_def *flipped = nir_vec2(b, nir_channel(b, def, 0), flipped_y);1136nir_ssa_def_rewrite_uses(&intr->dest.ssa, flipped);1137return true;1138}11391140static void1141agx_optimize_nir(nir_shader *nir)1142{1143bool progress;11441145nir_lower_idiv_options idiv_options = {1146.imprecise_32bit_lowering = true,1147.allow_fp16 = true,1148};11491150NIR_PASS_V(nir, nir_lower_regs_to_ssa);1151NIR_PASS_V(nir, nir_lower_int64);1152NIR_PASS_V(nir, nir_lower_idiv, &idiv_options);1153NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);1154NIR_PASS_V(nir, nir_lower_load_const_to_scalar);1155NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false);1156NIR_PASS_V(nir, agx_lower_sincos);1157NIR_PASS_V(nir, nir_shader_instructions_pass,1158agx_lower_front_face,1159nir_metadata_block_index | nir_metadata_dominance, NULL);11601161do {1162progress = false;11631164NIR_PASS(progress, nir, nir_lower_var_copies);1165NIR_PASS(progress, nir, nir_lower_vars_to_ssa);11661167NIR_PASS(progress, nir, nir_copy_prop);1168NIR_PASS(progress, nir, nir_opt_remove_phis);1169NIR_PASS(progress, nir, nir_opt_dce);1170NIR_PASS(progress, nir, nir_opt_dead_cf);1171NIR_PASS(progress, nir, nir_opt_cse);1172NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);1173NIR_PASS(progress, nir, nir_opt_algebraic);1174NIR_PASS(progress, nir, nir_opt_constant_folding);11751176NIR_PASS(progress, nir, nir_opt_undef);1177NIR_PASS(progress, nir, nir_lower_undef_to_zero);11781179NIR_PASS(progress, nir, nir_opt_loop_unroll,1180nir_var_shader_in |1181nir_var_shader_out |1182nir_var_function_temp);1183} while (progress);11841185NIR_PASS_V(nir, nir_opt_algebraic_late);1186NIR_PASS_V(nir, nir_opt_constant_folding);1187NIR_PASS_V(nir, nir_copy_prop);1188NIR_PASS_V(nir, nir_opt_dce);1189NIR_PASS_V(nir, nir_opt_cse);1190NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);1191NIR_PASS_V(nir, nir_lower_load_const_to_scalar);11921193/* Cleanup optimizations */1194nir_move_options move_all =1195nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |1196nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;11971198NIR_PASS_V(nir, nir_opt_sink, move_all);1199NIR_PASS_V(nir, nir_opt_move, move_all);1200NIR_PASS_V(nir, nir_convert_from_ssa, true);1201}12021203/* ABI: position first, then user, then psiz */1204static void1205agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings *varyings,1206unsigned *remap)1207{1208unsigned base = 0;12091210nir_variable *pos = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_POS);1211if (pos) {1212assert(pos->data.driver_location < AGX_MAX_VARYINGS);1213remap[pos->data.driver_location] = base;1214base += 4;1215}12161217nir_foreach_shader_out_variable(var, nir) {1218unsigned loc = var->data.location;12191220if(loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ) {1221continue;1222}12231224assert(var->data.driver_location < AGX_MAX_VARYINGS);1225remap[var->data.driver_location] = base;1226base += 4;1227}12281229nir_variable *psiz = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_PSIZ);1230if (psiz) {1231assert(psiz->data.driver_location < AGX_MAX_VARYINGS);1232remap[psiz->data.driver_location] = base;1233base += 1;1234}12351236varyings->nr_slots = base;1237}12381239static void1240agx_remap_varyings_fs(nir_shader *nir, struct agx_varyings *varyings,1241unsigned *remap)1242{1243struct agx_varying_packed *packed = varyings->packed;1244unsigned base = 0;12451246agx_pack(packed, VARYING, cfg) {1247cfg.type = AGX_VARYING_TYPE_FRAGCOORD_W;1248cfg.components = 1;1249cfg.triangle_slot = cfg.point_slot = base;1250}12511252base++;1253packed++;12541255agx_pack(packed, VARYING, cfg) {1256cfg.type = AGX_VARYING_TYPE_FRAGCOORD_Z;1257cfg.components = 1;1258cfg.triangle_slot = cfg.point_slot = base;1259}12601261base++;1262packed++;12631264unsigned comps[MAX_VARYING] = { 0 };12651266nir_foreach_shader_in_variable(var, nir) {1267unsigned loc = var->data.driver_location;1268const struct glsl_type *column =1269glsl_without_array_or_matrix(var->type);1270unsigned chan = glsl_get_components(column);12711272/* If we have a fractional location added, we need to increase the size1273* so it will fit, i.e. a vec3 in YZW requires us to allocate a vec4.1274* We could do better but this is an edge case as it is, normally1275* packed varyings will be aligned.1276*/1277chan += var->data.location_frac;1278comps[loc] = MAX2(comps[loc], chan);1279}12801281nir_foreach_shader_in_variable(var, nir) {1282unsigned loc = var->data.driver_location;1283unsigned sz = glsl_count_attribute_slots(var->type, FALSE);1284unsigned channels = comps[loc];12851286assert(var->data.driver_location <= AGX_MAX_VARYINGS);1287remap[var->data.driver_location] = base;12881289for (int c = 0; c < sz; ++c) {1290agx_pack(packed, VARYING, cfg) {1291cfg.type = (var->data.location == VARYING_SLOT_PNTC) ?1292AGX_VARYING_TYPE_POINT_COORDINATES :1293(var->data.interpolation == INTERP_MODE_FLAT) ?1294AGX_VARYING_TYPE_FLAT_LAST :1295AGX_VARYING_TYPE_SMOOTH;12961297cfg.components = channels;1298cfg.triangle_slot = cfg.point_slot = base;1299}13001301base += channels;1302packed++;1303}1304}13051306varyings->nr_descs = (packed - varyings->packed);1307varyings->nr_slots = base;1308}13091310void1311agx_compile_shader_nir(nir_shader *nir,1312struct agx_shader_key *key,1313struct util_dynarray *binary,1314struct agx_shader_info *out)1315{1316agx_debug = debug_get_option_agx_debug();13171318agx_context *ctx = rzalloc(NULL, agx_context);1319ctx->nir = nir;1320ctx->out = out;1321ctx->key = key;1322ctx->stage = nir->info.stage;1323list_inithead(&ctx->blocks);13241325if (ctx->stage == MESA_SHADER_VERTEX) {1326out->writes_psiz = nir->info.outputs_written &1327BITFIELD_BIT(VARYING_SLOT_PSIZ);1328}13291330NIR_PASS_V(nir, nir_lower_vars_to_ssa);13311332/* Lower large arrays to scratch and small arrays to csel */1333NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,1334glsl_get_natural_size_align_bytes);1335NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);13361337if (ctx->stage == MESA_SHADER_VERTEX) {1338/* Lower from OpenGL [-1, 1] to [0, 1] if half-z is not set */1339if (!key->vs.clip_halfz)1340NIR_PASS_V(nir, nir_lower_clip_halfz);1341} else if (ctx->stage == MESA_SHADER_FRAGMENT) {1342/* Flip point coordinate since OpenGL and Metal disagree */1343NIR_PASS_V(nir, nir_shader_instructions_pass,1344agx_lower_point_coord,1345nir_metadata_block_index | nir_metadata_dominance, NULL);1346}13471348NIR_PASS_V(nir, nir_split_var_copies);1349NIR_PASS_V(nir, nir_lower_global_vars_to_local);1350NIR_PASS_V(nir, nir_lower_var_copies);1351NIR_PASS_V(nir, nir_lower_vars_to_ssa);1352NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,1353glsl_type_size, 0);1354if (ctx->stage == MESA_SHADER_FRAGMENT) {1355NIR_PASS_V(nir, nir_lower_mediump_io,1356nir_var_shader_in | nir_var_shader_out, ~0, false);1357}1358NIR_PASS_V(nir, nir_lower_ssbo);13591360/* Varying output is scalar, other I/O is vector */1361if (ctx->stage == MESA_SHADER_VERTEX) {1362NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);1363}13641365nir_lower_tex_options lower_tex_options = {1366.lower_txs_lod = true,1367.lower_txp = ~0,1368};13691370nir_tex_src_type_constraints tex_constraints = {1371[nir_tex_src_lod] = { true, 16 }1372};13731374NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);1375NIR_PASS_V(nir, nir_legalize_16bit_sampler_srcs, tex_constraints);13761377agx_optimize_nir(nir);13781379/* Must be last since NIR passes can remap driver_location freely */1380if (ctx->stage == MESA_SHADER_VERTEX) {1381agx_remap_varyings_vs(nir, &out->varyings, ctx->varyings);1382} else if (ctx->stage == MESA_SHADER_FRAGMENT) {1383agx_remap_varyings_fs(nir, &out->varyings, ctx->varyings);1384}13851386bool skip_internal = nir->info.internal;1387skip_internal &= !(agx_debug & AGX_DBG_INTERNAL);13881389if (agx_debug & AGX_DBG_SHADERS && !skip_internal) {1390nir_print_shader(nir, stdout);1391}13921393nir_foreach_function(func, nir) {1394if (!func->impl)1395continue;13961397/* TODO: Handle phi nodes instead of just convert_from_ssa and yolo'ing1398* the mapping of nir_register to hardware registers and guaranteeing bad1399* performance and breaking spilling... */1400ctx->nir_regalloc = rzalloc_array(ctx, unsigned, func->impl->reg_alloc);14011402/* Leave the last 4 registers for hacky p-copy lowering */1403unsigned nir_regalloc = AGX_NUM_REGS - (4 * 2);14041405/* Assign backwards so we don't need to guess a size */1406nir_foreach_register(reg, &func->impl->registers) {1407/* Ensure alignment */1408if (reg->bit_size >= 32 && (nir_regalloc & 1))1409nir_regalloc--;14101411unsigned size = DIV_ROUND_UP(reg->bit_size * reg->num_components, 16);1412nir_regalloc -= size;1413ctx->nir_regalloc[reg->index] = nir_regalloc;1414}14151416ctx->max_register = nir_regalloc;1417ctx->alloc += func->impl->ssa_alloc;1418emit_cf_list(ctx, &func->impl->body);1419break; /* TODO: Multi-function shaders */1420}14211422/* TODO: Actual RA... this way passes don't need to deal nir_register */1423agx_foreach_instr_global(ctx, I) {1424agx_foreach_dest(I, d) {1425if (I->dest[d].type == AGX_INDEX_NIR_REGISTER) {1426I->dest[d].type = AGX_INDEX_REGISTER;1427I->dest[d].value = ctx->nir_regalloc[I->dest[d].value];1428}1429}14301431agx_foreach_src(I, s) {1432if (I->src[s].type == AGX_INDEX_NIR_REGISTER) {1433I->src[s].type = AGX_INDEX_REGISTER;1434I->src[s].value = ctx->nir_regalloc[I->src[s].value];1435}1436}1437}14381439/* Terminate the shader after the exit block */1440agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);1441agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));1442agx_stop(&_b);14431444/* Also add traps to match the blob, unsure what the function is */1445for (unsigned i = 0; i < 8; ++i)1446agx_trap(&_b);14471448unsigned block_source_count = 0;14491450/* Name blocks now that we're done emitting so the order is consistent */1451agx_foreach_block(ctx, block)1452block->name = block_source_count++;14531454if (agx_debug & AGX_DBG_SHADERS && !skip_internal)1455agx_print_shader(ctx, stdout);14561457agx_optimizer(ctx);1458agx_dce(ctx);14591460if (agx_debug & AGX_DBG_SHADERS && !skip_internal)1461agx_print_shader(ctx, stdout);14621463agx_ra(ctx);14641465if (ctx->stage == MESA_SHADER_VERTEX)1466agx_set_st_vary_final(ctx);14671468if (agx_debug & AGX_DBG_SHADERS && !skip_internal)1469agx_print_shader(ctx, stdout);14701471agx_pack_binary(ctx, binary);14721473if ((agx_debug & AGX_DBG_SHADERDB) && !skip_internal)1474agx_print_stats(ctx, binary->size, stderr);14751476ralloc_free(ctx);1477}147814791480