Path: blob/21.2-virgl/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c
/*
 * Copyright (c) 2012-2019 Etnaviv Project
 * Copyright (c) 2019 Zodiac Inflight Innovations
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <[email protected]>
 *    Wladimir J. van der Laan <[email protected]>
 */

#include "etnaviv_compiler.h"
#include "etnaviv_compiler_nir.h"
#include "etnaviv_asm.h"
#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_nir.h"
#include "etnaviv_uniforms.h"
#include "etnaviv_util.h"

#include <math.h>
#include "util/u_memory.h"
#include "util/register_allocate.h"
#include "compiler/nir/nir_builder.h"

#include "tgsi/tgsi_strings.h"
#include "util/compiler.h"
#include "util/half_float.h"

static bool
etna_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
{
   const struct etna_specs *specs = data;

   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   switch (alu->op) {
   case nir_op_frsq:
   case nir_op_frcp:
   case nir_op_flog2:
   case nir_op_fexp2:
   case nir_op_fsqrt:
   case nir_op_fcos:
   case nir_op_fsin:
   case nir_op_fdiv:
   case nir_op_imul:
      return true;
   /* TODO: can do better than alu_to_scalar for vector compares */
   case nir_op_b32all_fequal2:
   case nir_op_b32all_fequal3:
   case nir_op_b32all_fequal4:
   case nir_op_b32any_fnequal2:
   case nir_op_b32any_fnequal3:
   case nir_op_b32any_fnequal4:
   case nir_op_b32all_iequal2:
   case nir_op_b32all_iequal3:
   case nir_op_b32all_iequal4:
   case nir_op_b32any_inequal2:
   case nir_op_b32any_inequal3:
   case nir_op_b32any_inequal4:
      return true;
   case nir_op_fdot2:
      if (!specs->has_halti2_instructions)
         return true;
      break;
   default:
      break;
   }

   return false;
}

static void
etna_emit_block_start(struct etna_compile *c, unsigned block)
{
   c->block_ptr[block] = c->inst_ptr;
}

static void
etna_emit_output(struct etna_compile *c, nir_variable *var, struct etna_inst_src src)
{
   struct etna_shader_io_file *sf = &c->variant->outfile;

   if (is_fs(c)) {
      switch (var->data.location) {
      case FRAG_RESULT_COLOR:
      case FRAG_RESULT_DATA0: /* DATA0 is used by gallium shaders for color */
         c->variant->ps_color_out_reg = src.reg;
         break;
      case FRAG_RESULT_DEPTH:
         c->variant->ps_depth_out_reg = src.reg;
         break;
      default:
         unreachable("Unsupported fs output");
      }
      return;
   }

   switch (var->data.location) {
   case VARYING_SLOT_POS:
      c->variant->vs_pos_out_reg = src.reg;
      break;
   case VARYING_SLOT_PSIZ:
      c->variant->vs_pointsize_out_reg = src.reg;
      break;
   default:
      sf->reg[sf->num_reg].reg = src.reg;
      sf->reg[sf->num_reg].slot = var->data.location;
      sf->reg[sf->num_reg].num_components = glsl_get_components(var->type);
      sf->num_reg++;
      break;
   }
}
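/* Wrapper around NIR_PASS that evaluates to whether the pass made progress,
 * so passes can be chained with |= in the optimization loop below.
 */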
#define OPT(nir, pass, ...) ({                        \
   bool this_progress = false;                        \
   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
   this_progress;                                     \
})

static void
etna_optimize_loop(nir_shader *s)
{
   bool progress;
   do {
      progress = false;

      NIR_PASS_V(s, nir_lower_vars_to_ssa);
      progress |= OPT(s, nir_opt_copy_prop_vars);
      progress |= OPT(s, nir_opt_shrink_vectors, true);
      progress |= OPT(s, nir_copy_prop);
      progress |= OPT(s, nir_opt_dce);
      progress |= OPT(s, nir_opt_cse);
      progress |= OPT(s, nir_opt_peephole_select, 16, true, true);
      progress |= OPT(s, nir_opt_intrinsics);
      progress |= OPT(s, nir_opt_algebraic);
      progress |= OPT(s, nir_opt_constant_folding);
      progress |= OPT(s, nir_opt_dead_cf);
      if (OPT(s, nir_opt_trivial_continues)) {
         progress = true;
         /* If nir_opt_trivial_continues makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
         OPT(s, nir_copy_prop);
         OPT(s, nir_opt_dce);
      }
      progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
      progress |= OPT(s, nir_opt_if, false);
      progress |= OPT(s, nir_opt_remove_phis);
      progress |= OPT(s, nir_opt_undef);
   } while (progress);
}

static int
etna_glsl_type_size(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}

static void
copy_uniform_state_to_shader(struct etna_shader_variant *sobj, uint64_t *consts, unsigned count)
{
   struct etna_shader_uniform_info *uinfo = &sobj->uniforms;

   uinfo->count = count * 4;
   uinfo->data = MALLOC(uinfo->count * sizeof(*uinfo->data));
   uinfo->contents = MALLOC(uinfo->count * sizeof(*uinfo->contents));

   for (unsigned i = 0; i < uinfo->count; i++) {
      uinfo->data[i] = consts[i];
      uinfo->contents[i] = consts[i] >> 32;
   }

   etna_set_shader_uniforms_dirty_flags(sobj);
}

#define ALU_SWIZ(s) INST_SWIZ((s)->swizzle[0], (s)->swizzle[1], (s)->swizzle[2], (s)->swizzle[3])
#define SRC_DISABLE ((hw_src){})
#define SRC_CONST(idx, s) ((hw_src){.use=1, .rgroup = INST_RGROUP_UNIFORM_0, .reg=idx, .swiz=s})
#define SRC_REG(idx, s) ((hw_src){.use=1, .rgroup = INST_RGROUP_TEMP, .reg=idx, .swiz=s})

typedef struct etna_inst_dst hw_dst;
typedef struct etna_inst_src hw_src;

static inline hw_src
src_swizzle(hw_src src, unsigned swizzle)
{
   if (src.rgroup != INST_RGROUP_IMMEDIATE)
      src.swiz = inst_swiz_compose(src.swiz, swizzle);

   return src;
}

/* constants are represented as 64-bit ints
 * 32-bit for the value and 32-bit for the type (imm, uniform, etc)
 */

#define CONST_VAL(a, b) (nir_const_value) {.u64 = (uint64_t)(a) << 32 | (uint64_t)(b)}
#define CONST(x) CONST_VAL(ETNA_UNIFORM_CONSTANT, x)
#define UNIFORM(x) CONST_VAL(ETNA_UNIFORM_UNIFORM, x)
#define TEXSCALE(x, i) CONST_VAL(ETNA_UNIFORM_TEXRECT_SCALE_X + (i), x)

static int
const_add(uint64_t *c, uint64_t value)
{
   for (unsigned i = 0; i < 4; i++) {
      if (c[i] == value || !c[i]) {
         c[i] = value;
         return i;
      }
   }
   return -1;
}
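/* Return a hardware source operand for up to four constant values, reusing
 * or allocating space in the constant vectors tracked in the compile
 * context. On HALTI2+ a suitable single 32-bit constant is encoded as an
 * inline immediate instead of consuming a uniform slot.
 */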
static hw_src
const_src(struct etna_compile *c, nir_const_value *value, unsigned num_components)
{
   /* use inline immediates if possible */
   if (c->specs->halti >= 2 && num_components == 1 &&
       value[0].u64 >> 32 == ETNA_UNIFORM_CONSTANT) {
      uint32_t bits = value[0].u32;

      /* "float" - shifted by 12 */
      if ((bits & 0xfff) == 0)
         return etna_immediate_src(0, bits >> 12);

      /* "unsigned" - raw 20 bit value */
      if (bits < (1 << 20))
         return etna_immediate_src(2, bits);

      /* "signed" - sign extended 20-bit (sign included) value */
      if (bits >= 0xfff80000)
         return etna_immediate_src(1, bits);
   }

   unsigned i;
   int swiz = -1;
   for (i = 0; swiz < 0; i++) {
      uint64_t *a = &c->consts[i*4];
      uint64_t save[4];
      memcpy(save, a, sizeof(save));
      swiz = 0;
      for (unsigned j = 0; j < num_components; j++) {
         int c = const_add(a, value[j].u64);
         if (c < 0) {
            memcpy(a, save, sizeof(save));
            swiz = -1;
            break;
         }
         swiz |= c << j * 2;
      }
   }

   assert(i <= ETNA_MAX_IMM / 4);
   c->const_count = MAX2(c->const_count, i);

   return SRC_CONST(i - 1, swiz);
}

/* how to swizzle when used as a src */
static const uint8_t
reg_swiz[NUM_REG_TYPES] = {
   [REG_TYPE_VEC4] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_SCALAR_X] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_SCALAR_Y] = SWIZZLE(Y, Y, Y, Y),
   [REG_TYPE_VIRT_VEC2_XY] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_VEC2T_XY] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_VEC2C_XY] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_SCALAR_Z] = SWIZZLE(Z, Z, Z, Z),
   [REG_TYPE_VIRT_VEC2_XZ] = SWIZZLE(X, Z, X, Z),
   [REG_TYPE_VIRT_VEC2_YZ] = SWIZZLE(Y, Z, Y, Z),
   [REG_TYPE_VIRT_VEC2C_YZ] = SWIZZLE(Y, Z, Y, Z),
   [REG_TYPE_VIRT_VEC3_XYZ] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_VEC3C_XYZ] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_SCALAR_W] = SWIZZLE(W, W, W, W),
   [REG_TYPE_VIRT_VEC2_XW] = SWIZZLE(X, W, X, W),
   [REG_TYPE_VIRT_VEC2_YW] = SWIZZLE(Y, W, Y, W),
   [REG_TYPE_VIRT_VEC3_XYW] = SWIZZLE(X, Y, W, X),
   [REG_TYPE_VIRT_VEC2_ZW] = SWIZZLE(Z, W, Z, W),
   [REG_TYPE_VIRT_VEC2T_ZW] = SWIZZLE(Z, W, Z, W),
   [REG_TYPE_VIRT_VEC2C_ZW] = SWIZZLE(Z, W, Z, W),
   [REG_TYPE_VIRT_VEC3_XZW] = SWIZZLE(X, Z, W, X),
   [REG_TYPE_VIRT_VEC3_YZW] = SWIZZLE(Y, Z, W, X),
   [REG_TYPE_VIRT_VEC3C_YZW] = SWIZZLE(Y, Z, W, X),
};

/* how to swizzle when used as a dest */
static const uint8_t
reg_dst_swiz[NUM_REG_TYPES] = {
   [REG_TYPE_VEC4] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_SCALAR_X] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_SCALAR_Y] = SWIZZLE(X, X, X, X),
   [REG_TYPE_VIRT_VEC2_XY] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_VEC2T_XY] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_VEC2C_XY] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_SCALAR_Z] = SWIZZLE(X, X, X, X),
   [REG_TYPE_VIRT_VEC2_XZ] = SWIZZLE(X, X, Y, Y),
   [REG_TYPE_VIRT_VEC2_YZ] = SWIZZLE(X, X, Y, Y),
   [REG_TYPE_VIRT_VEC2C_YZ] = SWIZZLE(X, X, Y, Y),
   [REG_TYPE_VIRT_VEC3_XYZ] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_VEC3C_XYZ] = INST_SWIZ_IDENTITY,
   [REG_TYPE_VIRT_SCALAR_W] = SWIZZLE(X, X, X, X),
   [REG_TYPE_VIRT_VEC2_XW] = SWIZZLE(X, X, Y, Y),
   [REG_TYPE_VIRT_VEC2_YW] = SWIZZLE(X, X, Y, Y),
   [REG_TYPE_VIRT_VEC3_XYW] = SWIZZLE(X, Y, Z, Z),
   [REG_TYPE_VIRT_VEC2_ZW] = SWIZZLE(X, X, X, Y),
   [REG_TYPE_VIRT_VEC2T_ZW] = SWIZZLE(X, X, X, Y),
   [REG_TYPE_VIRT_VEC2C_ZW] = SWIZZLE(X, X, X, Y),
   [REG_TYPE_VIRT_VEC3_XZW] = SWIZZLE(X, Y, Y, Z),
   [REG_TYPE_VIRT_VEC3_YZW] = SWIZZLE(X, X, Y, Z),
   [REG_TYPE_VIRT_VEC3C_YZW] = SWIZZLE(X, X, Y, Z),
};

/* nir_src to allocated register */
static hw_src
ra_src(struct etna_compile *c, nir_src *src)
{
   unsigned reg = ra_get_node_reg(c->g, c->live_map[src_index(c->impl, src)]);
   return SRC_REG(reg_get_base(c, reg), reg_swiz[reg_get_type(reg)]);
}
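/* Resolve a nir_src to a hardware source operand. Bypassed movs are
 * followed recursively, load_const becomes a constant/immediate source,
 * and special intrinsics (front face, frag coord, texture rect scaling)
 * map to their dedicated sources; everything else comes from the register
 * allocator via ra_src().
 */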
static hw_src
get_src(struct etna_compile *c, nir_src *src)
{
   if (!src->is_ssa)
      return ra_src(c, src);

   nir_instr *instr = src->ssa->parent_instr;

   if (instr->pass_flags & BYPASS_SRC) {
      assert(instr->type == nir_instr_type_alu);
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      assert(alu->op == nir_op_mov);
      return src_swizzle(get_src(c, &alu->src[0].src), ALU_SWIZ(&alu->src[0]));
   }

   switch (instr->type) {
   case nir_instr_type_load_const:
      return const_src(c, nir_instr_as_load_const(instr)->value, src->ssa->num_components);
   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      switch (intr->intrinsic) {
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_instance_id:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_ubo:
         return ra_src(c, src);
      case nir_intrinsic_load_front_face:
         return (hw_src) { .use = 1, .rgroup = INST_RGROUP_INTERNAL };
      case nir_intrinsic_load_frag_coord:
         return SRC_REG(0, INST_SWIZ_IDENTITY);
      case nir_intrinsic_load_texture_rect_scaling: {
         int sampler = nir_src_as_int(intr->src[0]);
         nir_const_value values[] = {
            TEXSCALE(sampler, 0),
            TEXSCALE(sampler, 1),
         };

         return src_swizzle(const_src(c, values, 2), SWIZZLE(X,Y,X,X));
      }
      default:
         compile_error(c, "Unhandled NIR intrinsic type: %s\n",
                       nir_intrinsic_infos[intr->intrinsic].name);
         break;
      }
   } break;
   case nir_instr_type_alu:
   case nir_instr_type_tex:
      return ra_src(c, src);
   case nir_instr_type_ssa_undef: {
      /* return zero to deal with broken Blur demo */
      nir_const_value value = CONST(0);
      return src_swizzle(const_src(c, &value, 1), SWIZZLE(X,X,X,X));
   }
   default:
      compile_error(c, "Unhandled NIR instruction type: %d\n", instr->type);
      break;
   }

   return SRC_DISABLE;
}

static bool
vec_dest_has_swizzle(nir_alu_instr *vec, nir_ssa_def *ssa)
{
   for (unsigned i = 0; i < 4; i++) {
      if (!(vec->dest.write_mask & (1 << i)) || vec->src[i].src.ssa != ssa)
         continue;

      if (vec->src[i].swizzle[0] != i)
         return true;
   }

   /* don't deal with possible bypassed vec/mov chain */
   nir_foreach_use(use_src, ssa) {
      nir_instr *instr = use_src->parent_instr;
      if (instr->type != nir_instr_type_alu)
         continue;

      nir_alu_instr *alu = nir_instr_as_alu(instr);

      switch (alu->op) {
      case nir_op_mov:
      case nir_op_vec2:
      case nir_op_vec3:
      case nir_op_vec4:
         return true;
      default:
         break;
      }
   }
   return false;
}

/* get allocated dest register for nir_dest
 * *p_swiz tells how the components need to be placed into register
 */
static hw_dst
ra_dest(struct etna_compile *c, nir_dest *dest, unsigned *p_swiz)
{
   unsigned swiz = INST_SWIZ_IDENTITY, mask = 0xf;
   dest = real_dest(dest, &swiz, &mask);

   unsigned r = ra_get_node_reg(c->g, c->live_map[dest_index(c->impl, dest)]);
   unsigned t = reg_get_type(r);

   *p_swiz = inst_swiz_compose(swiz, reg_dst_swiz[t]);

   return (hw_dst) {
      .use = 1,
      .reg = reg_get_base(c, r),
      .write_mask = inst_write_mask_compose(mask, reg_writemask[t]),
   };
}
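/* Emit a hardware ALU operation for a NIR ALU instruction, composing the
 * per-source swizzles with the destination swizzle chosen by register
 * allocation and folding fneg/fabs/fsat into source/instruction modifiers.
 */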
static void
emit_alu(struct etna_compile *c, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];

   /* marked as dead instruction (vecN and other bypassed instr) */
   if (alu->instr.pass_flags)
      return;

   assert(!(alu->op >= nir_op_vec2 && alu->op <= nir_op_vec4));

   unsigned dst_swiz;
   hw_dst dst = ra_dest(c, &alu->dest.dest, &dst_swiz);

   /* compose alu write_mask with RA write mask */
   if (!alu->dest.dest.is_ssa)
      dst.write_mask = inst_write_mask_compose(alu->dest.write_mask, dst.write_mask);

   switch (alu->op) {
   case nir_op_fdot2:
   case nir_op_fdot3:
   case nir_op_fdot4:
      /* not per-component - don't compose dst_swiz */
      dst_swiz = INST_SWIZ_IDENTITY;
      break;
   default:
      break;
   }

   hw_src srcs[3];

   for (int i = 0; i < info->num_inputs; i++) {
      nir_alu_src *asrc = &alu->src[i];
      hw_src src;

      src = src_swizzle(get_src(c, &asrc->src), ALU_SWIZ(asrc));
      src = src_swizzle(src, dst_swiz);

      if (src.rgroup != INST_RGROUP_IMMEDIATE) {
         src.neg = asrc->negate || (alu->op == nir_op_fneg);
         src.abs = asrc->abs || (alu->op == nir_op_fabs);
      } else {
         assert(!asrc->negate && alu->op != nir_op_fneg);
         assert(!asrc->abs && alu->op != nir_op_fabs);
      }

      srcs[i] = src;
   }

   etna_emit_alu(c, alu->op, dst, srcs, alu->dest.saturate || (alu->op == nir_op_fsat));
}

static void
emit_tex(struct etna_compile *c, nir_tex_instr *tex)
{
   unsigned dst_swiz;
   hw_dst dst = ra_dest(c, &tex->dest, &dst_swiz);
   nir_src *coord = NULL, *lod_bias = NULL, *compare = NULL;

   for (unsigned i = 0; i < tex->num_srcs; i++) {
      switch (tex->src[i].src_type) {
      case nir_tex_src_coord:
         coord = &tex->src[i].src;
         break;
      case nir_tex_src_bias:
      case nir_tex_src_lod:
         assert(!lod_bias);
         lod_bias = &tex->src[i].src;
         break;
      case nir_tex_src_comparator:
         compare = &tex->src[i].src;
         break;
      default:
         compile_error(c, "Unhandled NIR tex src type: %d\n",
                       tex->src[i].src_type);
         break;
      }
   }

   etna_emit_tex(c, tex->op, tex->sampler_index, dst_swiz, dst, get_src(c, coord),
                 lod_bias ? get_src(c, lod_bias) : SRC_DISABLE,
                 compare ? get_src(c, compare) : SRC_DISABLE);
}

static void
emit_intrinsic(struct etna_compile *c, nir_intrinsic_instr *intr)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_store_deref:
      etna_emit_output(c, nir_src_as_deref(intr->src[0])->var, get_src(c, &intr->src[1]));
      break;
   case nir_intrinsic_discard_if:
      etna_emit_discard(c, get_src(c, &intr->src[0]));
      break;
   case nir_intrinsic_discard:
      etna_emit_discard(c, SRC_DISABLE);
      break;
   case nir_intrinsic_load_uniform: {
      unsigned dst_swiz;
      struct etna_inst_dst dst = ra_dest(c, &intr->dest, &dst_swiz);

      /* TODO: rework so extra MOV isn't required, load up to 4 addresses at once */
      emit_inst(c, &(struct etna_inst) {
         .opcode = INST_OPCODE_MOVAR,
         .dst.write_mask = 0x1,
         .src[2] = get_src(c, &intr->src[0]),
      });
      emit_inst(c, &(struct etna_inst) {
         .opcode = INST_OPCODE_MOV,
         .dst = dst,
         .src[2] = {
            .use = 1,
            .rgroup = INST_RGROUP_UNIFORM_0,
            .reg = nir_intrinsic_base(intr),
            .swiz = dst_swiz,
            .amode = INST_AMODE_ADD_A_X,
         },
      });
   } break;
   case nir_intrinsic_load_ubo: {
      /* TODO: if offset is of the form (x + C) then add C to the base instead */
      unsigned idx = nir_src_as_const_value(intr->src[0])[0].u32;
      unsigned dst_swiz;
      emit_inst(c, &(struct etna_inst) {
         .opcode = INST_OPCODE_LOAD,
         .type = INST_TYPE_U32,
         .dst = ra_dest(c, &intr->dest, &dst_swiz),
         .src[0] = get_src(c, &intr->src[1]),
         .src[1] = const_src(c, &CONST_VAL(ETNA_UNIFORM_UBO0_ADDR + idx, 0), 1),
      });
   } break;
   case nir_intrinsic_load_front_face:
   case nir_intrinsic_load_frag_coord:
      assert(intr->dest.is_ssa); /* TODO - lower phis could cause this */
      break;
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_instance_id:
   case nir_intrinsic_load_texture_rect_scaling:
      break;
   default:
      compile_error(c, "Unhandled NIR intrinsic type: %s\n",
                    nir_intrinsic_infos[intr->intrinsic].name);
   }
}
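/* Dispatch a single NIR instruction to the matching emitter; jumps are
 * handled when emitting the block, and load_const/undef/deref generate no
 * code of their own.
 */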
static void
emit_instr(struct etna_compile *c, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      emit_alu(c, nir_instr_as_alu(instr));
      break;
   case nir_instr_type_tex:
      emit_tex(c, nir_instr_as_tex(instr));
      break;
   case nir_instr_type_intrinsic:
      emit_intrinsic(c, nir_instr_as_intrinsic(instr));
      break;
   case nir_instr_type_jump:
      assert(nir_instr_is_last(instr));
      break;
   case nir_instr_type_load_const:
   case nir_instr_type_ssa_undef:
   case nir_instr_type_deref:
      break;
   default:
      compile_error(c, "Unhandled NIR instruction type: %d\n", instr->type);
      break;
   }
}

static void
emit_block(struct etna_compile *c, nir_block *block)
{
   etna_emit_block_start(c, block->index);

   nir_foreach_instr(instr, block)
      emit_instr(c, instr);

   /* succs->index < block->index is for the loop case */
   nir_block *succs = block->successors[0];
   if (nir_block_ends_in_jump(block) || succs->index < block->index)
      etna_emit_jump(c, succs->index, SRC_DISABLE);
}

static void
emit_cf_list(struct etna_compile *c, struct exec_list *list);

static void
emit_if(struct etna_compile *c, nir_if *nif)
{
   etna_emit_jump(c, nir_if_first_else_block(nif)->index, get_src(c, &nif->condition));
   emit_cf_list(c, &nif->then_list);

   /* jump at end of then_list to skip else_list
    * not needed if then_list already ends with a jump or else_list is empty
    */
   if (!nir_block_ends_in_jump(nir_if_last_then_block(nif)) &&
       !nir_cf_list_is_empty_block(&nif->else_list))
      etna_emit_jump(c, nir_if_last_else_block(nif)->successors[0]->index, SRC_DISABLE);

   emit_cf_list(c, &nif->else_list);
}

static void
emit_cf_list(struct etna_compile *c, struct exec_list *list)
{
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block:
         emit_block(c, nir_cf_node_as_block(node));
         break;
      case nir_cf_node_if:
         emit_if(c, nir_cf_node_as_if(node));
         break;
      case nir_cf_node_loop:
         emit_cf_list(c, &nir_cf_node_as_loop(node)->body);
         break;
      default:
         compile_error(c, "Unknown NIR node type\n");
         break;
      }
   }
}
/* based on nir_lower_vec_to_movs */
static unsigned
insert_vec_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
{
   assert(start_idx < nir_op_infos[vec->op].num_inputs);
   unsigned write_mask = (1u << start_idx);

   nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_mov);
   nir_alu_src_copy(&mov->src[0], &vec->src[start_idx], mov);

   mov->src[0].swizzle[0] = vec->src[start_idx].swizzle[0];
   mov->src[0].negate = vec->src[start_idx].negate;
   mov->src[0].abs = vec->src[start_idx].abs;

   unsigned num_components = 1;

   for (unsigned i = start_idx + 1; i < 4; i++) {
      if (!(vec->dest.write_mask & (1 << i)))
         continue;

      if (nir_srcs_equal(vec->src[i].src, vec->src[start_idx].src) &&
          vec->src[i].negate == vec->src[start_idx].negate &&
          vec->src[i].abs == vec->src[start_idx].abs) {
         write_mask |= (1 << i);
         mov->src[0].swizzle[num_components] = vec->src[i].swizzle[0];
         num_components++;
      }
   }

   mov->dest.write_mask = (1 << num_components) - 1;
   nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, 32, NULL);

   /* replace vec srcs with inserted mov */
   for (unsigned i = 0, j = 0; i < 4; i++) {
      if (!(write_mask & (1 << i)))
         continue;

      nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, nir_src_for_ssa(&mov->dest.dest.ssa));
      vec->src[i].swizzle[0] = j++;
   }

   nir_instr_insert_before(&vec->instr, &mov->instr);

   return write_mask;
}

/*
 * for vecN instructions:
 * -merge constant sources into a single src
 * -insert movs (nir_lower_vec_to_movs equivalent)
 * for non-vecN instructions:
 * -try to merge constants as single constant
 * -insert movs for multiple constants (pre-HALTI5)
 */
static void
lower_alu(struct etna_compile *c, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];

   nir_builder b;
   nir_builder_init(&b, c->impl);
   b.cursor = nir_before_instr(&alu->instr);

   switch (alu->op) {
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
      break;
   default:
      /* pre-GC7000L can only have 1 uniform src per instruction */
      if (c->specs->halti >= 5)
         return;

      nir_const_value value[4] = {};
      uint8_t swizzle[4][4] = {};
      unsigned swiz_max = 0, num_const = 0;

      for (unsigned i = 0; i < info->num_inputs; i++) {
         nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
         if (!cv)
            continue;

         unsigned num_components = info->input_sizes[i] ?: alu->dest.dest.ssa.num_components;
         for (unsigned j = 0; j < num_components; j++) {
            int idx = const_add(&value[0].u64, cv[alu->src[i].swizzle[j]].u64);
            swizzle[i][j] = idx;
            swiz_max = MAX2(swiz_max, (unsigned) idx);
         }
         num_const++;
      }

      /* nothing to do */
      if (num_const <= 1)
         return;

      /* resolve with single combined const src */
      if (swiz_max < 4) {
         nir_ssa_def *def = nir_build_imm(&b, swiz_max + 1, 32, value);

         for (unsigned i = 0; i < info->num_inputs; i++) {
            nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
            if (!cv)
               continue;

            nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(def));

            for (unsigned j = 0; j < 4; j++)
               alu->src[i].swizzle[j] = swizzle[i][j];
         }
         return;
      }

      /* resolve with movs */
      num_const = 0;
      for (unsigned i = 0; i < info->num_inputs; i++) {
         nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
         if (!cv)
            continue;

         num_const++;
         if (num_const == 1)
            continue;

         nir_ssa_def *mov = nir_mov(&b, alu->src[i].src.ssa);
         nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(mov));
      }
      return;
   }

   nir_const_value value[4];
   unsigned num_components = 0;

   for (unsigned i = 0; i < info->num_inputs; i++) {
      nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
      if (cv)
         value[num_components++] = cv[alu->src[i].swizzle[0]];
   }

   /* if there is more than one constant source to the vecN, combine them
    * into a single load_const (removing the vecN completely if all components
    * are constant)
    */
   if (num_components > 1) {
      nir_ssa_def *def = nir_build_imm(&b, num_components, 32, value);

      if (num_components == info->num_inputs) {
         nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, def);
         nir_instr_remove(&alu->instr);
         return;
      }

      for (unsigned i = 0, j = 0; i < info->num_inputs; i++) {
         nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
         if (!cv)
            continue;

         nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(def));
         alu->src[i].swizzle[0] = j++;
      }
   }

   unsigned finished_write_mask = 0;
   for (unsigned i = 0; i < 4; i++) {
      if (!(alu->dest.write_mask & (1 << i)))
         continue;

      nir_ssa_def *ssa = alu->src[i].src.ssa;

      /* check that vecN instruction is only user of this */
      bool need_mov = list_length(&ssa->if_uses) != 0;
      nir_foreach_use(use_src, ssa) {
         if (use_src->parent_instr != &alu->instr)
            need_mov = true;
      }

      nir_instr *instr = ssa->parent_instr;
      switch (instr->type) {
      case nir_instr_type_alu:
      case nir_instr_type_tex:
         break;
      case nir_instr_type_intrinsic:
         if (nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_input) {
            need_mov = vec_dest_has_swizzle(alu, &nir_instr_as_intrinsic(instr)->dest.ssa);
            break;
         }
         FALLTHROUGH;
      default:
         need_mov = true;
      }

      if (need_mov && !(finished_write_mask & (1 << i)))
         finished_write_mask |= insert_vec_mov(alu, i, c->nir);
   }
}
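/* Emit the whole shader: fold uniform loads and constants into the
 * compile-time constant vectors, add movs for output stores of
 * sysvals/constants, leave SSA form, run register allocation and then walk
 * the control-flow list to emit the actual instructions.
 */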
static bool
emit_shader(struct etna_compile *c, unsigned *num_temps, unsigned *num_consts)
{
   nir_shader *shader = c->nir;
   c->impl = nir_shader_get_entrypoint(shader);

   bool have_indirect_uniform = false;
   unsigned indirect_max = 0;

   nir_builder b;
   nir_builder_init(&b, c->impl);

   /* convert non-dynamic uniform loads to constants, etc */
   nir_foreach_block(block, c->impl) {
      nir_foreach_instr_safe(instr, block) {
         switch (instr->type) {
         case nir_instr_type_alu:
            /* deals with vecN and const srcs */
            lower_alu(c, nir_instr_as_alu(instr));
            break;
         case nir_instr_type_load_const: {
            nir_load_const_instr *load_const = nir_instr_as_load_const(instr);
            for (unsigned i = 0; i < load_const->def.num_components; i++)
               load_const->value[i] = CONST(load_const->value[i].u32);
         } break;
         case nir_instr_type_intrinsic: {
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
            /* TODO: load_ubo can also become a constant in some cases
             * (at the moment it can end up emitting a LOAD with two
             * uniform sources, which could be a problem on HALTI2)
             */
            if (intr->intrinsic != nir_intrinsic_load_uniform)
               break;
            nir_const_value *off = nir_src_as_const_value(intr->src[0]);
            if (!off || off[0].u64 >> 32 != ETNA_UNIFORM_CONSTANT) {
               have_indirect_uniform = true;
               indirect_max = nir_intrinsic_base(intr) + nir_intrinsic_range(intr);
               break;
            }

            unsigned base = nir_intrinsic_base(intr);
            /* pre halti2 uniform offset will be float */
            if (c->specs->halti < 2)
               base += (unsigned) off[0].f32;
            else
               base += off[0].u32;
            nir_const_value value[4];

            for (unsigned i = 0; i < intr->dest.ssa.num_components; i++)
               value[i] = UNIFORM(base * 4 + i);

            b.cursor = nir_after_instr(instr);
            nir_ssa_def *def = nir_build_imm(&b, intr->dest.ssa.num_components, 32, value);

            nir_ssa_def_rewrite_uses(&intr->dest.ssa, def);
            nir_instr_remove(instr);
         } break;
         default:
            break;
         }
      }
   }

   /* TODO: only emit required indirect uniform ranges */
   if (have_indirect_uniform) {
      for (unsigned i = 0; i < indirect_max * 4; i++)
         c->consts[i] = UNIFORM(i).u64;
      c->const_count = indirect_max;
   }

   /* add mov for any store output using sysval/const */
   nir_foreach_block(block, c->impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

         switch (intr->intrinsic) {
         case nir_intrinsic_store_deref: {
            nir_src *src = &intr->src[1];
            if (nir_src_is_const(*src) || is_sysval(src->ssa->parent_instr)) {
               b.cursor = nir_before_instr(instr);
               nir_instr_rewrite_src(instr, src, nir_src_for_ssa(nir_mov(&b, src->ssa)));
            }
         } break;
         default:
            break;
         }
      }
   }

   /* call directly to avoid validation (load_const doesn't pass validation at this point) */
   nir_convert_from_ssa(shader, true);
   nir_opt_dce(shader);

   etna_ra_assign(c, shader);

   emit_cf_list(c, &nir_shader_get_entrypoint(shader)->body);

   *num_temps = etna_ra_finish(c);
   *num_consts = c->const_count;
   return true;
}
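/* Reject variants that exceed the hardware limits: instruction count (when
 * there is no icache to fall back on), temp register count, and uniform
 * count.
 */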
static bool
etna_compile_check_limits(struct etna_shader_variant *v)
{
   const struct etna_specs *specs = v->shader->specs;
   int max_uniforms = (v->stage == MESA_SHADER_VERTEX)
                         ? specs->max_vs_uniforms
                         : specs->max_ps_uniforms;

   if (!specs->has_icache && v->needs_icache) {
      DBG("Number of instructions (%d) exceeds maximum %d", v->code_size / 4,
          specs->max_instructions);
      return false;
   }

   if (v->num_temps > specs->max_registers) {
      DBG("Number of registers (%d) exceeds maximum %d", v->num_temps,
          specs->max_registers);
      return false;
   }

   if (v->uniforms.count / 4 > max_uniforms) {
      DBG("Number of uniforms (%d) exceeds maximum %d",
          v->uniforms.count / 4, max_uniforms);
      return false;
   }

   return true;
}

static void
fill_vs_mystery(struct etna_shader_variant *v)
{
   const struct etna_specs *specs = v->shader->specs;

   v->input_count_unk8 = DIV_ROUND_UP(v->infile.num_reg + 4, 16); /* XXX what is this */

   /* fill in "mystery meat" load balancing value. This value determines how
    * work is scheduled between VS and PS in the unified shader architecture.
    * More precisely, it is determined from the number of VS outputs, as well
    * as chip-specific vertex output buffer size, vertex cache size, and the
    * number of shader cores.
    *
    * XXX this is a conservative estimate, the "optimal" value is only known
    * for sure at link time because some outputs may be unused and thus
    * unmapped. Then again, in the general use case with GLSL the vertex and
    * fragment shaders are linked already before submitting to Gallium, thus
    * all outputs are used.
    *
    * note: TGSI compiler counts all outputs (including position and
    * pointsize), here v->outfile.num_reg only counts varyings, +1 to
    * compensate for the position output
    * TODO: might have a problem that we don't count pointsize when it is used
    */

   int half_out = v->outfile.num_reg / 2 + 1;
   assert(half_out);

   uint32_t b = ((20480 / (specs->vertex_output_buffer_size -
                           2 * half_out * specs->vertex_cache_size)) +
                 9) /
                10;
   uint32_t a = (b + 256 / (specs->shader_core_count * half_out)) / 2;
   v->vs_load_balancing = VIVS_VS_LOAD_BALANCING_A(MIN2(a, 255)) |
                          VIVS_VS_LOAD_BALANCING_B(MIN2(b, 255)) |
                          VIVS_VS_LOAD_BALANCING_C(0x3f) |
                          VIVS_VS_LOAD_BALANCING_D(0x0f);
}
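/* Compile a shader variant to machine code: run the NIR lowering and
 * optimization pipeline, emit and assemble the instructions, and fill in
 * the linking information used by etna_link_shader_nir().
 */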
bool
etna_compile_shader_nir(struct etna_shader_variant *v)
{
   if (unlikely(!v))
      return false;

   struct etna_compile *c = CALLOC_STRUCT(etna_compile);
   if (!c)
      return false;

   c->variant = v;
   c->specs = v->shader->specs;
   c->nir = nir_shader_clone(NULL, v->shader->nir);

   nir_shader *s = c->nir;
   const struct etna_specs *specs = c->specs;

   v->stage = s->info.stage;
   v->uses_discard = s->info.fs.uses_discard;
   v->num_loops = 0; /* TODO */
   v->vs_id_in_reg = -1;
   v->vs_pos_out_reg = -1;
   v->vs_pointsize_out_reg = -1;
   v->ps_color_out_reg = 0; /* 0 for shader that doesn't write fragcolor.. */
   v->ps_depth_out_reg = -1;

   /*
    * Lower glTexCoord, fixes e.g. neverball point sprite (exit cylinder stars)
    * and gl4es pointsprite.trace apitrace
    */
   if (s->info.stage == MESA_SHADER_FRAGMENT && v->key.sprite_coord_enable) {
      NIR_PASS_V(s, nir_lower_texcoord_replace, v->key.sprite_coord_enable,
                 false, v->key.sprite_coord_yinvert);
   }

   /* setup input linking */
   struct etna_shader_io_file *sf = &v->infile;
   if (s->info.stage == MESA_SHADER_VERTEX) {
      nir_foreach_shader_in_variable(var, s) {
         unsigned idx = var->data.driver_location;
         sf->reg[idx].reg = idx;
         sf->reg[idx].slot = var->data.location;
         sf->reg[idx].num_components = glsl_get_components(var->type);
         sf->num_reg = MAX2(sf->num_reg, idx+1);
      }
   } else {
      unsigned count = 0;
      nir_foreach_shader_in_variable(var, s) {
         unsigned idx = var->data.driver_location;
         sf->reg[idx].reg = idx + 1;
         sf->reg[idx].slot = var->data.location;
         sf->reg[idx].num_components = glsl_get_components(var->type);
         sf->num_reg = MAX2(sf->num_reg, idx+1);
         count++;
      }
      assert(sf->num_reg == count);
   }

   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_uniform, etna_glsl_type_size,
              (nir_lower_io_options)0);

   NIR_PASS_V(s, nir_lower_regs_to_ssa);
   NIR_PASS_V(s, nir_lower_vars_to_ssa);
   NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_all, UINT32_MAX);
   NIR_PASS_V(s, nir_lower_tex, &(struct nir_lower_tex_options) { .lower_txp = ~0u });
   NIR_PASS_V(s, nir_lower_alu_to_scalar, etna_alu_to_scalar_filter_cb, specs);
   nir_lower_idiv_options idiv_options = {
      .imprecise_32bit_lowering = true,
      .allow_fp16 = true,
   };
   NIR_PASS_V(s, nir_lower_idiv, &idiv_options);

   etna_optimize_loop(s);

   /* TODO: remove this extra run if nir_opt_peephole_select is able to handle ubo's. */
   if (OPT(s, etna_nir_lower_ubo_to_uniform))
      etna_optimize_loop(s);

   NIR_PASS_V(s, etna_lower_io, v);

   if (v->shader->specs->vs_need_z_div)
      NIR_PASS_V(s, nir_lower_clip_halfz);

   /* lower pre-halti2 to float (halti0 has integers, but only scalar..) */
   if (c->specs->halti < 2) {
      /* use opt_algebraic between int_to_float and bool_to_float because
       * int_to_float emits ftrunc, and ftrunc lowering generates bool ops
       */
      NIR_PASS_V(s, nir_lower_int_to_float);
      NIR_PASS_V(s, nir_opt_algebraic);
      NIR_PASS_V(s, nir_lower_bool_to_float);
   } else {
      NIR_PASS_V(s, nir_lower_bool_to_int32);
   }

   while (OPT(s, nir_opt_vectorize, NULL, NULL));
   NIR_PASS_V(s, nir_lower_alu_to_scalar, etna_alu_to_scalar_filter_cb, specs);

   NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
   NIR_PASS_V(s, nir_opt_algebraic_late);

   NIR_PASS_V(s, nir_move_vec_src_uses_to_dest);
   NIR_PASS_V(s, nir_copy_prop);
   /* the only HW-supported integer source mod is ineg for the iadd instruction (?) */
   NIR_PASS_V(s, nir_lower_to_source_mods, ~nir_lower_int_source_mods);
   /* need copy prop after uses_to_dest, and before src mods: see
    * dEQP-GLES2.functional.shaders.random.all_features.fragment.95
    */

   NIR_PASS_V(s, nir_opt_dce);

   NIR_PASS_V(s, nir_lower_bool_to_bitsize);
   NIR_PASS_V(s, etna_lower_alu, c->specs->has_new_transcendentals);

   if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS))
      nir_print_shader(s, stdout);

   unsigned block_ptr[nir_shader_get_entrypoint(s)->num_blocks];
   c->block_ptr = block_ptr;

   unsigned num_consts;
   ASSERTED bool ok = emit_shader(c, &v->num_temps, &num_consts);
   assert(ok);

   /* empty shader, emit NOP */
   if (!c->inst_ptr)
      emit_inst(c, &(struct etna_inst) { .opcode = INST_OPCODE_NOP });

   /* assemble instructions, fixing up labels */
   uint32_t *code = MALLOC(c->inst_ptr * 16);
   for (unsigned i = 0; i < c->inst_ptr; i++) {
      struct etna_inst *inst = &c->code[i];
      if (inst->opcode == INST_OPCODE_BRANCH)
         inst->imm = block_ptr[inst->imm];

      inst->halti5 = specs->halti >= 5;
      etna_assemble(&code[i * 4], inst);
   }

   v->code_size = c->inst_ptr * 4;
   v->code = code;
   v->needs_icache = c->inst_ptr > specs->max_instructions;

   copy_uniform_state_to_shader(v, c->consts, num_consts);

   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      v->input_count_unk8 = 31; /* XXX what is this */
      assert(v->ps_depth_out_reg <= 0);
   } else {
      fill_vs_mystery(v);
   }

   bool result = etna_compile_check_limits(v);
   ralloc_free(c->nir);
   FREE(c);
   return result;
}

static const struct etna_shader_inout *
etna_shader_vs_lookup(const struct etna_shader_variant *sobj,
                      const struct etna_shader_inout *in)
{
   for (int i = 0; i < sobj->outfile.num_reg; i++)
      if (sobj->outfile.reg[i].slot == in->slot)
         return &sobj->outfile.reg[i];

   return NULL;
}
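/* Link VS outputs to FS inputs, filling in the varying table and the point
 * coordinate component offset. Returns true on a link error, i.e. when an
 * FS input has no matching VS output.
 */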
bool
etna_link_shader_nir(struct etna_shader_link_info *info,
                     const struct etna_shader_variant *vs,
                     const struct etna_shader_variant *fs)
{
   int comp_ofs = 0;
   /* For each fragment input we need to find the associated vertex shader
    * output, which can be found by matching on semantic name and index. A
    * binary search could be used because the vs outputs are sorted by their
    * semantic index and grouped by semantic type by fill_in_vs_outputs.
    */
   assert(fs->infile.num_reg < ETNA_NUM_INPUTS);
   info->pcoord_varying_comp_ofs = -1;

   for (int idx = 0; idx < fs->infile.num_reg; ++idx) {
      const struct etna_shader_inout *fsio = &fs->infile.reg[idx];
      const struct etna_shader_inout *vsio = etna_shader_vs_lookup(vs, fsio);
      struct etna_varying *varying;
      bool interpolate_always = true;

      assert(fsio->reg > 0 && fsio->reg <= ARRAY_SIZE(info->varyings));

      if (fsio->reg > info->num_varyings)
         info->num_varyings = fsio->reg;

      varying = &info->varyings[fsio->reg - 1];
      varying->num_components = fsio->num_components;

      if (!interpolate_always) /* colors affected by flat shading */
         varying->pa_attributes = 0x200;
      else /* texture coord or other bypasses flat shading */
         varying->pa_attributes = 0x2f1;

      varying->use[0] = VARYING_COMPONENT_USE_UNUSED;
      varying->use[1] = VARYING_COMPONENT_USE_UNUSED;
      varying->use[2] = VARYING_COMPONENT_USE_UNUSED;
      varying->use[3] = VARYING_COMPONENT_USE_UNUSED;

      /* point/tex coord is an input to the PS without matching VS output,
       * so it gets a varying slot without being assigned a VS register.
       */
      if (fsio->slot == VARYING_SLOT_PNTC) {
         varying->use[0] = VARYING_COMPONENT_USE_POINTCOORD_X;
         varying->use[1] = VARYING_COMPONENT_USE_POINTCOORD_Y;

         info->pcoord_varying_comp_ofs = comp_ofs;
      } else if (util_varying_is_point_coord(fsio->slot, fs->key.sprite_coord_enable)) {
         /*
          * Do nothing, TexCoord is lowered to PointCoord above
          * and the TexCoord here is just a remnant. This needs
          * to be removed with some nir_remove_dead_variables(),
          * but that one removes all FS inputs ... why?
          */
      } else {
         if (vsio == NULL) { /* not found -- link error */
            BUG("Semantic value not found in vertex shader outputs\n");
            return true;
         }
         varying->reg = vsio->reg;
      }

      comp_ofs += varying->num_components;
   }

   assert(info->num_varyings == fs->infile.num_reg);

   return false;
}