Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_program.c
4570 views
/*1* Copyright (c) 2014 Scott Mansell2* Copyright © 2014 Broadcom3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* the rights to use, copy, modify, merge, publish, distribute, sublicense,8* and/or sell copies of the Software, and to permit persons to whom the9* Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS21* IN THE SOFTWARE.22*/2324#include <inttypes.h>25#include "util/format/u_format.h"26#include "util/crc32.h"27#include "util/u_helpers.h"28#include "util/u_math.h"29#include "util/u_memory.h"30#include "util/ralloc.h"31#include "util/hash_table.h"32#include "tgsi/tgsi_dump.h"33#include "tgsi/tgsi_parse.h"34#include "compiler/nir/nir.h"35#include "compiler/nir/nir_builder.h"36#include "compiler/nir_types.h"37#include "nir/tgsi_to_nir.h"38#include "vc4_context.h"39#include "vc4_qpu.h"40#include "vc4_qir.h"4142static struct qreg43ntq_get_src(struct vc4_compile *c, nir_src src, int i);44static void45ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);4647static int48type_size(const struct glsl_type *type, bool bindless)49{50return glsl_count_attribute_slots(type, false);51}5253static void54resize_qreg_array(struct vc4_compile *c,55struct qreg **regs,56uint32_t *size,57uint32_t decl_size)58{59if (*size >= decl_size)60return;6162uint32_t old_size = *size;63*size = MAX2(*size * 2, decl_size);64*regs = reralloc(c, *regs, struct qreg, *size);65if (!*regs) {66fprintf(stderr, "Malloc failure\n");67abort();68}6970for (uint32_t i = old_size; i < *size; i++)71(*regs)[i] = c->undef;72}7374static void75ntq_emit_thrsw(struct vc4_compile *c)76{77if (!c->fs_threaded)78return;7980/* Always thread switch after each texture operation for now.81*82* We could do better by batching a bunch of texture fetches up and83* then doing one thread switch and collecting all their results84* afterward.85*/86qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef,87c->undef, c->undef));88c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);89}9091static struct qreg92indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)93{94struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);9596/* Clamp to [0, array size). Note that MIN/MAX are signed. */97uint32_t range = nir_intrinsic_range(intr);98indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0));99indirect_offset = qir_MIN_NOIMM(c, indirect_offset,100qir_uniform_ui(c, range - 4));101102qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),103indirect_offset,104qir_uniform(c, QUNIFORM_UBO0_ADDR,105nir_intrinsic_base(intr)));106107c->num_texture_samples++;108109ntq_emit_thrsw(c);110111return qir_TEX_RESULT(c);112}113114static struct qreg115vc4_ubo_load(struct vc4_compile *c, nir_intrinsic_instr *intr)116{117ASSERTED int buffer_index = nir_src_as_uint(intr->src[0]);118assert(buffer_index == 1);119assert(c->stage == QSTAGE_FRAG);120121struct qreg offset = ntq_get_src(c, intr->src[1], 0);122123/* Clamp to [0, array size). Note that MIN/MAX are signed. */124offset = qir_MAX(c, offset, qir_uniform_ui(c, 0));125offset = qir_MIN_NOIMM(c, offset,126qir_uniform_ui(c, c->fs_key->ubo_1_size - 4));127128qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),129offset,130qir_uniform(c, QUNIFORM_UBO1_ADDR, 0));131132c->num_texture_samples++;133134ntq_emit_thrsw(c);135136return qir_TEX_RESULT(c);137}138139nir_ssa_def *140vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)141{142switch (swiz) {143default:144case PIPE_SWIZZLE_NONE:145fprintf(stderr, "warning: unknown swizzle\n");146FALLTHROUGH;147case PIPE_SWIZZLE_0:148return nir_imm_float(b, 0.0);149case PIPE_SWIZZLE_1:150return nir_imm_float(b, 1.0);151case PIPE_SWIZZLE_X:152case PIPE_SWIZZLE_Y:153case PIPE_SWIZZLE_Z:154case PIPE_SWIZZLE_W:155return srcs[swiz];156}157}158159static struct qreg *160ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)161{162struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,163def->num_components);164_mesa_hash_table_insert(c->def_ht, def, qregs);165return qregs;166}167168/**169* This function is responsible for getting QIR results into the associated170* storage for a NIR instruction.171*172* If it's a NIR SSA def, then we just set the associated hash table entry to173* the new result.174*175* If it's a NIR reg, then we need to update the existing qreg assigned to the176* NIR destination with the incoming value. To do that without introducing177* new MOVs, we require that the incoming qreg either be a uniform, or be178* SSA-defined by the previous QIR instruction in the block and rewritable by179* this function. That lets us sneak ahead and insert the SF flag beforehand180* (knowing that the previous instruction doesn't depend on flags) and rewrite181* its destination to be the NIR reg's destination182*/183static void184ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan,185struct qreg result)186{187struct qinst *last_inst = NULL;188if (!list_is_empty(&c->cur_block->instructions))189last_inst = (struct qinst *)c->cur_block->instructions.prev;190191assert(result.file == QFILE_UNIF ||192(result.file == QFILE_TEMP &&193last_inst && last_inst == c->defs[result.index]));194195if (dest->is_ssa) {196assert(chan < dest->ssa.num_components);197198struct qreg *qregs;199struct hash_entry *entry =200_mesa_hash_table_search(c->def_ht, &dest->ssa);201202if (entry)203qregs = entry->data;204else205qregs = ntq_init_ssa_def(c, &dest->ssa);206207qregs[chan] = result;208} else {209nir_register *reg = dest->reg.reg;210assert(dest->reg.base_offset == 0);211assert(reg->num_array_elems == 0);212struct hash_entry *entry =213_mesa_hash_table_search(c->def_ht, reg);214struct qreg *qregs = entry->data;215216/* Insert a MOV if the source wasn't an SSA def in the217* previous instruction.218*/219if (result.file == QFILE_UNIF) {220result = qir_MOV(c, result);221last_inst = c->defs[result.index];222}223224/* We know they're both temps, so just rewrite index. */225c->defs[last_inst->dst.index] = NULL;226last_inst->dst.index = qregs[chan].index;227228/* If we're in control flow, then make this update of the reg229* conditional on the execution mask.230*/231if (c->execute.file != QFILE_NULL) {232last_inst->dst.index = qregs[chan].index;233234/* Set the flags to the current exec mask. To insert235* the SF, we temporarily remove our SSA instruction.236*/237list_del(&last_inst->link);238qir_SF(c, c->execute);239list_addtail(&last_inst->link,240&c->cur_block->instructions);241242last_inst->cond = QPU_COND_ZS;243last_inst->cond_is_exec_mask = true;244}245}246}247248static struct qreg *249ntq_get_dest(struct vc4_compile *c, nir_dest *dest)250{251if (dest->is_ssa) {252struct qreg *qregs = ntq_init_ssa_def(c, &dest->ssa);253for (int i = 0; i < dest->ssa.num_components; i++)254qregs[i] = c->undef;255return qregs;256} else {257nir_register *reg = dest->reg.reg;258assert(dest->reg.base_offset == 0);259assert(reg->num_array_elems == 0);260struct hash_entry *entry =261_mesa_hash_table_search(c->def_ht, reg);262return entry->data;263}264}265266static struct qreg267ntq_get_src(struct vc4_compile *c, nir_src src, int i)268{269struct hash_entry *entry;270if (src.is_ssa) {271entry = _mesa_hash_table_search(c->def_ht, src.ssa);272assert(i < src.ssa->num_components);273} else {274nir_register *reg = src.reg.reg;275entry = _mesa_hash_table_search(c->def_ht, reg);276assert(reg->num_array_elems == 0);277assert(src.reg.base_offset == 0);278assert(i < reg->num_components);279}280281struct qreg *qregs = entry->data;282return qregs[i];283}284285static struct qreg286ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr,287unsigned src)288{289assert(util_is_power_of_two_or_zero(instr->dest.write_mask));290unsigned chan = ffs(instr->dest.write_mask) - 1;291struct qreg r = ntq_get_src(c, instr->src[src].src,292instr->src[src].swizzle[chan]);293294assert(!instr->src[src].abs);295assert(!instr->src[src].negate);296297return r;298};299300static inline struct qreg301qir_SAT(struct vc4_compile *c, struct qreg val)302{303return qir_FMAX(c,304qir_FMIN(c, val, qir_uniform_f(c, 1.0)),305qir_uniform_f(c, 0.0));306}307308static struct qreg309ntq_rcp(struct vc4_compile *c, struct qreg x)310{311struct qreg r = qir_RCP(c, x);312313/* Apply a Newton-Raphson step to improve the accuracy. */314r = qir_FMUL(c, r, qir_FSUB(c,315qir_uniform_f(c, 2.0),316qir_FMUL(c, x, r)));317318return r;319}320321static struct qreg322ntq_rsq(struct vc4_compile *c, struct qreg x)323{324struct qreg r = qir_RSQ(c, x);325326/* Apply a Newton-Raphson step to improve the accuracy. */327r = qir_FMUL(c, r, qir_FSUB(c,328qir_uniform_f(c, 1.5),329qir_FMUL(c,330qir_uniform_f(c, 0.5),331qir_FMUL(c, x,332qir_FMUL(c, r, r)))));333334return r;335}336337static struct qreg338ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)339{340struct qreg src0_hi = qir_SHR(c, src0,341qir_uniform_ui(c, 24));342struct qreg src1_hi = qir_SHR(c, src1,343qir_uniform_ui(c, 24));344345struct qreg hilo = qir_MUL24(c, src0_hi, src1);346struct qreg lohi = qir_MUL24(c, src0, src1_hi);347struct qreg lolo = qir_MUL24(c, src0, src1);348349return qir_ADD(c, lolo, qir_SHL(c,350qir_ADD(c, hilo, lohi),351qir_uniform_ui(c, 24)));352}353354static struct qreg355ntq_scale_depth_texture(struct vc4_compile *c, struct qreg src)356{357struct qreg depthf = qir_ITOF(c, qir_SHR(c, src,358qir_uniform_ui(c, 8)));359return qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff));360}361362/**363* Emits a lowered TXF_MS from an MSAA texture.364*365* The addressing math has been lowered in NIR, and now we just need to read366* it like a UBO.367*/368static void369ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)370{371uint32_t tile_width = 32;372uint32_t tile_height = 32;373uint32_t tile_size = (tile_height * tile_width *374VC4_MAX_SAMPLES * sizeof(uint32_t));375376unsigned unit = instr->texture_index;377uint32_t w = align(c->key->tex[unit].msaa_width, tile_width);378uint32_t w_tiles = w / tile_width;379uint32_t h = align(c->key->tex[unit].msaa_height, tile_height);380uint32_t h_tiles = h / tile_height;381uint32_t size = w_tiles * h_tiles * tile_size;382383struct qreg addr;384assert(instr->num_srcs == 1);385assert(instr->src[0].src_type == nir_tex_src_coord);386addr = ntq_get_src(c, instr->src[0].src, 0);387388/* Perform the clamping required by kernel validation. */389addr = qir_MAX(c, addr, qir_uniform_ui(c, 0));390addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4));391392qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),393addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));394395ntq_emit_thrsw(c);396397struct qreg tex = qir_TEX_RESULT(c);398c->num_texture_samples++;399400enum pipe_format format = c->key->tex[unit].format;401if (util_format_is_depth_or_stencil(format)) {402struct qreg scaled = ntq_scale_depth_texture(c, tex);403for (int i = 0; i < 4; i++)404ntq_store_dest(c, &instr->dest, i, qir_MOV(c, scaled));405} else {406for (int i = 0; i < 4; i++)407ntq_store_dest(c, &instr->dest, i,408qir_UNPACK_8_F(c, tex, i));409}410}411412static void413ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)414{415struct qreg s, t, r, lod, compare;416bool is_txb = false, is_txl = false;417unsigned unit = instr->texture_index;418419if (instr->op == nir_texop_txf) {420ntq_emit_txf(c, instr);421return;422}423424for (unsigned i = 0; i < instr->num_srcs; i++) {425switch (instr->src[i].src_type) {426case nir_tex_src_coord:427s = ntq_get_src(c, instr->src[i].src, 0);428if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D)429t = qir_uniform_f(c, 0.5);430else431t = ntq_get_src(c, instr->src[i].src, 1);432if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE)433r = ntq_get_src(c, instr->src[i].src, 2);434break;435case nir_tex_src_bias:436lod = ntq_get_src(c, instr->src[i].src, 0);437is_txb = true;438break;439case nir_tex_src_lod:440lod = ntq_get_src(c, instr->src[i].src, 0);441is_txl = true;442break;443case nir_tex_src_comparator:444compare = ntq_get_src(c, instr->src[i].src, 0);445break;446default:447unreachable("unknown texture source");448}449}450451if (c->stage != QSTAGE_FRAG && !is_txl) {452/* From the GLSL 1.20 spec:453*454* "If it is mip-mapped and running on the vertex shader,455* then the base texture is used."456*/457is_txl = true;458lod = qir_uniform_ui(c, 0);459}460461if (c->key->tex[unit].force_first_level) {462lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit);463is_txl = true;464is_txb = false;465}466467struct qreg texture_u[] = {468qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0, unit),469qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit),470qir_uniform(c, QUNIFORM_CONSTANT, 0),471qir_uniform(c, QUNIFORM_CONSTANT, 0),472};473uint32_t next_texture_u = 0;474475if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE || is_txl) {476texture_u[2] = qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P2,477unit | (is_txl << 16));478}479480struct qinst *tmu;481if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {482tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r);483tmu->src[qir_get_tex_uniform_src(tmu)] =484texture_u[next_texture_u++];485} else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||486c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP ||487c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||488c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {489tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0),490qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR,491unit));492tmu->src[qir_get_tex_uniform_src(tmu)] =493texture_u[next_texture_u++];494}495496if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) {497s = qir_SAT(c, s);498}499500if (c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {501t = qir_SAT(c, t);502}503504tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t);505tmu->src[qir_get_tex_uniform_src(tmu)] =506texture_u[next_texture_u++];507508if (is_txl || is_txb) {509tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod);510tmu->src[qir_get_tex_uniform_src(tmu)] =511texture_u[next_texture_u++];512}513514tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s);515tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++];516517c->num_texture_samples++;518519ntq_emit_thrsw(c);520521struct qreg tex = qir_TEX_RESULT(c);522523enum pipe_format format = c->key->tex[unit].format;524525struct qreg *dest = ntq_get_dest(c, &instr->dest);526if (util_format_is_depth_or_stencil(format)) {527struct qreg normalized = ntq_scale_depth_texture(c, tex);528struct qreg depth_output;529530struct qreg u0 = qir_uniform_f(c, 0.0f);531struct qreg u1 = qir_uniform_f(c, 1.0f);532if (c->key->tex[unit].compare_mode) {533/* From the GL_ARB_shadow spec:534*535* "Let Dt (D subscript t) be the depth texture536* value, in the range [0, 1]. Let R be the537* interpolated texture coordinate clamped to the538* range [0, 1]."539*/540compare = qir_SAT(c, compare);541542switch (c->key->tex[unit].compare_func) {543case PIPE_FUNC_NEVER:544depth_output = qir_uniform_f(c, 0.0f);545break;546case PIPE_FUNC_ALWAYS:547depth_output = u1;548break;549case PIPE_FUNC_EQUAL:550qir_SF(c, qir_FSUB(c, compare, normalized));551depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0);552break;553case PIPE_FUNC_NOTEQUAL:554qir_SF(c, qir_FSUB(c, compare, normalized));555depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0);556break;557case PIPE_FUNC_GREATER:558qir_SF(c, qir_FSUB(c, compare, normalized));559depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);560break;561case PIPE_FUNC_GEQUAL:562qir_SF(c, qir_FSUB(c, normalized, compare));563depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);564break;565case PIPE_FUNC_LESS:566qir_SF(c, qir_FSUB(c, compare, normalized));567depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);568break;569case PIPE_FUNC_LEQUAL:570qir_SF(c, qir_FSUB(c, normalized, compare));571depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);572break;573}574} else {575depth_output = normalized;576}577578for (int i = 0; i < 4; i++)579dest[i] = depth_output;580} else {581for (int i = 0; i < 4; i++)582dest[i] = qir_UNPACK_8_F(c, tex, i);583}584}585586/**587* Computes x - floor(x), which is tricky because our FTOI truncates (rounds588* to zero).589*/590static struct qreg591ntq_ffract(struct vc4_compile *c, struct qreg src)592{593struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));594struct qreg diff = qir_FSUB(c, src, trunc);595qir_SF(c, diff);596597qir_FADD_dest(c, diff,598diff, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;599600return qir_MOV(c, diff);601}602603/**604* Computes floor(x), which is tricky because our FTOI truncates (rounds to605* zero).606*/607static struct qreg608ntq_ffloor(struct vc4_compile *c, struct qreg src)609{610struct qreg result = qir_ITOF(c, qir_FTOI(c, src));611612/* This will be < 0 if we truncated and the truncation was of a value613* that was < 0 in the first place.614*/615qir_SF(c, qir_FSUB(c, src, result));616617struct qinst *sub = qir_FSUB_dest(c, result,618result, qir_uniform_f(c, 1.0));619sub->cond = QPU_COND_NS;620621return qir_MOV(c, result);622}623624/**625* Computes ceil(x), which is tricky because our FTOI truncates (rounds to626* zero).627*/628static struct qreg629ntq_fceil(struct vc4_compile *c, struct qreg src)630{631struct qreg result = qir_ITOF(c, qir_FTOI(c, src));632633/* This will be < 0 if we truncated and the truncation was of a value634* that was > 0 in the first place.635*/636qir_SF(c, qir_FSUB(c, result, src));637638qir_FADD_dest(c, result,639result, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;640641return qir_MOV(c, result);642}643644static struct qreg645ntq_shrink_sincos_input_range(struct vc4_compile *c, struct qreg x)646{647/* Since we're using a Taylor approximation, we want to have a small648* number of coefficients and take advantage of sin/cos repeating649* every 2pi. We keep our x as close to 0 as we can, since the series650* will be less accurate as |x| increases. (Also, be careful of651* shifting the input x value to be tricky with sin/cos relations,652* because getting accurate values for x==0 is very important for SDL653* rendering)654*/655struct qreg scaled_x =656qir_FMUL(c, x,657qir_uniform_f(c, 1.0f / (M_PI * 2.0f)));658/* Note: FTOI truncates toward 0. */659struct qreg x_frac = qir_FSUB(c, scaled_x,660qir_ITOF(c, qir_FTOI(c, scaled_x)));661/* Map [0.5, 1] to [-0.5, 0] */662qir_SF(c, qir_FSUB(c, x_frac, qir_uniform_f(c, 0.5)));663qir_FSUB_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NC;664/* Map [-1, -0.5] to [0, 0.5] */665qir_SF(c, qir_FADD(c, x_frac, qir_uniform_f(c, 0.5)));666qir_FADD_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;667668return x_frac;669}670671static struct qreg672ntq_fsin(struct vc4_compile *c, struct qreg src)673{674float coeff[] = {6752.0 * M_PI,676-pow(2.0 * M_PI, 3) / (3 * 2 * 1),677pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1),678-pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1),679pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),680};681682struct qreg x = ntq_shrink_sincos_input_range(c, src);683struct qreg x2 = qir_FMUL(c, x, x);684struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0]));685for (int i = 1; i < ARRAY_SIZE(coeff); i++) {686x = qir_FMUL(c, x, x2);687sum = qir_FADD(c,688sum,689qir_FMUL(c,690x,691qir_uniform_f(c, coeff[i])));692}693return sum;694}695696static struct qreg697ntq_fcos(struct vc4_compile *c, struct qreg src)698{699float coeff[] = {7001.0f,701-pow(2.0 * M_PI, 2) / (2 * 1),702pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1),703-pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1),704pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),705-pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),706};707708struct qreg x_frac = ntq_shrink_sincos_input_range(c, src);709struct qreg sum = qir_uniform_f(c, coeff[0]);710struct qreg x2 = qir_FMUL(c, x_frac, x_frac);711struct qreg x = x2; /* Current x^2, x^4, or x^6 */712for (int i = 1; i < ARRAY_SIZE(coeff); i++) {713if (i != 1)714x = qir_FMUL(c, x, x2);715716sum = qir_FADD(c, qir_FMUL(c,717x,718qir_uniform_f(c, coeff[i])),719sum);720}721return sum;722}723724static struct qreg725ntq_fsign(struct vc4_compile *c, struct qreg src)726{727struct qreg t = qir_get_temp(c);728729qir_SF(c, src);730qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));731qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;732qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;733return qir_MOV(c, t);734}735736static void737emit_vertex_input(struct vc4_compile *c, int attr)738{739enum pipe_format format = c->vs_key->attr_formats[attr];740uint32_t attr_size = util_format_get_blocksize(format);741742c->vattr_sizes[attr] = align(attr_size, 4);743for (int i = 0; i < align(attr_size, 4) / 4; i++) {744c->inputs[attr * 4 + i] =745qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));746c->num_inputs++;747}748}749750static void751emit_fragcoord_input(struct vc4_compile *c, int attr)752{753c->inputs[attr * 4 + 0] = qir_ITOF(c, qir_reg(QFILE_FRAG_X, 0));754c->inputs[attr * 4 + 1] = qir_ITOF(c, qir_reg(QFILE_FRAG_Y, 0));755c->inputs[attr * 4 + 2] =756qir_FMUL(c,757qir_ITOF(c, qir_FRAG_Z(c)),758qir_uniform_f(c, 1.0 / 0xffffff));759c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c));760}761762static struct qreg763emit_fragment_varying(struct vc4_compile *c, gl_varying_slot slot,764uint8_t swizzle)765{766uint32_t i = c->num_input_slots++;767struct qreg vary = {768QFILE_VARY,769i770};771772if (c->num_input_slots >= c->input_slots_array_size) {773c->input_slots_array_size =774MAX2(4, c->input_slots_array_size * 2);775776c->input_slots = reralloc(c, c->input_slots,777struct vc4_varying_slot,778c->input_slots_array_size);779}780781c->input_slots[i].slot = slot;782c->input_slots[i].swizzle = swizzle;783784return qir_VARY_ADD_C(c, qir_FMUL(c, vary, qir_FRAG_W(c)));785}786787static void788emit_fragment_input(struct vc4_compile *c, int attr, gl_varying_slot slot)789{790for (int i = 0; i < 4; i++) {791c->inputs[attr * 4 + i] =792emit_fragment_varying(c, slot, i);793c->num_inputs++;794}795}796797static void798add_output(struct vc4_compile *c,799uint32_t decl_offset,800uint8_t slot,801uint8_t swizzle)802{803uint32_t old_array_size = c->outputs_array_size;804resize_qreg_array(c, &c->outputs, &c->outputs_array_size,805decl_offset + 1);806807if (old_array_size != c->outputs_array_size) {808c->output_slots = reralloc(c,809c->output_slots,810struct vc4_varying_slot,811c->outputs_array_size);812}813814c->output_slots[decl_offset].slot = slot;815c->output_slots[decl_offset].swizzle = swizzle;816}817818static bool819ntq_src_is_only_ssa_def_user(nir_src *src)820{821if (!src->is_ssa)822return false;823824if (!list_is_empty(&src->ssa->if_uses))825return false;826827return (src->ssa->uses.next == &src->use_link &&828src->ssa->uses.next->next == &src->ssa->uses);829}830831/**832* In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack833* bit set.834*835* However, as an optimization, it tries to find the instructions generating836* the sources to be packed and just emit the pack flag there, if possible.837*/838static void839ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)840{841struct qreg result = qir_get_temp(c);842struct nir_alu_instr *vec4 = NULL;843844/* If packing from a vec4 op (as expected), identify it so that we can845* peek back at what generated its sources.846*/847if (instr->src[0].src.is_ssa &&848instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&849nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==850nir_op_vec4) {851vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);852}853854/* If the pack is replicating the same channel 4 times, use the 8888855* pack flag. This is common for blending using the alpha856* channel.857*/858if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] &&859instr->src[0].swizzle[0] == instr->src[0].swizzle[2] &&860instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) {861struct qreg rep = ntq_get_src(c,862instr->src[0].src,863instr->src[0].swizzle[0]);864ntq_store_dest(c, &instr->dest.dest, 0, qir_PACK_8888_F(c, rep));865return;866}867868for (int i = 0; i < 4; i++) {869int swiz = instr->src[0].swizzle[i];870struct qreg src;871if (vec4) {872src = ntq_get_src(c, vec4->src[swiz].src,873vec4->src[swiz].swizzle[0]);874} else {875src = ntq_get_src(c, instr->src[0].src, swiz);876}877878if (vec4 &&879ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&880src.file == QFILE_TEMP &&881c->defs[src.index] &&882qir_is_mul(c->defs[src.index]) &&883!c->defs[src.index]->dst.pack) {884struct qinst *rewrite = c->defs[src.index];885c->defs[src.index] = NULL;886rewrite->dst = result;887rewrite->dst.pack = QPU_PACK_MUL_8A + i;888continue;889}890891qir_PACK_8_F(c, result, src, i);892}893894ntq_store_dest(c, &instr->dest.dest, 0, qir_MOV(c, result));895}896897/** Handles sign-extended bitfield extracts for 16 bits. */898static struct qreg899ntq_emit_ibfe(struct vc4_compile *c, struct qreg base, struct qreg offset,900struct qreg bits)901{902assert(bits.file == QFILE_UNIF &&903c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&904c->uniform_data[bits.index] == 16);905906assert(offset.file == QFILE_UNIF &&907c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);908int offset_bit = c->uniform_data[offset.index];909assert(offset_bit % 16 == 0);910911return qir_UNPACK_16_I(c, base, offset_bit / 16);912}913914/** Handles unsigned bitfield extracts for 8 bits. */915static struct qreg916ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,917struct qreg bits)918{919assert(bits.file == QFILE_UNIF &&920c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&921c->uniform_data[bits.index] == 8);922923assert(offset.file == QFILE_UNIF &&924c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);925int offset_bit = c->uniform_data[offset.index];926assert(offset_bit % 8 == 0);927928return qir_UNPACK_8_I(c, base, offset_bit / 8);929}930931/**932* If compare_instr is a valid comparison instruction, emits the933* compare_instr's comparison and returns the sel_instr's return value based934* on the compare_instr's result.935*/936static bool937ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,938nir_alu_instr *compare_instr,939nir_alu_instr *sel_instr)940{941enum qpu_cond cond;942943switch (compare_instr->op) {944case nir_op_feq32:945case nir_op_ieq32:946case nir_op_seq:947cond = QPU_COND_ZS;948break;949case nir_op_fneu32:950case nir_op_ine32:951case nir_op_sne:952cond = QPU_COND_ZC;953break;954case nir_op_fge32:955case nir_op_ige32:956case nir_op_uge32:957case nir_op_sge:958cond = QPU_COND_NC;959break;960case nir_op_flt32:961case nir_op_ilt32:962case nir_op_slt:963cond = QPU_COND_NS;964break;965default:966return false;967}968969struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);970struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);971972unsigned unsized_type =973nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]);974if (unsized_type == nir_type_float)975qir_SF(c, qir_FSUB(c, src0, src1));976else977qir_SF(c, qir_SUB(c, src0, src1));978979switch (sel_instr->op) {980case nir_op_seq:981case nir_op_sne:982case nir_op_sge:983case nir_op_slt:984*dest = qir_SEL(c, cond,985qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));986break;987988case nir_op_b32csel:989*dest = qir_SEL(c, cond,990ntq_get_alu_src(c, sel_instr, 1),991ntq_get_alu_src(c, sel_instr, 2));992break;993994default:995*dest = qir_SEL(c, cond,996qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0));997break;998}9991000/* Make the temporary for nir_store_dest(). */1001*dest = qir_MOV(c, *dest);10021003return true;1004}10051006/**1007* Attempts to fold a comparison generating a boolean result into the1008* condition code for selecting between two values, instead of comparing the1009* boolean result against 0 to generate the condition code.1010*/1011static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,1012struct qreg *src)1013{1014if (!instr->src[0].src.is_ssa)1015goto out;1016if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)1017goto out;1018nir_alu_instr *compare =1019nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);1020if (!compare)1021goto out;10221023struct qreg dest;1024if (ntq_emit_comparison(c, &dest, compare, instr))1025return dest;10261027out:1028qir_SF(c, src[0]);1029return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2]));1030}10311032static struct qreg1033ntq_fddx(struct vc4_compile *c, struct qreg src)1034{1035/* Make sure that we have a bare temp to use for MUL rotation, so it1036* can be allocated to an accumulator.1037*/1038if (src.pack || src.file != QFILE_TEMP)1039src = qir_MOV(c, src);10401041struct qreg from_left = qir_ROT_MUL(c, src, 1);1042struct qreg from_right = qir_ROT_MUL(c, src, 15);10431044/* Distinguish left/right pixels of the quad. */1045qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0),1046qir_uniform_ui(c, 1)));10471048return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,1049qir_FSUB(c, from_right, src),1050qir_FSUB(c, src, from_left)));1051}10521053static struct qreg1054ntq_fddy(struct vc4_compile *c, struct qreg src)1055{1056if (src.pack || src.file != QFILE_TEMP)1057src = qir_MOV(c, src);10581059struct qreg from_bottom = qir_ROT_MUL(c, src, 2);1060struct qreg from_top = qir_ROT_MUL(c, src, 14);10611062/* Distinguish top/bottom pixels of the quad. */1063qir_SF(c, qir_AND(c,1064qir_reg(QFILE_QPU_ELEMENT, 0),1065qir_uniform_ui(c, 2)));10661067return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,1068qir_FSUB(c, from_top, src),1069qir_FSUB(c, src, from_bottom)));1070}10711072static void1073ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)1074{1075/* This should always be lowered to ALU operations for VC4. */1076assert(!instr->dest.saturate);10771078/* Vectors are special in that they have non-scalarized writemasks,1079* and just take the first swizzle channel for each argument in order1080* into each writemask channel.1081*/1082if (instr->op == nir_op_vec2 ||1083instr->op == nir_op_vec3 ||1084instr->op == nir_op_vec4) {1085struct qreg srcs[4];1086for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)1087srcs[i] = ntq_get_src(c, instr->src[i].src,1088instr->src[i].swizzle[0]);1089for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)1090ntq_store_dest(c, &instr->dest.dest, i,1091qir_MOV(c, srcs[i]));1092return;1093}10941095if (instr->op == nir_op_pack_unorm_4x8) {1096ntq_emit_pack_unorm_4x8(c, instr);1097return;1098}10991100if (instr->op == nir_op_unpack_unorm_4x8) {1101struct qreg src = ntq_get_src(c, instr->src[0].src,1102instr->src[0].swizzle[0]);1103for (int i = 0; i < 4; i++) {1104if (instr->dest.write_mask & (1 << i))1105ntq_store_dest(c, &instr->dest.dest, i,1106qir_UNPACK_8_F(c, src, i));1107}1108return;1109}11101111/* General case: We can just grab the one used channel per src. */1112struct qreg src[nir_op_infos[instr->op].num_inputs];1113for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {1114src[i] = ntq_get_alu_src(c, instr, i);1115}11161117struct qreg result;11181119switch (instr->op) {1120case nir_op_mov:1121result = qir_MOV(c, src[0]);1122break;1123case nir_op_fmul:1124result = qir_FMUL(c, src[0], src[1]);1125break;1126case nir_op_fadd:1127result = qir_FADD(c, src[0], src[1]);1128break;1129case nir_op_fsub:1130result = qir_FSUB(c, src[0], src[1]);1131break;1132case nir_op_fmin:1133result = qir_FMIN(c, src[0], src[1]);1134break;1135case nir_op_fmax:1136result = qir_FMAX(c, src[0], src[1]);1137break;11381139case nir_op_f2i32:1140case nir_op_f2u32:1141result = qir_FTOI(c, src[0]);1142break;1143case nir_op_i2f32:1144case nir_op_u2f32:1145result = qir_ITOF(c, src[0]);1146break;1147case nir_op_b2f32:1148result = qir_AND(c, src[0], qir_uniform_f(c, 1.0));1149break;1150case nir_op_b2i32:1151result = qir_AND(c, src[0], qir_uniform_ui(c, 1));1152break;1153case nir_op_i2b32:1154case nir_op_f2b32:1155qir_SF(c, src[0]);1156result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC,1157qir_uniform_ui(c, ~0),1158qir_uniform_ui(c, 0)));1159break;11601161case nir_op_iadd:1162result = qir_ADD(c, src[0], src[1]);1163break;1164case nir_op_ushr:1165result = qir_SHR(c, src[0], src[1]);1166break;1167case nir_op_isub:1168result = qir_SUB(c, src[0], src[1]);1169break;1170case nir_op_ishr:1171result = qir_ASR(c, src[0], src[1]);1172break;1173case nir_op_ishl:1174result = qir_SHL(c, src[0], src[1]);1175break;1176case nir_op_imin:1177result = qir_MIN(c, src[0], src[1]);1178break;1179case nir_op_imax:1180result = qir_MAX(c, src[0], src[1]);1181break;1182case nir_op_iand:1183result = qir_AND(c, src[0], src[1]);1184break;1185case nir_op_ior:1186result = qir_OR(c, src[0], src[1]);1187break;1188case nir_op_ixor:1189result = qir_XOR(c, src[0], src[1]);1190break;1191case nir_op_inot:1192result = qir_NOT(c, src[0]);1193break;11941195case nir_op_imul:1196result = ntq_umul(c, src[0], src[1]);1197break;11981199case nir_op_seq:1200case nir_op_sne:1201case nir_op_sge:1202case nir_op_slt:1203case nir_op_feq32:1204case nir_op_fneu32:1205case nir_op_fge32:1206case nir_op_flt32:1207case nir_op_ieq32:1208case nir_op_ine32:1209case nir_op_ige32:1210case nir_op_uge32:1211case nir_op_ilt32:1212if (!ntq_emit_comparison(c, &result, instr, instr)) {1213fprintf(stderr, "Bad comparison instruction\n");1214}1215break;12161217case nir_op_b32csel:1218result = ntq_emit_bcsel(c, instr, src);1219break;1220case nir_op_fcsel:1221qir_SF(c, src[0]);1222result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2]));1223break;12241225case nir_op_frcp:1226result = ntq_rcp(c, src[0]);1227break;1228case nir_op_frsq:1229result = ntq_rsq(c, src[0]);1230break;1231case nir_op_fexp2:1232result = qir_EXP2(c, src[0]);1233break;1234case nir_op_flog2:1235result = qir_LOG2(c, src[0]);1236break;12371238case nir_op_ftrunc:1239result = qir_ITOF(c, qir_FTOI(c, src[0]));1240break;1241case nir_op_fceil:1242result = ntq_fceil(c, src[0]);1243break;1244case nir_op_ffract:1245result = ntq_ffract(c, src[0]);1246break;1247case nir_op_ffloor:1248result = ntq_ffloor(c, src[0]);1249break;12501251case nir_op_fsin:1252result = ntq_fsin(c, src[0]);1253break;1254case nir_op_fcos:1255result = ntq_fcos(c, src[0]);1256break;12571258case nir_op_fsign:1259result = ntq_fsign(c, src[0]);1260break;12611262case nir_op_fabs:1263result = qir_FMAXABS(c, src[0], src[0]);1264break;1265case nir_op_iabs:1266result = qir_MAX(c, src[0],1267qir_SUB(c, qir_uniform_ui(c, 0), src[0]));1268break;12691270case nir_op_ibitfield_extract:1271result = ntq_emit_ibfe(c, src[0], src[1], src[2]);1272break;12731274case nir_op_ubitfield_extract:1275result = ntq_emit_ubfe(c, src[0], src[1], src[2]);1276break;12771278case nir_op_usadd_4x8_vc4:1279result = qir_V8ADDS(c, src[0], src[1]);1280break;12811282case nir_op_ussub_4x8_vc4:1283result = qir_V8SUBS(c, src[0], src[1]);1284break;12851286case nir_op_umin_4x8_vc4:1287result = qir_V8MIN(c, src[0], src[1]);1288break;12891290case nir_op_umax_4x8_vc4:1291result = qir_V8MAX(c, src[0], src[1]);1292break;12931294case nir_op_umul_unorm_4x8_vc4:1295result = qir_V8MULD(c, src[0], src[1]);1296break;12971298case nir_op_fddx:1299case nir_op_fddx_coarse:1300case nir_op_fddx_fine:1301result = ntq_fddx(c, src[0]);1302break;13031304case nir_op_fddy:1305case nir_op_fddy_coarse:1306case nir_op_fddy_fine:1307result = ntq_fddy(c, src[0]);1308break;13091310default:1311fprintf(stderr, "unknown NIR ALU inst: ");1312nir_print_instr(&instr->instr, stderr);1313fprintf(stderr, "\n");1314abort();1315}13161317/* We have a scalar result, so the instruction should only have a1318* single channel written to.1319*/1320assert(util_is_power_of_two_or_zero(instr->dest.write_mask));1321ntq_store_dest(c, &instr->dest.dest,1322ffs(instr->dest.write_mask) - 1, result);1323}13241325static void1326emit_frag_end(struct vc4_compile *c)1327{1328struct qreg color;1329if (c->output_color_index != -1) {1330color = c->outputs[c->output_color_index];1331} else {1332color = qir_uniform_ui(c, 0);1333}13341335uint32_t discard_cond = QPU_COND_ALWAYS;1336if (c->s->info.fs.uses_discard) {1337qir_SF(c, c->discard);1338discard_cond = QPU_COND_ZS;1339}13401341if (c->fs_key->stencil_enabled) {1342qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),1343qir_uniform(c, QUNIFORM_STENCIL, 0));1344if (c->fs_key->stencil_twoside) {1345qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),1346qir_uniform(c, QUNIFORM_STENCIL, 1));1347}1348if (c->fs_key->stencil_full_writemasks) {1349qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),1350qir_uniform(c, QUNIFORM_STENCIL, 2));1351}1352}13531354if (c->output_sample_mask_index != -1) {1355qir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);1356}13571358if (c->fs_key->depth_enabled) {1359if (c->output_position_index != -1) {1360qir_FTOI_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),1361qir_FMUL(c,1362c->outputs[c->output_position_index],1363qir_uniform_f(c, 0xffffff)))->cond = discard_cond;1364} else {1365qir_MOV_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),1366qir_FRAG_Z(c))->cond = discard_cond;1367}1368}13691370if (!c->msaa_per_sample_output) {1371qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE, 0),1372color)->cond = discard_cond;1373} else {1374for (int i = 0; i < VC4_MAX_SAMPLES; i++) {1375qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE_MS, 0),1376c->sample_colors[i])->cond = discard_cond;1377}1378}1379}13801381static void1382emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w)1383{1384struct qreg packed = qir_get_temp(c);13851386for (int i = 0; i < 2; i++) {1387struct qreg scale =1388qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0);13891390struct qreg packed_chan = packed;1391packed_chan.pack = QPU_PACK_A_16A + i;13921393qir_FTOI_dest(c, packed_chan,1394qir_FMUL(c,1395qir_FMUL(c,1396c->outputs[c->output_position_index + i],1397scale),1398rcp_w));1399}14001401qir_VPM_WRITE(c, packed);1402}14031404static void1405emit_zs_write(struct vc4_compile *c, struct qreg rcp_w)1406{1407struct qreg zscale = qir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);1408struct qreg zoffset = qir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);14091410qir_VPM_WRITE(c, qir_FADD(c, qir_FMUL(c, qir_FMUL(c,1411c->outputs[c->output_position_index + 2],1412zscale),1413rcp_w),1414zoffset));1415}14161417static void1418emit_rcp_wc_write(struct vc4_compile *c, struct qreg rcp_w)1419{1420qir_VPM_WRITE(c, rcp_w);1421}14221423static void1424emit_point_size_write(struct vc4_compile *c)1425{1426struct qreg point_size;14271428if (c->output_point_size_index != -1)1429point_size = c->outputs[c->output_point_size_index];1430else1431point_size = qir_uniform_f(c, 1.0);14321433qir_VPM_WRITE(c, point_size);1434}14351436/**1437* Emits a VPM read of the stub vertex attribute set up by vc4_draw.c.1438*1439* The simulator insists that there be at least one vertex attribute, so1440* vc4_draw.c will emit one if it wouldn't have otherwise. The simulator also1441* insists that all vertex attributes loaded get read by the VS/CS, so we have1442* to consume it here.1443*/1444static void1445emit_stub_vpm_read(struct vc4_compile *c)1446{1447if (c->num_inputs)1448return;14491450c->vattr_sizes[0] = 4;1451(void)qir_MOV(c, qir_reg(QFILE_VPM, 0));1452c->num_inputs++;1453}14541455static void1456emit_vert_end(struct vc4_compile *c,1457struct vc4_varying_slot *fs_inputs,1458uint32_t num_fs_inputs)1459{1460struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);14611462emit_stub_vpm_read(c);14631464emit_scaled_viewport_write(c, rcp_w);1465emit_zs_write(c, rcp_w);1466emit_rcp_wc_write(c, rcp_w);1467if (c->vs_key->per_vertex_point_size)1468emit_point_size_write(c);14691470for (int i = 0; i < num_fs_inputs; i++) {1471struct vc4_varying_slot *input = &fs_inputs[i];1472int j;14731474for (j = 0; j < c->num_outputs; j++) {1475struct vc4_varying_slot *output =1476&c->output_slots[j];14771478if (input->slot == output->slot &&1479input->swizzle == output->swizzle) {1480qir_VPM_WRITE(c, c->outputs[j]);1481break;1482}1483}1484/* Emit padding if we didn't find a declared VS output for1485* this FS input.1486*/1487if (j == c->num_outputs)1488qir_VPM_WRITE(c, qir_uniform_f(c, 0.0));1489}1490}14911492static void1493emit_coord_end(struct vc4_compile *c)1494{1495struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);14961497emit_stub_vpm_read(c);14981499for (int i = 0; i < 4; i++)1500qir_VPM_WRITE(c, c->outputs[c->output_position_index + i]);15011502emit_scaled_viewport_write(c, rcp_w);1503emit_zs_write(c, rcp_w);1504emit_rcp_wc_write(c, rcp_w);1505if (c->vs_key->per_vertex_point_size)1506emit_point_size_write(c);1507}15081509static void1510vc4_optimize_nir(struct nir_shader *s)1511{1512bool progress;1513unsigned lower_flrp =1514(s->options->lower_flrp16 ? 16 : 0) |1515(s->options->lower_flrp32 ? 32 : 0) |1516(s->options->lower_flrp64 ? 64 : 0);15171518do {1519progress = false;15201521NIR_PASS_V(s, nir_lower_vars_to_ssa);1522NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);1523NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);1524NIR_PASS(progress, s, nir_copy_prop);1525NIR_PASS(progress, s, nir_opt_remove_phis);1526NIR_PASS(progress, s, nir_opt_dce);1527NIR_PASS(progress, s, nir_opt_dead_cf);1528NIR_PASS(progress, s, nir_opt_cse);1529NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);1530NIR_PASS(progress, s, nir_opt_algebraic);1531NIR_PASS(progress, s, nir_opt_constant_folding);1532if (lower_flrp != 0) {1533bool lower_flrp_progress = false;15341535NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,1536lower_flrp,1537false /* always_precise */);1538if (lower_flrp_progress) {1539NIR_PASS(progress, s, nir_opt_constant_folding);1540progress = true;1541}15421543/* Nothing should rematerialize any flrps, so we only1544* need to do this lowering once.1545*/1546lower_flrp = 0;1547}15481549NIR_PASS(progress, s, nir_opt_undef);1550NIR_PASS(progress, s, nir_opt_loop_unroll,1551nir_var_shader_in |1552nir_var_shader_out |1553nir_var_function_temp);1554} while (progress);1555}15561557static int1558driver_location_compare(const void *in_a, const void *in_b)1559{1560const nir_variable *const *a = in_a;1561const nir_variable *const *b = in_b;15621563return (*a)->data.driver_location - (*b)->data.driver_location;1564}15651566static void1567ntq_setup_inputs(struct vc4_compile *c)1568{1569unsigned num_entries = 0;1570nir_foreach_shader_in_variable(var, c->s)1571num_entries++;15721573nir_variable *vars[num_entries];15741575unsigned i = 0;1576nir_foreach_shader_in_variable(var, c->s)1577vars[i++] = var;15781579/* Sort the variables so that we emit the input setup in1580* driver_location order. This is required for VPM reads, whose data1581* is fetched into the VPM in driver_location (TGSI register index)1582* order.1583*/1584qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);15851586for (unsigned i = 0; i < num_entries; i++) {1587nir_variable *var = vars[i];1588unsigned array_len = MAX2(glsl_get_length(var->type), 1);1589unsigned loc = var->data.driver_location;15901591assert(array_len == 1);1592(void)array_len;1593resize_qreg_array(c, &c->inputs, &c->inputs_array_size,1594(loc + 1) * 4);15951596if (c->stage == QSTAGE_FRAG) {1597if (var->data.location == VARYING_SLOT_POS) {1598emit_fragcoord_input(c, loc);1599} else if (util_varying_is_point_coord(var->data.location,1600c->fs_key->point_sprite_mask)) {1601c->inputs[loc * 4 + 0] = c->point_x;1602c->inputs[loc * 4 + 1] = c->point_y;1603} else {1604emit_fragment_input(c, loc, var->data.location);1605}1606} else {1607emit_vertex_input(c, loc);1608}1609}1610}16111612static void1613ntq_setup_outputs(struct vc4_compile *c)1614{1615nir_foreach_shader_out_variable(var, c->s) {1616unsigned array_len = MAX2(glsl_get_length(var->type), 1);1617unsigned loc = var->data.driver_location * 4;16181619assert(array_len == 1);1620(void)array_len;16211622for (int i = 0; i < 4; i++)1623add_output(c, loc + i, var->data.location, i);16241625if (c->stage == QSTAGE_FRAG) {1626switch (var->data.location) {1627case FRAG_RESULT_COLOR:1628case FRAG_RESULT_DATA0:1629c->output_color_index = loc;1630break;1631case FRAG_RESULT_DEPTH:1632c->output_position_index = loc;1633break;1634case FRAG_RESULT_SAMPLE_MASK:1635c->output_sample_mask_index = loc;1636break;1637}1638} else {1639switch (var->data.location) {1640case VARYING_SLOT_POS:1641c->output_position_index = loc;1642break;1643case VARYING_SLOT_PSIZ:1644c->output_point_size_index = loc;1645break;1646}1647}1648}1649}16501651/**1652* Sets up the mapping from nir_register to struct qreg *.1653*1654* Each nir_register gets a struct qreg per 32-bit component being stored.1655*/1656static void1657ntq_setup_registers(struct vc4_compile *c, struct exec_list *list)1658{1659foreach_list_typed(nir_register, nir_reg, node, list) {1660unsigned array_len = MAX2(nir_reg->num_array_elems, 1);1661struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,1662array_len *1663nir_reg->num_components);16641665_mesa_hash_table_insert(c->def_ht, nir_reg, qregs);16661667for (int i = 0; i < array_len * nir_reg->num_components; i++)1668qregs[i] = qir_get_temp(c);1669}1670}16711672static void1673ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr)1674{1675struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);1676for (int i = 0; i < instr->def.num_components; i++)1677qregs[i] = qir_uniform_ui(c, instr->value[i].u32);16781679_mesa_hash_table_insert(c->def_ht, &instr->def, qregs);1680}16811682static void1683ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr)1684{1685struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);16861687/* QIR needs there to be *some* value, so pick 0 (same as for1688* ntq_setup_registers().1689*/1690for (int i = 0; i < instr->def.num_components; i++)1691qregs[i] = qir_uniform_ui(c, 0);1692}16931694static void1695ntq_emit_color_read(struct vc4_compile *c, nir_intrinsic_instr *instr)1696{1697assert(nir_src_as_uint(instr->src[0]) == 0);16981699/* Reads of the per-sample color need to be done in1700* order.1701*/1702int sample_index = (nir_intrinsic_base(instr) -1703VC4_NIR_TLB_COLOR_READ_INPUT);1704for (int i = 0; i <= sample_index; i++) {1705if (c->color_reads[i].file == QFILE_NULL) {1706c->color_reads[i] =1707qir_TLB_COLOR_READ(c);1708}1709}1710ntq_store_dest(c, &instr->dest, 0,1711qir_MOV(c, c->color_reads[sample_index]));1712}17131714static void1715ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr)1716{1717assert(instr->num_components == 1);1718assert(nir_src_is_const(instr->src[0]) &&1719"vc4 doesn't support indirect inputs");17201721if (c->stage == QSTAGE_FRAG &&1722nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) {1723ntq_emit_color_read(c, instr);1724return;1725}17261727uint32_t offset = nir_intrinsic_base(instr) +1728nir_src_as_uint(instr->src[0]);1729int comp = nir_intrinsic_component(instr);1730ntq_store_dest(c, &instr->dest, 0,1731qir_MOV(c, c->inputs[offset * 4 + comp]));1732}17331734static void1735ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)1736{1737unsigned offset;17381739switch (instr->intrinsic) {1740case nir_intrinsic_load_uniform:1741assert(instr->num_components == 1);1742if (nir_src_is_const(instr->src[0])) {1743offset = nir_intrinsic_base(instr) +1744nir_src_as_uint(instr->src[0]);1745assert(offset % 4 == 0);1746/* We need dwords */1747offset = offset / 4;1748ntq_store_dest(c, &instr->dest, 0,1749qir_uniform(c, QUNIFORM_UNIFORM,1750offset));1751} else {1752ntq_store_dest(c, &instr->dest, 0,1753indirect_uniform_load(c, instr));1754}1755break;17561757case nir_intrinsic_load_ubo:1758assert(instr->num_components == 1);1759ntq_store_dest(c, &instr->dest, 0, vc4_ubo_load(c, instr));1760break;17611762case nir_intrinsic_load_user_clip_plane:1763for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {1764ntq_store_dest(c, &instr->dest, i,1765qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,1766nir_intrinsic_ucp_id(instr) *17674 + i));1768}1769break;17701771case nir_intrinsic_load_blend_const_color_r_float:1772case nir_intrinsic_load_blend_const_color_g_float:1773case nir_intrinsic_load_blend_const_color_b_float:1774case nir_intrinsic_load_blend_const_color_a_float:1775ntq_store_dest(c, &instr->dest, 0,1776qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_X +1777(instr->intrinsic -1778nir_intrinsic_load_blend_const_color_r_float),17790));1780break;17811782case nir_intrinsic_load_blend_const_color_rgba8888_unorm:1783ntq_store_dest(c, &instr->dest, 0,1784qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_RGBA,17850));1786break;17871788case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:1789ntq_store_dest(c, &instr->dest, 0,1790qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_AAAA,17910));1792break;17931794case nir_intrinsic_load_sample_mask_in:1795ntq_store_dest(c, &instr->dest, 0,1796qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));1797break;17981799case nir_intrinsic_load_front_face:1800/* The register contains 0 (front) or 1 (back), and we need to1801* turn it into a NIR bool where true means front.1802*/1803ntq_store_dest(c, &instr->dest, 0,1804qir_ADD(c,1805qir_uniform_ui(c, -1),1806qir_reg(QFILE_FRAG_REV_FLAG, 0)));1807break;18081809case nir_intrinsic_load_input:1810ntq_emit_load_input(c, instr);1811break;18121813case nir_intrinsic_store_output:1814assert(nir_src_is_const(instr->src[1]) &&1815"vc4 doesn't support indirect outputs");1816offset = nir_intrinsic_base(instr) +1817nir_src_as_uint(instr->src[1]);18181819/* MSAA color outputs are the only case where we have an1820* output that's not lowered to being a store of a single 321821* bit value.1822*/1823if (c->stage == QSTAGE_FRAG && instr->num_components == 4) {1824assert(offset == c->output_color_index);1825for (int i = 0; i < 4; i++) {1826c->sample_colors[i] =1827qir_MOV(c, ntq_get_src(c, instr->src[0],1828i));1829}1830} else {1831offset = offset * 4 + nir_intrinsic_component(instr);1832assert(instr->num_components == 1);1833c->outputs[offset] =1834qir_MOV(c, ntq_get_src(c, instr->src[0], 0));1835c->num_outputs = MAX2(c->num_outputs, offset + 1);1836}1837break;18381839case nir_intrinsic_discard:1840if (c->execute.file != QFILE_NULL) {1841qir_SF(c, c->execute);1842qir_MOV_cond(c, QPU_COND_ZS, c->discard,1843qir_uniform_ui(c, ~0));1844} else {1845qir_MOV_dest(c, c->discard, qir_uniform_ui(c, ~0));1846}1847break;18481849case nir_intrinsic_discard_if: {1850/* true (~0) if we're discarding */1851struct qreg cond = ntq_get_src(c, instr->src[0], 0);18521853if (c->execute.file != QFILE_NULL) {1854/* execute == 0 means the channel is active. Invert1855* the condition so that we can use zero as "executing1856* and discarding."1857*/1858qir_SF(c, qir_AND(c, c->execute, qir_NOT(c, cond)));1859qir_MOV_cond(c, QPU_COND_ZS, c->discard, cond);1860} else {1861qir_OR_dest(c, c->discard, c->discard,1862ntq_get_src(c, instr->src[0], 0));1863}18641865break;1866}18671868case nir_intrinsic_load_texture_rect_scaling: {1869assert(nir_src_is_const(instr->src[0]));1870int sampler = nir_src_as_int(instr->src[0]);18711872ntq_store_dest(c, &instr->dest, 0,1873qir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, sampler));1874ntq_store_dest(c, &instr->dest, 1,1875qir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, sampler));1876break;1877}18781879default:1880fprintf(stderr, "Unknown intrinsic: ");1881nir_print_instr(&instr->instr, stderr);1882fprintf(stderr, "\n");1883break;1884}1885}18861887/* Clears (activates) the execute flags for any channels whose jump target1888* matches this block.1889*/1890static void1891ntq_activate_execute_for_block(struct vc4_compile *c)1892{1893qir_SF(c, qir_SUB(c,1894c->execute,1895qir_uniform_ui(c, c->cur_block->index)));1896qir_MOV_cond(c, QPU_COND_ZS, c->execute, qir_uniform_ui(c, 0));1897}18981899static void1900ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt)1901{1902if (!c->vc4->screen->has_control_flow) {1903fprintf(stderr,1904"IF statement support requires updated kernel.\n");1905return;1906}19071908nir_block *nir_else_block = nir_if_first_else_block(if_stmt);1909bool empty_else_block =1910(nir_else_block == nir_if_last_else_block(if_stmt) &&1911exec_list_is_empty(&nir_else_block->instr_list));19121913struct qblock *then_block = qir_new_block(c);1914struct qblock *after_block = qir_new_block(c);1915struct qblock *else_block;1916if (empty_else_block)1917else_block = after_block;1918else1919else_block = qir_new_block(c);19201921bool was_top_level = false;1922if (c->execute.file == QFILE_NULL) {1923c->execute = qir_MOV(c, qir_uniform_ui(c, 0));1924was_top_level = true;1925}19261927/* Set ZS for executing (execute == 0) and jumping (if->condition ==1928* 0) channels, and then update execute flags for those to point to1929* the ELSE block.1930*/1931qir_SF(c, qir_OR(c,1932c->execute,1933ntq_get_src(c, if_stmt->condition, 0)));1934qir_MOV_cond(c, QPU_COND_ZS, c->execute,1935qir_uniform_ui(c, else_block->index));19361937/* Jump to ELSE if nothing is active for THEN, otherwise fall1938* through.1939*/1940qir_SF(c, c->execute);1941qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZC);1942qir_link_blocks(c->cur_block, else_block);1943qir_link_blocks(c->cur_block, then_block);19441945/* Process the THEN block. */1946qir_set_emit_block(c, then_block);1947ntq_emit_cf_list(c, &if_stmt->then_list);19481949if (!empty_else_block) {1950/* Handle the end of the THEN block. First, all currently1951* active channels update their execute flags to point to1952* ENDIF1953*/1954qir_SF(c, c->execute);1955qir_MOV_cond(c, QPU_COND_ZS, c->execute,1956qir_uniform_ui(c, after_block->index));19571958/* If everything points at ENDIF, then jump there immediately. */1959qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, after_block->index)));1960qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);1961qir_link_blocks(c->cur_block, after_block);1962qir_link_blocks(c->cur_block, else_block);19631964qir_set_emit_block(c, else_block);1965ntq_activate_execute_for_block(c);1966ntq_emit_cf_list(c, &if_stmt->else_list);1967}19681969qir_link_blocks(c->cur_block, after_block);19701971qir_set_emit_block(c, after_block);1972if (was_top_level) {1973c->execute = c->undef;1974c->last_top_block = c->cur_block;1975} else {1976ntq_activate_execute_for_block(c);1977}1978}19791980static void1981ntq_emit_jump(struct vc4_compile *c, nir_jump_instr *jump)1982{1983struct qblock *jump_block;1984switch (jump->type) {1985case nir_jump_break:1986jump_block = c->loop_break_block;1987break;1988case nir_jump_continue:1989jump_block = c->loop_cont_block;1990break;1991default:1992unreachable("Unsupported jump type\n");1993}19941995qir_SF(c, c->execute);1996qir_MOV_cond(c, QPU_COND_ZS, c->execute,1997qir_uniform_ui(c, jump_block->index));19981999/* Jump to the destination block if everyone has taken the jump. */2000qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, jump_block->index)));2001qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);2002struct qblock *new_block = qir_new_block(c);2003qir_link_blocks(c->cur_block, jump_block);2004qir_link_blocks(c->cur_block, new_block);2005qir_set_emit_block(c, new_block);2006}20072008static void2009ntq_emit_instr(struct vc4_compile *c, nir_instr *instr)2010{2011switch (instr->type) {2012case nir_instr_type_alu:2013ntq_emit_alu(c, nir_instr_as_alu(instr));2014break;20152016case nir_instr_type_intrinsic:2017ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));2018break;20192020case nir_instr_type_load_const:2021ntq_emit_load_const(c, nir_instr_as_load_const(instr));2022break;20232024case nir_instr_type_ssa_undef:2025ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));2026break;20272028case nir_instr_type_tex:2029ntq_emit_tex(c, nir_instr_as_tex(instr));2030break;20312032case nir_instr_type_jump:2033ntq_emit_jump(c, nir_instr_as_jump(instr));2034break;20352036default:2037fprintf(stderr, "Unknown NIR instr type: ");2038nir_print_instr(instr, stderr);2039fprintf(stderr, "\n");2040abort();2041}2042}20432044static void2045ntq_emit_block(struct vc4_compile *c, nir_block *block)2046{2047nir_foreach_instr(instr, block) {2048ntq_emit_instr(c, instr);2049}2050}20512052static void ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);20532054static void2055ntq_emit_loop(struct vc4_compile *c, nir_loop *loop)2056{2057if (!c->vc4->screen->has_control_flow) {2058fprintf(stderr,2059"loop support requires updated kernel.\n");2060ntq_emit_cf_list(c, &loop->body);2061return;2062}20632064bool was_top_level = false;2065if (c->execute.file == QFILE_NULL) {2066c->execute = qir_MOV(c, qir_uniform_ui(c, 0));2067was_top_level = true;2068}20692070struct qblock *save_loop_cont_block = c->loop_cont_block;2071struct qblock *save_loop_break_block = c->loop_break_block;20722073c->loop_cont_block = qir_new_block(c);2074c->loop_break_block = qir_new_block(c);20752076qir_link_blocks(c->cur_block, c->loop_cont_block);2077qir_set_emit_block(c, c->loop_cont_block);2078ntq_activate_execute_for_block(c);20792080ntq_emit_cf_list(c, &loop->body);20812082/* If anything had explicitly continued, or is here at the end of the2083* loop, then we need to loop again. SF updates are masked by the2084* instruction's condition, so we can do the OR of the two conditions2085* within SF.2086*/2087qir_SF(c, c->execute);2088struct qinst *cont_check =2089qir_SUB_dest(c,2090c->undef,2091c->execute,2092qir_uniform_ui(c, c->loop_cont_block->index));2093cont_check->cond = QPU_COND_ZC;2094cont_check->sf = true;20952096qir_BRANCH(c, QPU_COND_BRANCH_ANY_ZS);2097qir_link_blocks(c->cur_block, c->loop_cont_block);2098qir_link_blocks(c->cur_block, c->loop_break_block);20992100qir_set_emit_block(c, c->loop_break_block);2101if (was_top_level) {2102c->execute = c->undef;2103c->last_top_block = c->cur_block;2104} else {2105ntq_activate_execute_for_block(c);2106}21072108c->loop_break_block = save_loop_break_block;2109c->loop_cont_block = save_loop_cont_block;2110}21112112static void2113ntq_emit_function(struct vc4_compile *c, nir_function_impl *func)2114{2115fprintf(stderr, "FUNCTIONS not handled.\n");2116abort();2117}21182119static void2120ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list)2121{2122foreach_list_typed(nir_cf_node, node, node, list) {2123switch (node->type) {2124case nir_cf_node_block:2125ntq_emit_block(c, nir_cf_node_as_block(node));2126break;21272128case nir_cf_node_if:2129ntq_emit_if(c, nir_cf_node_as_if(node));2130break;21312132case nir_cf_node_loop:2133ntq_emit_loop(c, nir_cf_node_as_loop(node));2134break;21352136case nir_cf_node_function:2137ntq_emit_function(c, nir_cf_node_as_function(node));2138break;21392140default:2141fprintf(stderr, "Unknown NIR node type\n");2142abort();2143}2144}2145}21462147static void2148ntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl)2149{2150ntq_setup_registers(c, &impl->registers);2151ntq_emit_cf_list(c, &impl->body);2152}21532154static void2155nir_to_qir(struct vc4_compile *c)2156{2157if (c->stage == QSTAGE_FRAG && c->s->info.fs.uses_discard)2158c->discard = qir_MOV(c, qir_uniform_ui(c, 0));21592160ntq_setup_inputs(c);2161ntq_setup_outputs(c);21622163/* Find the main function and emit the body. */2164nir_foreach_function(function, c->s) {2165assert(strcmp(function->name, "main") == 0);2166assert(function->impl);2167ntq_emit_impl(c, function->impl);2168}2169}21702171static const nir_shader_compiler_options nir_options = {2172.lower_all_io_to_temps = true,2173.lower_extract_byte = true,2174.lower_extract_word = true,2175.lower_insert_byte = true,2176.lower_insert_word = true,2177.lower_fdiv = true,2178.lower_ffma16 = true,2179.lower_ffma32 = true,2180.lower_ffma64 = true,2181.lower_flrp32 = true,2182.lower_fmod = true,2183.lower_fpow = true,2184.lower_fsat = true,2185.lower_fsqrt = true,2186.lower_ldexp = true,2187.lower_fneg = true,2188.lower_ineg = true,2189.lower_rotate = true,2190.lower_to_scalar = true,2191.lower_umax = true,2192.lower_umin = true,2193.lower_isign = true,2194.has_fsub = true,2195.has_isub = true,2196.max_unroll_iterations = 32,2197};21982199const void *2200vc4_screen_get_compiler_options(struct pipe_screen *pscreen,2201enum pipe_shader_ir ir,2202enum pipe_shader_type shader)2203{2204return &nir_options;2205}22062207static int2208count_nir_instrs(nir_shader *nir)2209{2210int count = 0;2211nir_foreach_function(function, nir) {2212if (!function->impl)2213continue;2214nir_foreach_block(block, function->impl) {2215nir_foreach_instr(instr, block)2216count++;2217}2218}2219return count;2220}22212222static struct vc4_compile *2223vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,2224struct vc4_key *key, bool fs_threaded)2225{2226struct vc4_compile *c = qir_compile_init();22272228c->vc4 = vc4;2229c->stage = stage;2230c->shader_state = &key->shader_state->base;2231c->program_id = key->shader_state->program_id;2232c->variant_id =2233p_atomic_inc_return(&key->shader_state->compiled_variant_count);2234c->fs_threaded = fs_threaded;22352236c->key = key;2237switch (stage) {2238case QSTAGE_FRAG:2239c->fs_key = (struct vc4_fs_key *)key;2240if (c->fs_key->is_points) {2241c->point_x = emit_fragment_varying(c, ~0, 0);2242c->point_y = emit_fragment_varying(c, ~0, 0);2243} else if (c->fs_key->is_lines) {2244c->line_x = emit_fragment_varying(c, ~0, 0);2245}2246break;2247case QSTAGE_VERT:2248c->vs_key = (struct vc4_vs_key *)key;2249break;2250case QSTAGE_COORD:2251c->vs_key = (struct vc4_vs_key *)key;2252break;2253}22542255c->s = nir_shader_clone(c, key->shader_state->base.ir.nir);22562257if (stage == QSTAGE_FRAG) {2258NIR_PASS_V(c->s, vc4_nir_lower_blend, c);2259}22602261struct nir_lower_tex_options tex_options = {2262.lower_txp = ~0,22632264/* Apply swizzles to all samplers. */2265.swizzle_result = ~0,2266};22672268/* Lower the format swizzle and ARB_texture_swizzle-style swizzle.2269* The format swizzling applies before sRGB decode, and2270* ARB_texture_swizzle is the last thing before returning the sample.2271*/2272for (int i = 0; i < ARRAY_SIZE(key->tex); i++) {2273enum pipe_format format = c->key->tex[i].format;22742275if (!format)2276continue;22772278const uint8_t *format_swizzle = vc4_get_format_swizzle(format);22792280for (int j = 0; j < 4; j++) {2281uint8_t arb_swiz = c->key->tex[i].swizzle[j];22822283if (arb_swiz <= 3) {2284tex_options.swizzles[i][j] =2285format_swizzle[arb_swiz];2286} else {2287tex_options.swizzles[i][j] = arb_swiz;2288}2289}22902291if (util_format_is_srgb(format))2292tex_options.lower_srgb |= (1 << i);2293}22942295NIR_PASS_V(c->s, nir_lower_tex, &tex_options);22962297if (c->key->ucp_enables) {2298if (stage == QSTAGE_FRAG) {2299NIR_PASS_V(c->s, nir_lower_clip_fs,2300c->key->ucp_enables, false);2301} else {2302NIR_PASS_V(c->s, nir_lower_clip_vs,2303c->key->ucp_enables, false, false, NULL);2304NIR_PASS_V(c->s, nir_lower_io_to_scalar,2305nir_var_shader_out);2306}2307}23082309/* FS input scalarizing must happen after nir_lower_two_sided_color,2310* which only handles a vec4 at a time. Similarly, VS output2311* scalarizing must happen after nir_lower_clip_vs.2312*/2313if (c->stage == QSTAGE_FRAG)2314NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);2315else2316NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);23172318NIR_PASS_V(c->s, vc4_nir_lower_io, c);2319NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c);2320nir_lower_idiv_options idiv_options = {2321.imprecise_32bit_lowering = true,2322.allow_fp16 = true,2323};2324NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);23252326vc4_optimize_nir(c->s);23272328/* Do late algebraic optimization to turn add(a, neg(b)) back into2329* subs, then the mandatory cleanup after algebraic. Note that it may2330* produce fnegs, and if so then we need to keep running to squash2331* fneg(fneg(a)).2332*/2333bool more_late_algebraic = true;2334while (more_late_algebraic) {2335more_late_algebraic = false;2336NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);2337NIR_PASS_V(c->s, nir_opt_constant_folding);2338NIR_PASS_V(c->s, nir_copy_prop);2339NIR_PASS_V(c->s, nir_opt_dce);2340NIR_PASS_V(c->s, nir_opt_cse);2341}23422343NIR_PASS_V(c->s, nir_lower_bool_to_int32);23442345NIR_PASS_V(c->s, nir_convert_from_ssa, true);23462347if (vc4_debug & VC4_DEBUG_SHADERDB) {2348fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n",2349qir_get_stage_name(c->stage),2350c->program_id, c->variant_id,2351count_nir_instrs(c->s));2352}23532354if (vc4_debug & VC4_DEBUG_NIR) {2355fprintf(stderr, "%s prog %d/%d NIR:\n",2356qir_get_stage_name(c->stage),2357c->program_id, c->variant_id);2358nir_print_shader(c->s, stderr);2359}23602361nir_to_qir(c);23622363switch (stage) {2364case QSTAGE_FRAG:2365/* FS threading requires that the thread execute2366* QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating2367* (with no other THRSW afterwards, obviously). If we didn't2368* fetch a texture at a top level block, this wouldn't be2369* true.2370*/2371if (c->fs_threaded && !c->last_thrsw_at_top_level) {2372c->failed = true;2373return c;2374}23752376emit_frag_end(c);2377break;2378case QSTAGE_VERT:2379emit_vert_end(c,2380c->vs_key->fs_inputs->input_slots,2381c->vs_key->fs_inputs->num_inputs);2382break;2383case QSTAGE_COORD:2384emit_coord_end(c);2385break;2386}23872388if (vc4_debug & VC4_DEBUG_QIR) {2389fprintf(stderr, "%s prog %d/%d pre-opt QIR:\n",2390qir_get_stage_name(c->stage),2391c->program_id, c->variant_id);2392qir_dump(c);2393fprintf(stderr, "\n");2394}23952396qir_optimize(c);2397qir_lower_uniforms(c);23982399qir_schedule_instructions(c);2400qir_emit_uniform_stream_resets(c);24012402if (vc4_debug & VC4_DEBUG_QIR) {2403fprintf(stderr, "%s prog %d/%d QIR:\n",2404qir_get_stage_name(c->stage),2405c->program_id, c->variant_id);2406qir_dump(c);2407fprintf(stderr, "\n");2408}24092410qir_reorder_uniforms(c);2411vc4_generate_code(vc4, c);24122413if (vc4_debug & VC4_DEBUG_SHADERDB) {2414fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d instructions\n",2415qir_get_stage_name(c->stage),2416c->program_id, c->variant_id,2417c->qpu_inst_count);2418fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d uniforms\n",2419qir_get_stage_name(c->stage),2420c->program_id, c->variant_id,2421c->num_uniforms);2422}24232424ralloc_free(c->s);24252426return c;2427}24282429static void *2430vc4_shader_state_create(struct pipe_context *pctx,2431const struct pipe_shader_state *cso)2432{2433struct vc4_context *vc4 = vc4_context(pctx);2434struct vc4_uncompiled_shader *so = CALLOC_STRUCT(vc4_uncompiled_shader);2435if (!so)2436return NULL;24372438so->program_id = vc4->next_uncompiled_program_id++;24392440nir_shader *s;24412442if (cso->type == PIPE_SHADER_IR_NIR) {2443/* The backend takes ownership of the NIR shader on state2444* creation.2445*/2446s = cso->ir.nir;2447} else {2448assert(cso->type == PIPE_SHADER_IR_TGSI);24492450if (vc4_debug & VC4_DEBUG_TGSI) {2451fprintf(stderr, "prog %d TGSI:\n",2452so->program_id);2453tgsi_dump(cso->tokens, 0);2454fprintf(stderr, "\n");2455}2456s = tgsi_to_nir(cso->tokens, pctx->screen, false);2457}24582459if (s->info.stage == MESA_SHADER_VERTEX)2460NIR_PASS_V(s, nir_lower_point_size, 1.0f, 0.0f);24612462NIR_PASS_V(s, nir_lower_io,2463nir_var_shader_in | nir_var_shader_out | nir_var_uniform,2464type_size, (nir_lower_io_options)0);24652466NIR_PASS_V(s, nir_lower_regs_to_ssa);2467NIR_PASS_V(s, nir_normalize_cubemap_coords);24682469NIR_PASS_V(s, nir_lower_load_const_to_scalar);24702471vc4_optimize_nir(s);24722473NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);24742475/* Garbage collect dead instructions */2476nir_sweep(s);24772478so->base.type = PIPE_SHADER_IR_NIR;2479so->base.ir.nir = s;24802481if (vc4_debug & VC4_DEBUG_NIR) {2482fprintf(stderr, "%s prog %d NIR:\n",2483gl_shader_stage_name(s->info.stage),2484so->program_id);2485nir_print_shader(s, stderr);2486fprintf(stderr, "\n");2487}24882489return so;2490}24912492static void2493copy_uniform_state_to_shader(struct vc4_compiled_shader *shader,2494struct vc4_compile *c)2495{2496int count = c->num_uniforms;2497struct vc4_shader_uniform_info *uinfo = &shader->uniforms;24982499uinfo->count = count;2500uinfo->data = ralloc_array(shader, uint32_t, count);2501memcpy(uinfo->data, c->uniform_data,2502count * sizeof(*uinfo->data));2503uinfo->contents = ralloc_array(shader, enum quniform_contents, count);2504memcpy(uinfo->contents, c->uniform_contents,2505count * sizeof(*uinfo->contents));2506uinfo->num_texture_samples = c->num_texture_samples;25072508vc4_set_shader_uniform_dirty_flags(shader);2509}25102511static void2512vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c,2513struct vc4_compiled_shader *shader)2514{2515struct vc4_fs_inputs inputs;25162517memset(&inputs, 0, sizeof(inputs));2518inputs.input_slots = ralloc_array(shader,2519struct vc4_varying_slot,2520c->num_input_slots);25212522bool input_live[c->num_input_slots];25232524memset(input_live, 0, sizeof(input_live));2525qir_for_each_inst_inorder(inst, c) {2526for (int i = 0; i < qir_get_nsrc(inst); i++) {2527if (inst->src[i].file == QFILE_VARY)2528input_live[inst->src[i].index] = true;2529}2530}25312532for (int i = 0; i < c->num_input_slots; i++) {2533struct vc4_varying_slot *slot = &c->input_slots[i];25342535if (!input_live[i])2536continue;25372538/* Skip non-VS-output inputs. */2539if (slot->slot == (uint8_t)~0)2540continue;25412542if (slot->slot == VARYING_SLOT_COL0 ||2543slot->slot == VARYING_SLOT_COL1 ||2544slot->slot == VARYING_SLOT_BFC0 ||2545slot->slot == VARYING_SLOT_BFC1) {2546shader->color_inputs |= (1 << inputs.num_inputs);2547}25482549inputs.input_slots[inputs.num_inputs] = *slot;2550inputs.num_inputs++;2551}2552shader->num_inputs = inputs.num_inputs;25532554/* Add our set of inputs to the set of all inputs seen. This way, we2555* can have a single pointer that identifies an FS inputs set,2556* allowing VS to avoid recompiling when the FS is recompiled (or a2557* new one is bound using separate shader objects) but the inputs2558* don't change.2559*/2560struct set_entry *entry = _mesa_set_search(vc4->fs_inputs_set, &inputs);2561if (entry) {2562shader->fs_inputs = entry->key;2563ralloc_free(inputs.input_slots);2564} else {2565struct vc4_fs_inputs *alloc_inputs;25662567alloc_inputs = rzalloc(vc4->fs_inputs_set, struct vc4_fs_inputs);2568memcpy(alloc_inputs, &inputs, sizeof(inputs));2569ralloc_steal(alloc_inputs, inputs.input_slots);2570_mesa_set_add(vc4->fs_inputs_set, alloc_inputs);25712572shader->fs_inputs = alloc_inputs;2573}2574}25752576static struct vc4_compiled_shader *2577vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,2578struct vc4_key *key)2579{2580struct hash_table *ht;2581uint32_t key_size;2582bool try_threading;25832584if (stage == QSTAGE_FRAG) {2585ht = vc4->fs_cache;2586key_size = sizeof(struct vc4_fs_key);2587try_threading = vc4->screen->has_threaded_fs;2588} else {2589ht = vc4->vs_cache;2590key_size = sizeof(struct vc4_vs_key);2591try_threading = false;2592}25932594struct vc4_compiled_shader *shader;2595struct hash_entry *entry = _mesa_hash_table_search(ht, key);2596if (entry)2597return entry->data;25982599struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key, try_threading);2600/* If the FS failed to compile threaded, fall back to single threaded. */2601if (try_threading && c->failed) {2602qir_compile_destroy(c);2603c = vc4_shader_ntq(vc4, stage, key, false);2604}26052606shader = rzalloc(NULL, struct vc4_compiled_shader);26072608shader->program_id = vc4->next_compiled_program_id++;2609if (stage == QSTAGE_FRAG) {2610vc4_setup_compiled_fs_inputs(vc4, c, shader);26112612/* Note: the temporary clone in c->s has been freed. */2613nir_shader *orig_shader = key->shader_state->base.ir.nir;2614if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH))2615shader->disable_early_z = true;2616} else {2617shader->num_inputs = c->num_inputs;26182619shader->vattr_offsets[0] = 0;2620for (int i = 0; i < 8; i++) {2621shader->vattr_offsets[i + 1] =2622shader->vattr_offsets[i] + c->vattr_sizes[i];26232624if (c->vattr_sizes[i])2625shader->vattrs_live |= (1 << i);2626}2627}26282629shader->failed = c->failed;2630if (c->failed) {2631shader->failed = true;2632} else {2633copy_uniform_state_to_shader(shader, c);2634shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts,2635c->qpu_inst_count *2636sizeof(uint64_t));2637}26382639shader->fs_threaded = c->fs_threaded;26402641if ((vc4_debug & VC4_DEBUG_SHADERDB) && stage == QSTAGE_FRAG) {2642fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d FS threads\n",2643qir_get_stage_name(c->stage),2644c->program_id, c->variant_id,26451 + shader->fs_threaded);2646}26472648qir_compile_destroy(c);26492650struct vc4_key *dup_key;2651dup_key = rzalloc_size(shader, key_size); /* TODO: don't use rzalloc */2652memcpy(dup_key, key, key_size);2653_mesa_hash_table_insert(ht, dup_key, shader);26542655return shader;2656}26572658static void2659vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,2660struct vc4_texture_stateobj *texstate)2661{2662for (int i = 0; i < texstate->num_textures; i++) {2663struct pipe_sampler_view *sampler = texstate->textures[i];2664struct vc4_sampler_view *vc4_sampler = vc4_sampler_view(sampler);2665struct pipe_sampler_state *sampler_state =2666texstate->samplers[i];26672668if (!sampler)2669continue;26702671key->tex[i].format = sampler->format;2672key->tex[i].swizzle[0] = sampler->swizzle_r;2673key->tex[i].swizzle[1] = sampler->swizzle_g;2674key->tex[i].swizzle[2] = sampler->swizzle_b;2675key->tex[i].swizzle[3] = sampler->swizzle_a;26762677if (sampler->texture->nr_samples > 1) {2678key->tex[i].msaa_width = sampler->texture->width0;2679key->tex[i].msaa_height = sampler->texture->height0;2680} else if (sampler){2681key->tex[i].compare_mode = sampler_state->compare_mode;2682key->tex[i].compare_func = sampler_state->compare_func;2683key->tex[i].wrap_s = sampler_state->wrap_s;2684key->tex[i].wrap_t = sampler_state->wrap_t;2685key->tex[i].force_first_level =2686vc4_sampler->force_first_level;2687}2688}26892690key->ucp_enables = vc4->rasterizer->base.clip_plane_enable;2691}26922693static void2694vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode)2695{2696struct vc4_job *job = vc4->job;2697struct vc4_fs_key local_key;2698struct vc4_fs_key *key = &local_key;26992700if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |2701VC4_DIRTY_BLEND |2702VC4_DIRTY_FRAMEBUFFER |2703VC4_DIRTY_ZSA |2704VC4_DIRTY_RASTERIZER |2705VC4_DIRTY_SAMPLE_MASK |2706VC4_DIRTY_FRAGTEX |2707VC4_DIRTY_UNCOMPILED_FS |2708VC4_DIRTY_UBO_1_SIZE))) {2709return;2710}27112712memset(key, 0, sizeof(*key));2713vc4_setup_shared_key(vc4, &key->base, &vc4->fragtex);2714key->base.shader_state = vc4->prog.bind_fs;2715key->is_points = (prim_mode == PIPE_PRIM_POINTS);2716key->is_lines = (prim_mode >= PIPE_PRIM_LINES &&2717prim_mode <= PIPE_PRIM_LINE_STRIP);2718key->blend = vc4->blend->rt[0];2719if (vc4->blend->logicop_enable) {2720key->logicop_func = vc4->blend->logicop_func;2721} else {2722key->logicop_func = PIPE_LOGICOP_COPY;2723}2724if (job->msaa) {2725key->msaa = vc4->rasterizer->base.multisample;2726key->sample_coverage = (vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1);2727key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage;2728key->sample_alpha_to_one = vc4->blend->alpha_to_one;2729}27302731if (vc4->framebuffer.cbufs[0])2732key->color_format = vc4->framebuffer.cbufs[0]->format;27332734key->stencil_enabled = vc4->zsa->stencil_uniforms[0] != 0;2735key->stencil_twoside = vc4->zsa->stencil_uniforms[1] != 0;2736key->stencil_full_writemasks = vc4->zsa->stencil_uniforms[2] != 0;2737key->depth_enabled = (vc4->zsa->base.depth_enabled ||2738key->stencil_enabled);27392740if (key->is_points) {2741key->point_sprite_mask =2742vc4->rasterizer->base.sprite_coord_enable;2743key->point_coord_upper_left =2744(vc4->rasterizer->base.sprite_coord_mode ==2745PIPE_SPRITE_COORD_UPPER_LEFT);2746}27472748key->ubo_1_size = vc4->constbuf[PIPE_SHADER_FRAGMENT].cb[1].buffer_size;27492750struct vc4_compiled_shader *old_fs = vc4->prog.fs;2751vc4->prog.fs = vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key->base);2752if (vc4->prog.fs == old_fs)2753return;27542755vc4->dirty |= VC4_DIRTY_COMPILED_FS;27562757if (vc4->rasterizer->base.flatshade &&2758(!old_fs || vc4->prog.fs->color_inputs != old_fs->color_inputs)) {2759vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS;2760}27612762if (!old_fs || vc4->prog.fs->fs_inputs != old_fs->fs_inputs)2763vc4->dirty |= VC4_DIRTY_FS_INPUTS;2764}27652766static void2767vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode)2768{2769struct vc4_vs_key local_key;2770struct vc4_vs_key *key = &local_key;27712772if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |2773VC4_DIRTY_RASTERIZER |2774VC4_DIRTY_VERTTEX |2775VC4_DIRTY_VTXSTATE |2776VC4_DIRTY_UNCOMPILED_VS |2777VC4_DIRTY_FS_INPUTS))) {2778return;2779}27802781memset(key, 0, sizeof(*key));2782vc4_setup_shared_key(vc4, &key->base, &vc4->verttex);2783key->base.shader_state = vc4->prog.bind_vs;2784key->fs_inputs = vc4->prog.fs->fs_inputs;27852786for (int i = 0; i < ARRAY_SIZE(key->attr_formats); i++)2787key->attr_formats[i] = vc4->vtx->pipe[i].src_format;27882789key->per_vertex_point_size =2790(prim_mode == PIPE_PRIM_POINTS &&2791vc4->rasterizer->base.point_size_per_vertex);27922793struct vc4_compiled_shader *vs =2794vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base);2795if (vs != vc4->prog.vs) {2796vc4->prog.vs = vs;2797vc4->dirty |= VC4_DIRTY_COMPILED_VS;2798}27992800key->is_coord = true;2801/* Coord shaders don't care what the FS inputs are. */2802key->fs_inputs = NULL;2803struct vc4_compiled_shader *cs =2804vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base);2805if (cs != vc4->prog.cs) {2806vc4->prog.cs = cs;2807vc4->dirty |= VC4_DIRTY_COMPILED_CS;2808}2809}28102811bool2812vc4_update_compiled_shaders(struct vc4_context *vc4, uint8_t prim_mode)2813{2814vc4_update_compiled_fs(vc4, prim_mode);2815vc4_update_compiled_vs(vc4, prim_mode);28162817return !(vc4->prog.cs->failed ||2818vc4->prog.vs->failed ||2819vc4->prog.fs->failed);2820}28212822static uint32_t2823fs_cache_hash(const void *key)2824{2825return _mesa_hash_data(key, sizeof(struct vc4_fs_key));2826}28272828static uint32_t2829vs_cache_hash(const void *key)2830{2831return _mesa_hash_data(key, sizeof(struct vc4_vs_key));2832}28332834static bool2835fs_cache_compare(const void *key1, const void *key2)2836{2837return memcmp(key1, key2, sizeof(struct vc4_fs_key)) == 0;2838}28392840static bool2841vs_cache_compare(const void *key1, const void *key2)2842{2843return memcmp(key1, key2, sizeof(struct vc4_vs_key)) == 0;2844}28452846static uint32_t2847fs_inputs_hash(const void *key)2848{2849const struct vc4_fs_inputs *inputs = key;28502851return _mesa_hash_data(inputs->input_slots,2852sizeof(*inputs->input_slots) *2853inputs->num_inputs);2854}28552856static bool2857fs_inputs_compare(const void *key1, const void *key2)2858{2859const struct vc4_fs_inputs *inputs1 = key1;2860const struct vc4_fs_inputs *inputs2 = key2;28612862return (inputs1->num_inputs == inputs2->num_inputs &&2863memcmp(inputs1->input_slots,2864inputs2->input_slots,2865sizeof(*inputs1->input_slots) *2866inputs1->num_inputs) == 0);2867}28682869static void2870delete_from_cache_if_matches(struct hash_table *ht,2871struct vc4_compiled_shader **last_compile,2872struct hash_entry *entry,2873struct vc4_uncompiled_shader *so)2874{2875const struct vc4_key *key = entry->key;28762877if (key->shader_state == so) {2878struct vc4_compiled_shader *shader = entry->data;2879_mesa_hash_table_remove(ht, entry);2880vc4_bo_unreference(&shader->bo);28812882if (shader == *last_compile)2883*last_compile = NULL;28842885ralloc_free(shader);2886}2887}28882889static void2890vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso)2891{2892struct vc4_context *vc4 = vc4_context(pctx);2893struct vc4_uncompiled_shader *so = hwcso;28942895hash_table_foreach(vc4->fs_cache, entry) {2896delete_from_cache_if_matches(vc4->fs_cache, &vc4->prog.fs,2897entry, so);2898}2899hash_table_foreach(vc4->vs_cache, entry) {2900delete_from_cache_if_matches(vc4->vs_cache, &vc4->prog.vs,2901entry, so);2902}29032904ralloc_free(so->base.ir.nir);2905free(so);2906}29072908static void2909vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso)2910{2911struct vc4_context *vc4 = vc4_context(pctx);2912vc4->prog.bind_fs = hwcso;2913vc4->dirty |= VC4_DIRTY_UNCOMPILED_FS;2914}29152916static void2917vc4_vp_state_bind(struct pipe_context *pctx, void *hwcso)2918{2919struct vc4_context *vc4 = vc4_context(pctx);2920vc4->prog.bind_vs = hwcso;2921vc4->dirty |= VC4_DIRTY_UNCOMPILED_VS;2922}29232924void2925vc4_program_init(struct pipe_context *pctx)2926{2927struct vc4_context *vc4 = vc4_context(pctx);29282929pctx->create_vs_state = vc4_shader_state_create;2930pctx->delete_vs_state = vc4_shader_state_delete;29312932pctx->create_fs_state = vc4_shader_state_create;2933pctx->delete_fs_state = vc4_shader_state_delete;29342935pctx->bind_fs_state = vc4_fp_state_bind;2936pctx->bind_vs_state = vc4_vp_state_bind;29372938vc4->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash,2939fs_cache_compare);2940vc4->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash,2941vs_cache_compare);2942vc4->fs_inputs_set = _mesa_set_create(pctx, fs_inputs_hash,2943fs_inputs_compare);2944}29452946void2947vc4_program_fini(struct pipe_context *pctx)2948{2949struct vc4_context *vc4 = vc4_context(pctx);29502951hash_table_foreach(vc4->fs_cache, entry) {2952struct vc4_compiled_shader *shader = entry->data;2953vc4_bo_unreference(&shader->bo);2954ralloc_free(shader);2955_mesa_hash_table_remove(vc4->fs_cache, entry);2956}29572958hash_table_foreach(vc4->vs_cache, entry) {2959struct vc4_compiled_shader *shader = entry->data;2960vc4_bo_unreference(&shader->bo);2961ralloc_free(shader);2962_mesa_hash_table_remove(vc4->vs_cache, entry);2963}2964}296529662967