Path: blob/21.2-virgl/src/broadcom/compiler/nir_to_vir.c
/*
 * Copyright © 2016 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>
#include "util/format/u_format.h"
#include "util/u_helpers.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/ralloc.h"
#include "util/hash_table.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"

/* We don't do any address packing. */
#define __gen_user_data void
#define __gen_address_type uint32_t
#define __gen_address_offset(reloc) (*reloc)
#define __gen_emit_reloc(cl, reloc)
#include "cle/v3d_packet_v41_pack.h"

#define GENERAL_TMU_LOOKUP_PER_QUAD         (0 << 7)
#define GENERAL_TMU_LOOKUP_PER_PIXEL        (1 << 7)
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I      (0 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I     (1 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC2        (2 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC3        (3 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC4        (4 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI     (5 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI    (6 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI    (7 << 0)
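/* Annotation: these lookup bits are combined with the TMU op into the
 * per-access config word built by ntq_emit_tmu_general() further down,
 * roughly:
 *
 *     config = 0xffffff00 | tmu_op << 3 | perquad;
 *     config |= GENERAL_TMU_LOOKUP_TYPE_*;
 *
 * i.e. the lookup type goes in the low bits, the per-quad/per-pixel flag
 * in bit 7, and the TMU op in between.
 */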

#define V3D_TSY_SET_QUORUM          0
#define V3D_TSY_INC_WAITERS         1
#define V3D_TSY_DEC_WAITERS         2
#define V3D_TSY_INC_QUORUM          3
#define V3D_TSY_DEC_QUORUM          4
#define V3D_TSY_FREE_ALL            5
#define V3D_TSY_RELEASE             6
#define V3D_TSY_ACQUIRE             7
#define V3D_TSY_WAIT                8
#define V3D_TSY_WAIT_INC            9
#define V3D_TSY_WAIT_CHECK          10
#define V3D_TSY_WAIT_INC_CHECK      11
#define V3D_TSY_WAIT_CV             12
#define V3D_TSY_INC_SEMAPHORE       13
#define V3D_TSY_DEC_SEMAPHORE       14
#define V3D_TSY_SET_QUORUM_FREE_ALL 15

enum v3d_tmu_op_type
{
        V3D_TMU_OP_TYPE_REGULAR,
        V3D_TMU_OP_TYPE_ATOMIC,
        V3D_TMU_OP_TYPE_CACHE
};

static enum v3d_tmu_op_type
v3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write)
{
        switch(tmu_op) {
        case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH:
        case V3D_TMU_OP_WRITE_SUB_READ_CLEAR:
        case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH:
        case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH:
        case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR:
                return is_write ? V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE;
        case V3D_TMU_OP_WRITE_UMAX:
        case V3D_TMU_OP_WRITE_SMIN:
        case V3D_TMU_OP_WRITE_SMAX:
                assert(is_write);
                FALLTHROUGH;
        case V3D_TMU_OP_WRITE_AND_READ_INC:
        case V3D_TMU_OP_WRITE_OR_READ_DEC:
        case V3D_TMU_OP_WRITE_XOR_READ_NOT:
                return V3D_TMU_OP_TYPE_ATOMIC;
        case V3D_TMU_OP_REGULAR:
                return V3D_TMU_OP_TYPE_REGULAR;

        default:
                unreachable("Unknown tmu_op\n");
        }
}

static void
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);

static void
resize_qreg_array(struct v3d_compile *c,
                  struct qreg **regs,
                  uint32_t *size,
                  uint32_t decl_size)
{
        if (*size >= decl_size)
                return;

        uint32_t old_size = *size;
        *size = MAX2(*size * 2, decl_size);
        *regs = reralloc(c, *regs, struct qreg, *size);
        if (!*regs) {
                fprintf(stderr, "Malloc failure\n");
                abort();
        }

        for (uint32_t i = old_size; i < *size; i++)
                (*regs)[i] = c->undef;
}

static void
resize_interp_array(struct v3d_compile *c,
                    struct v3d_interp_input **regs,
                    uint32_t *size,
                    uint32_t decl_size)
{
        if (*size >= decl_size)
                return;

        uint32_t old_size = *size;
        *size = MAX2(*size * 2, decl_size);
        *regs = reralloc(c, *regs, struct v3d_interp_input, *size);
        if (!*regs) {
                fprintf(stderr, "Malloc failure\n");
                abort();
        }

        for (uint32_t i = old_size; i < *size; i++) {
                (*regs)[i].vp = c->undef;
                (*regs)[i].C = c->undef;
        }
}

void
vir_emit_thrsw(struct v3d_compile *c)
{
        if (c->threads == 1)
                return;

        /* Always thread switch after each texture operation for now.
         *
         * We could do better by batching a bunch of texture fetches up and
         * then doing one thread switch and collecting all their results
         * afterward.
         */
        c->last_thrsw = vir_NOP(c);
        c->last_thrsw->qpu.sig.thrsw = true;
        c->last_thrsw_at_top_level = !c->in_control_flow;

        /* We need to lock the scoreboard before any TLB access happens. If
         * this thread switch comes after we have emitted a TLB load, then it
         * means that we can't lock on the last thread switch any more.
         */
        if (c->emitted_tlb_load)
                c->lock_scoreboard_on_first_thrsw = true;
}
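/* Note that this is also the thread switch used to cover general TMU
 * latency: ntq_flush_tmu() below emits a single thread switch through this
 * helper and only then drains the queued LDTMU/TMUWT results, so switches
 * are emitted per flush rather than per TMU access.
 */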

uint32_t
v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
{
        if (nir_src_is_const(instr->src[src])) {
                int64_t add_val = nir_src_as_int(instr->src[src]);
                if (add_val == 1)
                        return V3D_TMU_OP_WRITE_AND_READ_INC;
                else if (add_val == -1)
                        return V3D_TMU_OP_WRITE_OR_READ_DEC;
        }

        return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
}

static uint32_t
v3d_general_tmu_op(nir_intrinsic_instr *instr)
{
        switch (instr->intrinsic) {
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_uniform:
        case nir_intrinsic_load_shared:
        case nir_intrinsic_load_scratch:
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_shared:
        case nir_intrinsic_store_scratch:
                return V3D_TMU_OP_REGULAR;
        case nir_intrinsic_ssbo_atomic_add:
                return v3d_get_op_for_atomic_add(instr, 2);
        case nir_intrinsic_shared_atomic_add:
                return v3d_get_op_for_atomic_add(instr, 1);
        case nir_intrinsic_ssbo_atomic_imin:
        case nir_intrinsic_shared_atomic_imin:
                return V3D_TMU_OP_WRITE_SMIN;
        case nir_intrinsic_ssbo_atomic_umin:
        case nir_intrinsic_shared_atomic_umin:
                return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
        case nir_intrinsic_ssbo_atomic_imax:
        case nir_intrinsic_shared_atomic_imax:
                return V3D_TMU_OP_WRITE_SMAX;
        case nir_intrinsic_ssbo_atomic_umax:
        case nir_intrinsic_shared_atomic_umax:
                return V3D_TMU_OP_WRITE_UMAX;
        case nir_intrinsic_ssbo_atomic_and:
        case nir_intrinsic_shared_atomic_and:
                return V3D_TMU_OP_WRITE_AND_READ_INC;
        case nir_intrinsic_ssbo_atomic_or:
        case nir_intrinsic_shared_atomic_or:
                return V3D_TMU_OP_WRITE_OR_READ_DEC;
        case nir_intrinsic_ssbo_atomic_xor:
        case nir_intrinsic_shared_atomic_xor:
                return V3D_TMU_OP_WRITE_XOR_READ_NOT;
        case nir_intrinsic_ssbo_atomic_exchange:
        case nir_intrinsic_shared_atomic_exchange:
                return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
        case nir_intrinsic_ssbo_atomic_comp_swap:
        case nir_intrinsic_shared_atomic_comp_swap:
                return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
        default:
                unreachable("unknown intrinsic op");
        }
}

/**
 * Checks if pipelining a new TMU operation requiring 'components' LDTMUs
 * would overflow the Output TMU fifo.
 *
 * It is not allowed to overflow the Output fifo, but we can overflow the
 * Input and Config fifos. Doing that makes the shader stall, but only for
 * as long as it needs to be able to continue, so for pipelining it is
 * better to let the QPU stall on these when needed than to try to emit TMU
 * flushes in the driver.
 */
bool
ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
{
        if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
                return true;

        return components > 0 &&
               c->tmu.output_fifo_size + components > 16 / c->threads;
}
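/* Worked example of the check above: with c->threads == 2 each thread gets
 * 16 / 2 = 8 output FIFO entries, so a new 4-component load can only be
 * pipelined while c->tmu.output_fifo_size is at most 4; otherwise the
 * caller has to call ntq_flush_tmu() first.
 */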

/**
 * Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations,
 * popping all TMU fifo entries.
 */
void
ntq_flush_tmu(struct v3d_compile *c)
{
        if (c->tmu.flush_count == 0)
                return;

        vir_emit_thrsw(c);

        bool emitted_tmuwt = false;
        for (int i = 0; i < c->tmu.flush_count; i++) {
                if (c->tmu.flush[i].component_mask > 0) {
                        nir_dest *dest = c->tmu.flush[i].dest;
                        assert(dest);

                        for (int j = 0; j < 4; j++) {
                                if (c->tmu.flush[i].component_mask & (1 << j)) {
                                        ntq_store_dest(c, dest, j,
                                                       vir_MOV(c, vir_LDTMU(c)));
                                }
                        }
                } else if (!emitted_tmuwt) {
                        vir_TMUWT(c);
                        emitted_tmuwt = true;
                }
        }

        c->tmu.output_fifo_size = 0;
        c->tmu.flush_count = 0;
        _mesa_set_clear(c->tmu.outstanding_regs, NULL);
}

/**
 * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The
 * caller is responsible for ensuring that doing this doesn't overflow the
 * TMU fifos, and more specifically, the output fifo, since that can't stall.
 */
void
ntq_add_pending_tmu_flush(struct v3d_compile *c,
                          nir_dest *dest,
                          uint32_t component_mask)
{
        const uint32_t num_components = util_bitcount(component_mask);
        assert(!ntq_tmu_fifo_overflow(c, num_components));

        if (num_components > 0) {
                c->tmu.output_fifo_size += num_components;
                if (!dest->is_ssa)
                        _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg);
        }

        c->tmu.flush[c->tmu.flush_count].dest = dest;
        c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
        c->tmu.flush_count++;

        if (c->disable_tmu_pipelining)
                ntq_flush_tmu(c);
        else if (c->tmu.flush_count > 1)
                c->pipelined_any_tmu = true;
}
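/* ntq_emit_tmu_general() below runs its emission logic twice: a MODE_COUNT
 * pass that only counts the TMU register writes that will be needed (and
 * flushes any outstanding TMU work the new sequence depends on), followed by
 * a MODE_EMIT pass that emits the actual register writes. This keeps a flush
 * from landing in the middle of the TMU write sequence, which is illegal.
 */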

enum emit_mode {
        MODE_COUNT = 0,
        MODE_EMIT,
        MODE_LAST,
};

/**
 * For a TMU general store instruction:
 *
 * In MODE_COUNT mode, records the number of TMU writes required and flushes
 * any outstanding TMU operations the instruction depends on, but it doesn't
 * emit any actual register writes.
 *
 * In MODE_EMIT mode, emits the data register writes required by the
 * instruction.
 */
static void
emit_tmu_general_store_writes(struct v3d_compile *c,
                              enum emit_mode mode,
                              nir_intrinsic_instr *instr,
                              uint32_t base_const_offset,
                              uint32_t *writemask,
                              uint32_t *const_offset,
                              uint32_t *tmu_writes)
{
        struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);

        /* Find the first set of consecutive components that
         * are enabled in the writemask and emit the TMUD
         * instructions for them.
         */
        assert(*writemask != 0);
        uint32_t first_component = ffs(*writemask) - 1;
        uint32_t last_component = first_component;
        while (*writemask & BITFIELD_BIT(last_component + 1))
                last_component++;

        assert(first_component <= last_component &&
               last_component < instr->num_components);

        for (int i = first_component; i <= last_component; i++) {
                struct qreg data = ntq_get_src(c, instr->src[0], i);
                if (mode == MODE_COUNT)
                        (*tmu_writes)++;
                else
                        vir_MOV_dest(c, tmud, data);
        }

        if (mode == MODE_EMIT) {
                /* Update the offset for the TMU write based on the
                 * first component we are writing.
                 */
                *const_offset = base_const_offset + first_component * 4;

                /* Clear these components from the writemask */
                uint32_t written_mask =
                        BITFIELD_RANGE(first_component, *tmu_writes);
                (*writemask) &= ~written_mask;
        }
}

/**
 * For a TMU general atomic instruction:
 *
 * In MODE_COUNT mode, records the number of TMU writes required and flushes
 * any outstanding TMU operations the instruction depends on, but it doesn't
 * emit any actual register writes.
 *
 * In MODE_EMIT mode, emits the data register writes required by the
 * instruction.
 */
static void
emit_tmu_general_atomic_writes(struct v3d_compile *c,
                               enum emit_mode mode,
                               nir_intrinsic_instr *instr,
                               uint32_t tmu_op,
                               bool has_index,
                               uint32_t *tmu_writes)
{
        struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);

        struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0);
        if (mode == MODE_COUNT)
                (*tmu_writes)++;
        else
                vir_MOV_dest(c, tmud, data);

        if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
                data = ntq_get_src(c, instr->src[2 + has_index], 0);
                if (mode == MODE_COUNT)
                        (*tmu_writes)++;
                else
                        vir_MOV_dest(c, tmud, data);
        }
}

/**
 * For any TMU general instruction:
 *
 * In MODE_COUNT mode, records the number of TMU writes required to emit the
 * address parameter and flushes any outstanding TMU operations the
 * instruction depends on, but it doesn't emit any actual register writes.
 *
 * In MODE_EMIT mode, emits the register writes required to emit the address.
 */
static void
emit_tmu_general_address_write(struct v3d_compile *c,
                               enum emit_mode mode,
                               nir_intrinsic_instr *instr,
                               uint32_t config,
                               bool dynamic_src,
                               int offset_src,
                               struct qreg base_offset,
                               uint32_t const_offset,
                               uint32_t *tmu_writes)
{
        if (mode == MODE_COUNT) {
                (*tmu_writes)++;
                if (dynamic_src)
                        ntq_get_src(c, instr->src[offset_src], 0);
                return;
        }

        if (vir_in_nonuniform_control_flow(c)) {
                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
                           V3D_QPU_PF_PUSHZ);
        }

        struct qreg tmua;
        if (config == ~0)
                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
        else
                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);

        struct qinst *tmu;
        if (dynamic_src) {
                struct qreg offset = base_offset;
                if (const_offset != 0) {
                        offset = vir_ADD(c, offset,
                                         vir_uniform_ui(c, const_offset));
                }
                struct qreg data = ntq_get_src(c, instr->src[offset_src], 0);
                tmu = vir_ADD_dest(c, tmua, offset, data);
        } else {
                if (const_offset != 0) {
                        tmu = vir_ADD_dest(c, tmua, base_offset,
                                           vir_uniform_ui(c, const_offset));
                } else {
                        tmu = vir_MOV_dest(c, tmua, base_offset);
                }
        }

        if (config != ~0) {
                tmu->uniform =
                        vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
        }

        if (vir_in_nonuniform_control_flow(c))
                vir_set_cond(tmu, V3D_QPU_COND_IFA);
}

/**
 * Implements indirect uniform loads and SSBO accesses through the TMU general
 * memory access interface.
 */
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                     bool is_shared_or_scratch)
{
        uint32_t tmu_op = v3d_general_tmu_op(instr);

        /* If we were able to replace an atomic_add with an inc/dec, then we
         * need to do things slightly differently, like not loading the
         * amount to add/sub, as that is implicit.
         */
        bool atomic_add_replaced =
                ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
                  instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
                 (tmu_op ==
V3D_TMU_OP_WRITE_AND_READ_INC ||500tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));501502bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||503instr->intrinsic == nir_intrinsic_store_scratch ||504instr->intrinsic == nir_intrinsic_store_shared);505506bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||507instr->intrinsic == nir_intrinsic_load_ubo ||508instr->intrinsic == nir_intrinsic_load_ssbo ||509instr->intrinsic == nir_intrinsic_load_scratch ||510instr->intrinsic == nir_intrinsic_load_shared);511512if (!is_load)513c->tmu_dirty_rcl = true;514515bool has_index = !is_shared_or_scratch;516517int offset_src;518if (instr->intrinsic == nir_intrinsic_load_uniform) {519offset_src = 0;520} else if (instr->intrinsic == nir_intrinsic_load_ssbo ||521instr->intrinsic == nir_intrinsic_load_ubo ||522instr->intrinsic == nir_intrinsic_load_scratch ||523instr->intrinsic == nir_intrinsic_load_shared ||524atomic_add_replaced) {525offset_src = 0 + has_index;526} else if (is_store) {527offset_src = 1 + has_index;528} else {529offset_src = 0 + has_index;530}531532bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);533uint32_t const_offset = 0;534if (!dynamic_src)535const_offset = nir_src_as_uint(instr->src[offset_src]);536537struct qreg base_offset;538if (instr->intrinsic == nir_intrinsic_load_uniform) {539const_offset += nir_intrinsic_base(instr);540base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,541v3d_unit_data_create(0, const_offset));542const_offset = 0;543} else if (instr->intrinsic == nir_intrinsic_load_ubo) {544uint32_t index = nir_src_as_uint(instr->src[0]);545/* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index546* shifted up by 1 (0 is gallium's constant buffer 0).547*/548if (c->key->environment == V3D_ENVIRONMENT_OPENGL)549index++;550551base_offset =552vir_uniform(c, QUNIFORM_UBO_ADDR,553v3d_unit_data_create(index, const_offset));554const_offset = 0;555} else if (is_shared_or_scratch) {556/* Shared and scratch variables have no buffer index, and all557* start from a common base that we set up at the start of558* dispatch.559*/560if (instr->intrinsic == nir_intrinsic_load_scratch ||561instr->intrinsic == nir_intrinsic_store_scratch) {562base_offset = c->spill_base;563} else {564base_offset = c->cs_shared_offset;565const_offset += nir_intrinsic_base(instr);566}567} else {568base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,569nir_src_as_uint(instr->src[is_store ?5701 : 0]));571}572573/* We are ready to emit TMU register writes now, but before we actually574* emit them we need to flush outstanding TMU operations if any of our575* writes reads from the result of an outstanding TMU operation before576* we start the TMU sequence for this operation, since otherwise the577* flush could happen in the middle of the TMU sequence we are about to578* emit, which is illegal. To do this we run this logic twice, the579* first time it will count required register writes and flush pending580* TMU requests if necessary due to a dependency, and the second one581* will emit the actual TMU writes.582*/583const uint32_t dest_components = nir_intrinsic_dest_components(instr);584uint32_t base_const_offset = const_offset;585uint32_t writemask = is_store ? 
nir_intrinsic_write_mask(instr) : 0;586uint32_t tmu_writes = 0;587for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {588assert(mode == MODE_COUNT || tmu_writes > 0);589590if (is_store) {591emit_tmu_general_store_writes(c, mode, instr,592base_const_offset,593&writemask,594&const_offset,595&tmu_writes);596} else if (!is_load && !atomic_add_replaced) {597emit_tmu_general_atomic_writes(c, mode, instr,598tmu_op, has_index,599&tmu_writes);600}601602/* For atomics we use 32bit except for CMPXCHG, that we need603* to use VEC2. For the rest of the cases we use the number of604* tmud writes we did to decide the type. For cache operations605* the type is ignored.606*/607uint32_t config = 0;608if (mode == MODE_EMIT) {609uint32_t num_components;610if (is_load || atomic_add_replaced) {611num_components = instr->num_components;612} else {613assert(tmu_writes > 0);614num_components = tmu_writes - 1;615}616bool is_atomic =617v3d_tmu_get_type_from_op(tmu_op, !is_load) ==618V3D_TMU_OP_TYPE_ATOMIC;619620uint32_t perquad =621is_load && !vir_in_nonuniform_control_flow(c)622? GENERAL_TMU_LOOKUP_PER_QUAD623: GENERAL_TMU_LOOKUP_PER_PIXEL;624config = 0xffffff00 | tmu_op << 3 | perquad;625626if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {627config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;628} else if (is_atomic || num_components == 1) {629config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;630} else {631config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +632num_components - 2;633}634}635636emit_tmu_general_address_write(c, mode, instr, config,637dynamic_src, offset_src,638base_offset, const_offset,639&tmu_writes);640641assert(tmu_writes > 0);642if (mode == MODE_COUNT) {643/* Make sure we won't exceed the 16-entry TMU644* fifo if each thread is storing at the same645* time.646*/647while (tmu_writes > 16 / c->threads)648c->threads /= 2;649650/* If pipelining this TMU operation would651* overflow TMU fifos, we need to flush.652*/653if (ntq_tmu_fifo_overflow(c, dest_components))654ntq_flush_tmu(c);655} else {656/* Delay emission of the thread switch and657* LDTMU/TMUWT until we really need to do it to658* improve pipelining.659*/660const uint32_t component_mask =661(1 << dest_components) - 1;662ntq_add_pending_tmu_flush(c, &instr->dest,663component_mask);664}665}666667/* nir_lower_wrmasks should've ensured that any writemask on a store668* operation only has consecutive bits set, in which case we should've669* processed the full writemask above.670*/671assert(writemask == 0);672}673674static struct qreg *675ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)676{677struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,678def->num_components);679_mesa_hash_table_insert(c->def_ht, def, qregs);680return qregs;681}682683static bool684is_ld_signal(const struct v3d_qpu_sig *sig)685{686return (sig->ldunif ||687sig->ldunifa ||688sig->ldunifrf ||689sig->ldunifarf ||690sig->ldtmu ||691sig->ldvary ||692sig->ldvpm ||693sig->ldtlb ||694sig->ldtlbu);695}696697static inline bool698is_ldunif_signal(const struct v3d_qpu_sig *sig)699{700return sig->ldunif || sig->ldunifrf;701}702703/**704* This function is responsible for getting VIR results into the associated705* storage for a NIR instruction.706*707* If it's a NIR SSA def, then we just set the associated hash table entry to708* the new result.709*710* If it's a NIR reg, then we need to update the existing qreg assigned to the711* NIR destination with the incoming value. 
To do that without introducing712* new MOVs, we require that the incoming qreg either be a uniform, or be713* SSA-defined by the previous VIR instruction in the block and rewritable by714* this function. That lets us sneak ahead and insert the SF flag beforehand715* (knowing that the previous instruction doesn't depend on flags) and rewrite716* its destination to be the NIR reg's destination717*/718void719ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,720struct qreg result)721{722struct qinst *last_inst = NULL;723if (!list_is_empty(&c->cur_block->instructions))724last_inst = (struct qinst *)c->cur_block->instructions.prev;725726bool is_reused_uniform =727is_ldunif_signal(&c->defs[result.index]->qpu.sig) &&728last_inst != c->defs[result.index];729730assert(result.file == QFILE_TEMP && last_inst &&731(last_inst == c->defs[result.index] || is_reused_uniform));732733if (dest->is_ssa) {734assert(chan < dest->ssa.num_components);735736struct qreg *qregs;737struct hash_entry *entry =738_mesa_hash_table_search(c->def_ht, &dest->ssa);739740if (entry)741qregs = entry->data;742else743qregs = ntq_init_ssa_def(c, &dest->ssa);744745qregs[chan] = result;746} else {747nir_register *reg = dest->reg.reg;748assert(dest->reg.base_offset == 0);749assert(reg->num_array_elems == 0);750struct hash_entry *entry =751_mesa_hash_table_search(c->def_ht, reg);752struct qreg *qregs = entry->data;753754/* If the previous instruction can't be predicated for755* the store into the nir_register, then emit a MOV756* that can be.757*/758if (is_reused_uniform ||759(vir_in_nonuniform_control_flow(c) &&760is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig))) {761result = vir_MOV(c, result);762last_inst = c->defs[result.index];763}764765/* We know they're both temps, so just rewrite index. 
*/766c->defs[last_inst->dst.index] = NULL;767last_inst->dst.index = qregs[chan].index;768769/* If we're in control flow, then make this update of the reg770* conditional on the execution mask.771*/772if (vir_in_nonuniform_control_flow(c)) {773last_inst->dst.index = qregs[chan].index;774775/* Set the flags to the current exec mask.776*/777c->cursor = vir_before_inst(last_inst);778vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),779V3D_QPU_PF_PUSHZ);780c->cursor = vir_after_inst(last_inst);781782vir_set_cond(last_inst, V3D_QPU_COND_IFA);783}784}785}786787/**788* This looks up the qreg associated with a particular ssa/reg used as a source789* in any instruction.790*791* It is expected that the definition for any NIR value read as a source has792* been emitted by a previous instruction, however, in the case of TMU793* operations we may have postponed emission of the thread switch and LDTMUs794* required to read the TMU results until the results are actually used to795* improve pipelining, which then would lead to us not finding them here796* (for SSA defs) or finding them in the list of registers awaiting a TMU flush797* (for registers), meaning that we need to flush outstanding TMU operations798* to read the correct value.799*/800struct qreg801ntq_get_src(struct v3d_compile *c, nir_src src, int i)802{803struct hash_entry *entry;804if (src.is_ssa) {805assert(i < src.ssa->num_components);806807entry = _mesa_hash_table_search(c->def_ht, src.ssa);808if (!entry) {809ntq_flush_tmu(c);810entry = _mesa_hash_table_search(c->def_ht, src.ssa);811}812} else {813nir_register *reg = src.reg.reg;814assert(reg->num_array_elems == 0);815assert(src.reg.base_offset == 0);816assert(i < reg->num_components);817818if (_mesa_set_search(c->tmu.outstanding_regs, reg))819ntq_flush_tmu(c);820entry = _mesa_hash_table_search(c->def_ht, reg);821}822assert(entry);823824struct qreg *qregs = entry->data;825return qregs[i];826}827828static struct qreg829ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,830unsigned src)831{832assert(util_is_power_of_two_or_zero(instr->dest.write_mask));833unsigned chan = ffs(instr->dest.write_mask) - 1;834struct qreg r = ntq_get_src(c, instr->src[src].src,835instr->src[src].swizzle[chan]);836837assert(!instr->src[src].abs);838assert(!instr->src[src].negate);839840return r;841};842843static struct qreg844ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)845{846return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));847}848849static void850ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)851{852unsigned unit = instr->texture_index;853int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);854int dest_size = nir_tex_instr_dest_size(instr);855856struct qreg lod = c->undef;857if (lod_index != -1)858lod = ntq_get_src(c, instr->src[lod_index].src, 0);859860for (int i = 0; i < dest_size; i++) {861assert(i < 3);862enum quniform_contents contents;863864if (instr->is_array && i == dest_size - 1)865contents = QUNIFORM_TEXTURE_ARRAY_SIZE;866else867contents = QUNIFORM_TEXTURE_WIDTH + i;868869struct qreg size = vir_uniform(c, contents, unit);870871switch (instr->sampler_dim) {872case GLSL_SAMPLER_DIM_1D:873case GLSL_SAMPLER_DIM_2D:874case GLSL_SAMPLER_DIM_MS:875case GLSL_SAMPLER_DIM_3D:876case GLSL_SAMPLER_DIM_CUBE:877case GLSL_SAMPLER_DIM_BUF:878/* Don't minify the array size. 
*/879if (!(instr->is_array && i == dest_size - 1)) {880size = ntq_minify(c, size, lod);881}882break;883884case GLSL_SAMPLER_DIM_RECT:885/* There's no LOD field for rects */886break;887888default:889unreachable("Bad sampler type");890}891892ntq_store_dest(c, &instr->dest, i, size);893}894}895896static void897ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)898{899unsigned unit = instr->texture_index;900901/* Since each texture sampling op requires uploading uniforms to902* reference the texture, there's no HW support for texture size and903* you just upload uniforms containing the size.904*/905switch (instr->op) {906case nir_texop_query_levels:907ntq_store_dest(c, &instr->dest, 0,908vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));909return;910case nir_texop_texture_samples:911ntq_store_dest(c, &instr->dest, 0,912vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));913return;914case nir_texop_txs:915ntq_emit_txs(c, instr);916return;917default:918break;919}920921if (c->devinfo->ver >= 40)922v3d40_vir_emit_tex(c, instr);923else924v3d33_vir_emit_tex(c, instr);925}926927static struct qreg928ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)929{930struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));931if (is_cos)932input = vir_FADD(c, input, vir_uniform_f(c, 0.5));933934struct qreg periods = vir_FROUND(c, input);935struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));936return vir_XOR(c, sin_output, vir_SHL(c,937vir_FTOIN(c, periods),938vir_uniform_ui(c, -1)));939}940941static struct qreg942ntq_fsign(struct v3d_compile *c, struct qreg src)943{944struct qreg t = vir_get_temp(c);945946vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));947vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);948vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));949vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);950vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));951return vir_MOV(c, t);952}953954static void955emit_fragcoord_input(struct v3d_compile *c, int attr)956{957c->inputs[attr * 4 + 0] = vir_FXCD(c);958c->inputs[attr * 4 + 1] = vir_FYCD(c);959c->inputs[attr * 4 + 2] = c->payload_z;960c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);961}962963static struct qreg964emit_smooth_varying(struct v3d_compile *c,965struct qreg vary, struct qreg w, struct qreg r5)966{967return vir_FADD(c, vir_FMUL(c, vary, w), r5);968}969970static struct qreg971emit_noperspective_varying(struct v3d_compile *c,972struct qreg vary, struct qreg r5)973{974return vir_FADD(c, vir_MOV(c, vary), r5);975}976977static struct qreg978emit_flat_varying(struct v3d_compile *c,979struct qreg vary, struct qreg r5)980{981vir_MOV_dest(c, c->undef, vary);982return vir_MOV(c, r5);983}984985static struct qreg986emit_fragment_varying(struct v3d_compile *c, nir_variable *var,987int8_t input_idx, uint8_t swizzle, int array_index)988{989struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);990struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);991992struct qinst *ldvary = NULL;993struct qreg vary;994if (c->devinfo->ver >= 41) {995ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,996c->undef, c->undef);997ldvary->qpu.sig.ldvary = true;998vary = vir_emit_def(c, ldvary);999} else {1000vir_NOP(c)->qpu.sig.ldvary = true;1001vary = r3;1002}10031004/* Store the input value before interpolation so we can implement1005* GLSL's interpolateAt functions if the shader uses them.1006*/1007if (input_idx >= 0) {1008assert(var);1009c->interp[input_idx].vp = 
vary;1010c->interp[input_idx].C = vir_MOV(c, r5);1011c->interp[input_idx].mode = var->data.interpolation;1012}10131014/* For gl_PointCoord input or distance along a line, we'll be called1015* with no nir_variable, and we don't count toward VPM size so we1016* don't track an input slot.1017*/1018if (!var) {1019assert(input_idx < 0);1020return emit_smooth_varying(c, vary, c->payload_w, r5);1021}10221023int i = c->num_inputs++;1024c->input_slots[i] =1025v3d_slot_from_slot_and_component(var->data.location +1026array_index, swizzle);10271028struct qreg result;1029switch (var->data.interpolation) {1030case INTERP_MODE_NONE:1031case INTERP_MODE_SMOOTH:1032if (var->data.centroid) {1033BITSET_SET(c->centroid_flags, i);1034result = emit_smooth_varying(c, vary,1035c->payload_w_centroid, r5);1036} else {1037result = emit_smooth_varying(c, vary, c->payload_w, r5);1038}1039break;10401041case INTERP_MODE_NOPERSPECTIVE:1042BITSET_SET(c->noperspective_flags, i);1043result = emit_noperspective_varying(c, vary, r5);1044break;10451046case INTERP_MODE_FLAT:1047BITSET_SET(c->flat_shade_flags, i);1048result = emit_flat_varying(c, vary, r5);1049break;10501051default:1052unreachable("Bad interp mode");1053}10541055if (input_idx >= 0)1056c->inputs[input_idx] = result;1057return result;1058}10591060static void1061emit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var,1062int array_index, unsigned nelem)1063{1064for (int i = 0; i < nelem ; i++) {1065int chan = var->data.location_frac + i;1066int input_idx = (base_attr + array_index) * 4 + chan;1067emit_fragment_varying(c, var, input_idx, chan, array_index);1068}1069}10701071static void1072emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,1073int array_index)1074{1075/* Compact variables are scalar arrays where each set of 4 elements1076* consumes a single location.1077*/1078int loc_offset = array_index / 4;1079int chan = var->data.location_frac + array_index % 4;1080int input_idx = (attr + loc_offset) * 4 + chan;1081emit_fragment_varying(c, var, input_idx, chan, loc_offset);1082}10831084static void1085add_output(struct v3d_compile *c,1086uint32_t decl_offset,1087uint8_t slot,1088uint8_t swizzle)1089{1090uint32_t old_array_size = c->outputs_array_size;1091resize_qreg_array(c, &c->outputs, &c->outputs_array_size,1092decl_offset + 1);10931094if (old_array_size != c->outputs_array_size) {1095c->output_slots = reralloc(c,1096c->output_slots,1097struct v3d_varying_slot,1098c->outputs_array_size);1099}11001101c->output_slots[decl_offset] =1102v3d_slot_from_slot_and_component(slot, swizzle);1103}11041105/**1106* If compare_instr is a valid comparison instruction, emits the1107* compare_instr's comparison and returns the sel_instr's return value based1108* on the compare_instr's result.1109*/1110static bool1111ntq_emit_comparison(struct v3d_compile *c,1112nir_alu_instr *compare_instr,1113enum v3d_qpu_cond *out_cond)1114{1115struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);1116struct qreg src1;1117if (nir_op_infos[compare_instr->op].num_inputs > 1)1118src1 = ntq_get_alu_src(c, compare_instr, 1);1119bool cond_invert = false;1120struct qreg nop = vir_nop_reg();11211122switch (compare_instr->op) {1123case nir_op_feq32:1124case nir_op_seq:1125vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);1126break;1127case nir_op_ieq32:1128vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);1129break;11301131case nir_op_fneu32:1132case nir_op_sne:1133vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), 
V3D_QPU_PF_PUSHZ);1134cond_invert = true;1135break;1136case nir_op_ine32:1137vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);1138cond_invert = true;1139break;11401141case nir_op_fge32:1142case nir_op_sge:1143vir_set_pf(c, vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);1144break;1145case nir_op_ige32:1146vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);1147cond_invert = true;1148break;1149case nir_op_uge32:1150vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);1151cond_invert = true;1152break;11531154case nir_op_slt:1155case nir_op_flt32:1156vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);1157break;1158case nir_op_ilt32:1159vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);1160break;1161case nir_op_ult32:1162vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);1163break;11641165case nir_op_i2b32:1166vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);1167cond_invert = true;1168break;11691170case nir_op_f2b32:1171vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);1172cond_invert = true;1173break;11741175default:1176return false;1177}11781179*out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;11801181return true;1182}11831184/* Finds an ALU instruction that generates our src value that could1185* (potentially) be greedily emitted in the consuming instruction.1186*/1187static struct nir_alu_instr *1188ntq_get_alu_parent(nir_src src)1189{1190if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)1191return NULL;1192nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);1193if (!instr)1194return NULL;11951196/* If the ALU instr's srcs are non-SSA, then we would have to avoid1197* moving emission of the ALU instr down past another write of the1198* src.1199*/1200for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {1201if (!instr->src[i].src.is_ssa)1202return NULL;1203}12041205return instr;1206}12071208/* Turns a NIR bool into a condition code to predicate on. */1209static enum v3d_qpu_cond1210ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)1211{1212struct qreg qsrc = ntq_get_src(c, src, 0);1213/* skip if we already have src in the flags */1214if (qsrc.file == QFILE_TEMP && c->flags_temp == qsrc.index)1215return c->flags_cond;12161217nir_alu_instr *compare = ntq_get_alu_parent(src);1218if (!compare)1219goto out;12201221enum v3d_qpu_cond cond;1222if (ntq_emit_comparison(c, compare, &cond))1223return cond;12241225out:12261227vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),1228V3D_QPU_PF_PUSHZ);1229return V3D_QPU_COND_IFNA;1230}12311232static struct qreg1233ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)1234{1235struct qreg result =1236vir_MOV(c, vir_SEL(c, cond,1237vir_uniform_ui(c, ~0),1238vir_uniform_ui(c, 0)));1239c->flags_temp = result.index;1240c->flags_cond = cond;1241return result;1242}12431244static void1245ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)1246{1247/* This should always be lowered to ALU operations for V3D. 
*/1248assert(!instr->dest.saturate);12491250/* Vectors are special in that they have non-scalarized writemasks,1251* and just take the first swizzle channel for each argument in order1252* into each writemask channel.1253*/1254if (instr->op == nir_op_vec2 ||1255instr->op == nir_op_vec3 ||1256instr->op == nir_op_vec4) {1257struct qreg srcs[4];1258for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)1259srcs[i] = ntq_get_src(c, instr->src[i].src,1260instr->src[i].swizzle[0]);1261for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)1262ntq_store_dest(c, &instr->dest.dest, i,1263vir_MOV(c, srcs[i]));1264return;1265}12661267/* General case: We can just grab the one used channel per src. */1268struct qreg src[nir_op_infos[instr->op].num_inputs];1269for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {1270src[i] = ntq_get_alu_src(c, instr, i);1271}12721273struct qreg result;12741275switch (instr->op) {1276case nir_op_mov:1277result = vir_MOV(c, src[0]);1278break;12791280case nir_op_fneg:1281result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));1282break;1283case nir_op_ineg:1284result = vir_NEG(c, src[0]);1285break;12861287case nir_op_fmul:1288result = vir_FMUL(c, src[0], src[1]);1289break;1290case nir_op_fadd:1291result = vir_FADD(c, src[0], src[1]);1292break;1293case nir_op_fsub:1294result = vir_FSUB(c, src[0], src[1]);1295break;1296case nir_op_fmin:1297result = vir_FMIN(c, src[0], src[1]);1298break;1299case nir_op_fmax:1300result = vir_FMAX(c, src[0], src[1]);1301break;13021303case nir_op_f2i32: {1304nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);1305if (src0_alu && src0_alu->op == nir_op_fround_even) {1306result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));1307} else {1308result = vir_FTOIZ(c, src[0]);1309}1310break;1311}13121313case nir_op_f2u32:1314result = vir_FTOUZ(c, src[0]);1315break;1316case nir_op_i2f32:1317result = vir_ITOF(c, src[0]);1318break;1319case nir_op_u2f32:1320result = vir_UTOF(c, src[0]);1321break;1322case nir_op_b2f32:1323result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));1324break;1325case nir_op_b2i32:1326result = vir_AND(c, src[0], vir_uniform_ui(c, 1));1327break;13281329case nir_op_iadd:1330result = vir_ADD(c, src[0], src[1]);1331break;1332case nir_op_ushr:1333result = vir_SHR(c, src[0], src[1]);1334break;1335case nir_op_isub:1336result = vir_SUB(c, src[0], src[1]);1337break;1338case nir_op_ishr:1339result = vir_ASR(c, src[0], src[1]);1340break;1341case nir_op_ishl:1342result = vir_SHL(c, src[0], src[1]);1343break;1344case nir_op_imin:1345result = vir_MIN(c, src[0], src[1]);1346break;1347case nir_op_umin:1348result = vir_UMIN(c, src[0], src[1]);1349break;1350case nir_op_imax:1351result = vir_MAX(c, src[0], src[1]);1352break;1353case nir_op_umax:1354result = vir_UMAX(c, src[0], src[1]);1355break;1356case nir_op_iand:1357result = vir_AND(c, src[0], src[1]);1358break;1359case nir_op_ior:1360result = vir_OR(c, src[0], src[1]);1361break;1362case nir_op_ixor:1363result = vir_XOR(c, src[0], src[1]);1364break;1365case nir_op_inot:1366result = vir_NOT(c, src[0]);1367break;13681369case nir_op_ufind_msb:1370result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0]));1371break;13721373case nir_op_imul:1374result = vir_UMUL(c, src[0], src[1]);1375break;13761377case nir_op_seq:1378case nir_op_sne:1379case nir_op_sge:1380case nir_op_slt: {1381enum v3d_qpu_cond cond;1382ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);1383assert(ok);1384result = vir_MOV(c, vir_SEL(c, cond,1385vir_uniform_f(c, 1.0),1386vir_uniform_f(c, 
0.0)));1387c->flags_temp = result.index;1388c->flags_cond = cond;1389break;1390}13911392case nir_op_i2b32:1393case nir_op_f2b32:1394case nir_op_feq32:1395case nir_op_fneu32:1396case nir_op_fge32:1397case nir_op_flt32:1398case nir_op_ieq32:1399case nir_op_ine32:1400case nir_op_ige32:1401case nir_op_uge32:1402case nir_op_ilt32:1403case nir_op_ult32: {1404enum v3d_qpu_cond cond;1405ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);1406assert(ok);1407result = ntq_emit_cond_to_bool(c, cond);1408break;1409}14101411case nir_op_b32csel:1412result = vir_MOV(c,1413vir_SEL(c,1414ntq_emit_bool_to_cond(c, instr->src[0].src),1415src[1], src[2]));1416break;14171418case nir_op_fcsel:1419vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), src[0]),1420V3D_QPU_PF_PUSHZ);1421result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,1422src[1], src[2]));1423break;14241425case nir_op_frcp:1426result = vir_RECIP(c, src[0]);1427break;1428case nir_op_frsq:1429result = vir_RSQRT(c, src[0]);1430break;1431case nir_op_fexp2:1432result = vir_EXP(c, src[0]);1433break;1434case nir_op_flog2:1435result = vir_LOG(c, src[0]);1436break;14371438case nir_op_fceil:1439result = vir_FCEIL(c, src[0]);1440break;1441case nir_op_ffloor:1442result = vir_FFLOOR(c, src[0]);1443break;1444case nir_op_fround_even:1445result = vir_FROUND(c, src[0]);1446break;1447case nir_op_ftrunc:1448result = vir_FTRUNC(c, src[0]);1449break;14501451case nir_op_fsin:1452result = ntq_fsincos(c, src[0], false);1453break;1454case nir_op_fcos:1455result = ntq_fsincos(c, src[0], true);1456break;14571458case nir_op_fsign:1459result = ntq_fsign(c, src[0]);1460break;14611462case nir_op_fabs: {1463result = vir_FMOV(c, src[0]);1464vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);1465break;1466}14671468case nir_op_iabs:1469result = vir_MAX(c, src[0], vir_NEG(c, src[0]));1470break;14711472case nir_op_fddx:1473case nir_op_fddx_coarse:1474case nir_op_fddx_fine:1475result = vir_FDX(c, src[0]);1476break;14771478case nir_op_fddy:1479case nir_op_fddy_coarse:1480case nir_op_fddy_fine:1481result = vir_FDY(c, src[0]);1482break;14831484case nir_op_uadd_carry:1485vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),1486V3D_QPU_PF_PUSHC);1487result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);1488break;14891490case nir_op_pack_half_2x16_split:1491result = vir_VFPACK(c, src[0], src[1]);1492break;14931494case nir_op_unpack_half_2x16_split_x:1495result = vir_FMOV(c, src[0]);1496vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);1497break;14981499case nir_op_unpack_half_2x16_split_y:1500result = vir_FMOV(c, src[0]);1501vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);1502break;15031504case nir_op_fquantize2f16: {1505/* F32 -> F16 -> F32 conversion */1506struct qreg tmp = vir_FMOV(c, src[0]);1507vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L);1508tmp = vir_FMOV(c, tmp);1509vir_set_unpack(c->defs[tmp.index], 0, V3D_QPU_UNPACK_L);15101511/* Check for denorm */1512struct qreg abs_src = vir_FMOV(c, src[0]);1513vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS);1514struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14));1515vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold),1516V3D_QPU_PF_PUSHC);15171518/* Return +/-0 for denorms */1519struct qreg zero =1520vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000));1521result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));1522break;1523}15241525default:1526fprintf(stderr, "unknown NIR ALU inst: ");1527nir_print_instr(&instr->instr, stderr);1528fprintf(stderr, 
"\n");1529abort();1530}15311532/* We have a scalar result, so the instruction should only have a1533* single channel written to.1534*/1535assert(util_is_power_of_two_or_zero(instr->dest.write_mask));1536ntq_store_dest(c, &instr->dest.dest,1537ffs(instr->dest.write_mask) - 1, result);1538}15391540/* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit1541* specifier. They come from a register that's preloaded with 0xffffffff1542* (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low1543* 8 bits are shifted off the bottom and 0xff shifted in from the top.1544*/1545#define TLB_TYPE_F16_COLOR (3 << 6)1546#define TLB_TYPE_I32_COLOR (1 << 6)1547#define TLB_TYPE_F32_COLOR (0 << 6)1548#define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */1549#define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)1550#define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2)1551#define TLB_F16_SWAP_HI_LO (1 << 1)1552#define TLB_VEC_SIZE_4_F16 (1 << 0)1553#define TLB_VEC_SIZE_2_F16 (0 << 0)1554#define TLB_VEC_SIZE_MINUS_1_SHIFT 015551556/* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"1557* flag is set.1558*/1559#define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4))1560#define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */1561#define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */1562#define TLB_V42_DEPTH_TYPE_INVARIANT (0 << 3) /* Unmodified sideband input used */1563#define TLB_V42_DEPTH_TYPE_PER_PIXEL (1 << 3) /* QPU result used */15641565/* Stencil is a single 32-bit write. */1566#define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))15671568static void1569vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)1570{1571if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])1572return;15731574struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);1575struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);15761577nir_variable *var = c->output_color_var[rt];1578int num_components = glsl_get_vector_elements(var->type);1579uint32_t conf = 0xffffff00;1580struct qinst *inst;15811582conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :1583TLB_SAMPLE_MODE_PER_PIXEL;1584conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;15851586if (c->fs_key->swap_color_rb & (1 << rt))1587num_components = MAX2(num_components, 3);1588assert(num_components != 0);15891590enum glsl_base_type type = glsl_get_base_type(var->type);1591bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;1592bool is_32b_tlb_format = is_int_format ||1593(c->fs_key->f32_color_rb & (1 << rt));15941595if (is_int_format) {1596/* The F32 vs I32 distinction was dropped in 4.2. */1597if (c->devinfo->ver < 42)1598conf |= TLB_TYPE_I32_COLOR;1599else1600conf |= TLB_TYPE_F32_COLOR;1601conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);1602} else {1603if (c->fs_key->f32_color_rb & (1 << rt)) {1604conf |= TLB_TYPE_F32_COLOR;1605conf |= ((num_components - 1) <<1606TLB_VEC_SIZE_MINUS_1_SHIFT);1607} else {1608conf |= TLB_TYPE_F16_COLOR;1609conf |= TLB_F16_SWAP_HI_LO;1610if (num_components >= 3)1611conf |= TLB_VEC_SIZE_4_F16;1612else1613conf |= TLB_VEC_SIZE_2_F16;1614}1615}16161617int num_samples = c->msaa_per_sample_output ? 
V3D_MAX_SAMPLES : 1;1618for (int i = 0; i < num_samples; i++) {1619struct qreg *color = c->msaa_per_sample_output ?1620&c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :1621&c->outputs[var->data.driver_location * 4];16221623struct qreg r = color[0];1624struct qreg g = color[1];1625struct qreg b = color[2];1626struct qreg a = color[3];16271628if (c->fs_key->swap_color_rb & (1 << rt)) {1629r = color[2];1630b = color[0];1631}16321633if (c->fs_key->sample_alpha_to_one)1634a = vir_uniform_f(c, 1.0);16351636if (is_32b_tlb_format) {1637if (i == 0) {1638inst = vir_MOV_dest(c, tlbu_reg, r);1639inst->uniform =1640vir_get_uniform_index(c,1641QUNIFORM_CONSTANT,1642conf);1643} else {1644vir_MOV_dest(c, tlb_reg, r);1645}16461647if (num_components >= 2)1648vir_MOV_dest(c, tlb_reg, g);1649if (num_components >= 3)1650vir_MOV_dest(c, tlb_reg, b);1651if (num_components >= 4)1652vir_MOV_dest(c, tlb_reg, a);1653} else {1654inst = vir_VFPACK_dest(c, tlb_reg, r, g);1655if (conf != ~0 && i == 0) {1656inst->dst = tlbu_reg;1657inst->uniform =1658vir_get_uniform_index(c,1659QUNIFORM_CONSTANT,1660conf);1661}16621663if (num_components >= 3)1664vir_VFPACK_dest(c, tlb_reg, b, a);1665}1666}1667}16681669static void1670emit_frag_end(struct v3d_compile *c)1671{1672if (c->output_sample_mask_index != -1) {1673vir_SETMSF_dest(c, vir_nop_reg(),1674vir_AND(c,1675vir_MSF(c),1676c->outputs[c->output_sample_mask_index]));1677}16781679bool has_any_tlb_color_write = false;1680for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {1681if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])1682has_any_tlb_color_write = true;1683}16841685if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {1686struct nir_variable *var = c->output_color_var[0];1687struct qreg *color = &c->outputs[var->data.driver_location * 4];16881689vir_SETMSF_dest(c, vir_nop_reg(),1690vir_AND(c,1691vir_MSF(c),1692vir_FTOC(c, color[3])));1693}16941695struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);1696if (c->output_position_index != -1) {1697struct qinst *inst = vir_MOV_dest(c, tlbu_reg,1698c->outputs[c->output_position_index]);1699uint8_t tlb_specifier = TLB_TYPE_DEPTH;17001701if (c->devinfo->ver >= 42) {1702tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |1703TLB_SAMPLE_MODE_PER_PIXEL);1704} else1705tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;17061707inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,1708tlb_specifier |17090xffffff00);1710c->writes_z = true;1711} else if (c->s->info.fs.uses_discard ||1712!c->s->info.fs.early_fragment_tests ||1713c->fs_key->sample_alpha_to_coverage ||1714!has_any_tlb_color_write) {1715/* Emit passthrough Z if it needed to be delayed until shader1716* end due to potential discards.1717*1718* Since (single-threaded) fragment shaders always need a TLB1719* write, emit passthrouh Z if we didn't have any color1720* buffers and flag us as potentially discarding, so that we1721* can use Z as the TLB write.1722*/1723c->s->info.fs.uses_discard = true;17241725struct qinst *inst = vir_MOV_dest(c, tlbu_reg,1726vir_nop_reg());1727uint8_t tlb_specifier = TLB_TYPE_DEPTH;17281729if (c->devinfo->ver >= 42) {1730/* The spec says the PER_PIXEL flag is ignored for1731* invariant writes, but the simulator demands it.1732*/1733tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |1734TLB_SAMPLE_MODE_PER_PIXEL);1735} else {1736tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;1737}17381739inst->uniform = vir_get_uniform_index(c,1740QUNIFORM_CONSTANT,1741tlb_specifier |17420xffffff00);1743c->writes_z = true;1744}17451746/* XXX: Performance 
improvement: Merge Z write and color writes TLB1747* uniform setup1748*/1749for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)1750vir_emit_tlb_color_write(c, rt);1751}17521753static inline void1754vir_VPM_WRITE_indirect(struct v3d_compile *c,1755struct qreg val,1756struct qreg vpm_index,1757bool uniform_vpm_index)1758{1759assert(c->devinfo->ver >= 40);1760if (uniform_vpm_index)1761vir_STVPMV(c, vpm_index, val);1762else1763vir_STVPMD(c, vpm_index, val);1764}17651766static void1767vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)1768{1769if (c->devinfo->ver >= 40) {1770vir_VPM_WRITE_indirect(c, val,1771vir_uniform_ui(c, vpm_index), true);1772} else {1773/* XXX: v3d33_vir_vpm_write_setup(c); */1774vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);1775}1776}17771778static void1779emit_vert_end(struct v3d_compile *c)1780{1781/* GFXH-1684: VPM writes need to be complete by the end of the shader.1782*/1783if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)1784vir_VPMWT(c);1785}17861787static void1788emit_geom_end(struct v3d_compile *c)1789{1790/* GFXH-1684: VPM writes need to be complete by the end of the shader.1791*/1792if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)1793vir_VPMWT(c);1794}17951796static bool1797mem_vectorize_callback(unsigned align_mul, unsigned align_offset,1798unsigned bit_size,1799unsigned num_components,1800nir_intrinsic_instr *low,1801nir_intrinsic_instr *high,1802void *data)1803{1804/* Our backend is 32-bit only at present */1805if (bit_size != 32)1806return false;18071808if (align_mul % 4 != 0 || align_offset % 4 != 0)1809return false;18101811/* Vector accesses wrap at 16-byte boundaries so we can't vectorize1812* if the resulting vector crosses a 16-byte boundary.1813*/1814assert(util_is_power_of_two_nonzero(align_mul));1815align_mul = MIN2(align_mul, 16);1816align_offset &= 0xf;1817if (16 - align_mul + align_offset + num_components * 4 > 16)1818return false;18191820return true;1821}18221823void1824v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)1825{1826bool progress;1827unsigned lower_flrp =1828(s->options->lower_flrp16 ? 16 : 0) |1829(s->options->lower_flrp32 ? 32 : 0) |1830(s->options->lower_flrp64 ? 
64 : 0);18311832do {1833progress = false;18341835NIR_PASS_V(s, nir_lower_vars_to_ssa);1836NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);1837NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);1838NIR_PASS(progress, s, nir_copy_prop);1839NIR_PASS(progress, s, nir_opt_remove_phis);1840NIR_PASS(progress, s, nir_opt_dce);1841NIR_PASS(progress, s, nir_opt_dead_cf);1842NIR_PASS(progress, s, nir_opt_cse);1843NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);1844NIR_PASS(progress, s, nir_opt_algebraic);1845NIR_PASS(progress, s, nir_opt_constant_folding);18461847nir_load_store_vectorize_options vectorize_opts = {1848.modes = nir_var_mem_ssbo | nir_var_mem_ubo |1849nir_var_mem_push_const | nir_var_mem_shared |1850nir_var_mem_global,1851.callback = mem_vectorize_callback,1852.robust_modes = 0,1853};1854NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);18551856if (lower_flrp != 0) {1857bool lower_flrp_progress = false;18581859NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,1860lower_flrp,1861false /* always_precise */);1862if (lower_flrp_progress) {1863NIR_PASS(progress, s, nir_opt_constant_folding);1864progress = true;1865}18661867/* Nothing should rematerialize any flrps, so we only1868* need to do this lowering once.1869*/1870lower_flrp = 0;1871}18721873NIR_PASS(progress, s, nir_opt_undef);1874NIR_PASS(progress, s, nir_lower_undef_to_zero);18751876if (c && !c->disable_loop_unrolling &&1877s->options->max_unroll_iterations > 0) {1878bool local_progress = false;1879NIR_PASS(local_progress, s, nir_opt_loop_unroll,1880nir_var_shader_in |1881nir_var_function_temp);1882c->unrolled_any_loops |= local_progress;1883progress |= local_progress;1884}1885} while (progress);18861887nir_move_options sink_opts =1888nir_move_const_undef | nir_move_comparisons | nir_move_copies |1889nir_move_load_ubo;1890NIR_PASS(progress, s, nir_opt_sink, sink_opts);18911892NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo);1893}18941895static int1896driver_location_compare(const nir_variable *a, const nir_variable *b)1897{1898return a->data.driver_location == b->data.driver_location ?1899a->data.location_frac - b->data.location_frac :1900a->data.driver_location - b->data.driver_location;1901}19021903static struct qreg1904ntq_emit_vpm_read(struct v3d_compile *c,1905uint32_t *num_components_queued,1906uint32_t *remaining,1907uint32_t vpm_index)1908{1909struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);19101911if (c->devinfo->ver >= 40 ) {1912return vir_LDVPMV_IN(c,1913vir_uniform_ui(c,1914(*num_components_queued)++));1915}19161917if (*num_components_queued != 0) {1918(*num_components_queued)--;1919return vir_MOV(c, vpm);1920}19211922uint32_t num_components = MIN2(*remaining, 32);19231924v3d33_vir_vpm_read_setup(c, num_components);19251926*num_components_queued = num_components - 1;1927*remaining -= num_components;19281929return vir_MOV(c, vpm);1930}19311932static void1933ntq_setup_vs_inputs(struct v3d_compile *c)1934{1935/* Figure out how many components of each vertex attribute the shader1936* uses. Each variable should have been split to individual1937* components and unused ones DCEed. 
The vertex fetcher will load1938* from the start of the attribute to the number of components we1939* declare we need in c->vattr_sizes[].1940*1941* BGRA vertex attributes are a bit special: since we implement these1942* as RGBA swapping R/B components we always need at least 3 components1943* if component 0 is read.1944*/1945nir_foreach_shader_in_variable(var, c->s) {1946/* No VS attribute array support. */1947assert(MAX2(glsl_get_length(var->type), 1) == 1);19481949unsigned loc = var->data.driver_location;1950int start_component = var->data.location_frac;1951int num_components = glsl_get_components(var->type);19521953c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],1954start_component + num_components);19551956/* Handle BGRA inputs */1957if (start_component == 0 &&1958c->vs_key->va_swap_rb_mask & (1 << var->data.location)) {1959c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]);1960}1961}19621963unsigned num_components = 0;1964uint32_t vpm_components_queued = 0;1965bool uses_iid = BITSET_TEST(c->s->info.system_values_read,1966SYSTEM_VALUE_INSTANCE_ID) ||1967BITSET_TEST(c->s->info.system_values_read,1968SYSTEM_VALUE_INSTANCE_INDEX);1969bool uses_biid = BITSET_TEST(c->s->info.system_values_read,1970SYSTEM_VALUE_BASE_INSTANCE);1971bool uses_vid = BITSET_TEST(c->s->info.system_values_read,1972SYSTEM_VALUE_VERTEX_ID) ||1973BITSET_TEST(c->s->info.system_values_read,1974SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);19751976num_components += uses_iid;1977num_components += uses_biid;1978num_components += uses_vid;19791980for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++)1981num_components += c->vattr_sizes[i];19821983if (uses_iid) {1984c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,1985&num_components, ~0);1986}19871988if (uses_biid) {1989c->biid = ntq_emit_vpm_read(c, &vpm_components_queued,1990&num_components, ~0);1991}19921993if (uses_vid) {1994c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,1995&num_components, ~0);1996}19971998/* The actual loads will happen directly in nir_intrinsic_load_input1999* on newer versions.2000*/2001if (c->devinfo->ver >= 40)2002return;20032004for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {2005resize_qreg_array(c, &c->inputs, &c->inputs_array_size,2006(loc + 1) * 4);20072008for (int i = 0; i < c->vattr_sizes[loc]; i++) {2009c->inputs[loc * 4 + i] =2010ntq_emit_vpm_read(c,2011&vpm_components_queued,2012&num_components,2013loc * 4 + i);20142015}2016}20172018if (c->devinfo->ver >= 40) {2019assert(vpm_components_queued == num_components);2020} else {2021assert(vpm_components_queued == 0);2022assert(num_components == 0);2023}2024}20252026static bool2027program_reads_point_coord(struct v3d_compile *c)2028{2029nir_foreach_shader_in_variable(var, c->s) {2030if (util_varying_is_point_coord(var->data.location,2031c->fs_key->point_sprite_mask)) {2032return true;2033}2034}20352036return false;2037}20382039static void2040ntq_setup_gs_inputs(struct v3d_compile *c)2041{2042nir_sort_variables_with_modes(c->s, driver_location_compare,2043nir_var_shader_in);20442045nir_foreach_shader_in_variable(var, c->s) {2046/* All GS inputs are arrays with as many entries as vertices2047* in the input primitive, but here we only care about the2048* per-vertex input type.2049*/2050assert(glsl_type_is_array(var->type));2051const struct glsl_type *type = glsl_get_array_element(var->type);2052unsigned array_len = MAX2(glsl_get_length(type), 1);2053unsigned loc = var->data.driver_location;20542055resize_qreg_array(c, &c->inputs, &c->inputs_array_size,2056(loc + array_len) * 4);20572058if 
(var->data.compact) {2059for (unsigned j = 0; j < array_len; j++) {2060unsigned input_idx = c->num_inputs++;2061unsigned loc_frac = var->data.location_frac + j;2062unsigned loc = var->data.location + loc_frac / 4;2063unsigned comp = loc_frac % 4;2064c->input_slots[input_idx] =2065v3d_slot_from_slot_and_component(loc, comp);2066}2067continue;2068}20692070for (unsigned j = 0; j < array_len; j++) {2071unsigned num_elements = glsl_get_vector_elements(type);2072for (unsigned k = 0; k < num_elements; k++) {2073unsigned chan = var->data.location_frac + k;2074unsigned input_idx = c->num_inputs++;2075struct v3d_varying_slot slot =2076v3d_slot_from_slot_and_component(var->data.location + j, chan);2077c->input_slots[input_idx] = slot;2078}2079}2080}2081}208220832084static void2085ntq_setup_fs_inputs(struct v3d_compile *c)2086{2087nir_sort_variables_with_modes(c->s, driver_location_compare,2088nir_var_shader_in);20892090nir_foreach_shader_in_variable(var, c->s) {2091unsigned var_len = glsl_count_vec4_slots(var->type, false, false);2092unsigned loc = var->data.driver_location;20932094uint32_t inputs_array_size = c->inputs_array_size;2095uint32_t inputs_array_required_size = (loc + var_len) * 4;2096resize_qreg_array(c, &c->inputs, &c->inputs_array_size,2097inputs_array_required_size);2098resize_interp_array(c, &c->interp, &inputs_array_size,2099inputs_array_required_size);21002101if (var->data.location == VARYING_SLOT_POS) {2102emit_fragcoord_input(c, loc);2103} else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID &&2104!c->fs_key->has_gs) {2105/* If the fragment shader reads gl_PrimitiveID and we2106* don't have a geometry shader in the pipeline to write2107* it then we program the hardware to inject it as2108* an implicit varying. Take it from there.2109*/2110c->inputs[loc * 4] = c->primitive_id;2111} else if (util_varying_is_point_coord(var->data.location,2112c->fs_key->point_sprite_mask)) {2113c->inputs[loc * 4 + 0] = c->point_x;2114c->inputs[loc * 4 + 1] = c->point_y;2115} else if (var->data.compact) {2116for (int j = 0; j < var_len; j++)2117emit_compact_fragment_input(c, loc, var, j);2118} else if (glsl_type_is_struct(var->type)) {2119for (int j = 0; j < var_len; j++) {2120emit_fragment_input(c, loc, var, j, 4);2121}2122} else {2123for (int j = 0; j < var_len; j++) {2124emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type));2125}2126}2127}2128}21292130static void2131ntq_setup_outputs(struct v3d_compile *c)2132{2133if (c->s->info.stage != MESA_SHADER_FRAGMENT)2134return;21352136nir_foreach_shader_out_variable(var, c->s) {2137unsigned array_len = MAX2(glsl_get_length(var->type), 1);2138unsigned loc = var->data.driver_location * 4;21392140assert(array_len == 1);2141(void)array_len;21422143for (int i = 0; i < 4 - var->data.location_frac; i++) {2144add_output(c, loc + var->data.location_frac + i,2145var->data.location,2146var->data.location_frac + i);2147}21482149switch (var->data.location) {2150case FRAG_RESULT_COLOR:2151c->output_color_var[0] = var;2152c->output_color_var[1] = var;2153c->output_color_var[2] = var;2154c->output_color_var[3] = var;2155break;2156case FRAG_RESULT_DATA0:2157case FRAG_RESULT_DATA1:2158case FRAG_RESULT_DATA2:2159case FRAG_RESULT_DATA3:2160c->output_color_var[var->data.location -2161FRAG_RESULT_DATA0] = var;2162break;2163case FRAG_RESULT_DEPTH:2164c->output_position_index = loc;2165break;2166case FRAG_RESULT_SAMPLE_MASK:2167c->output_sample_mask_index = loc;2168break;2169}2170}2171}21722173/**2174* Sets up the mapping from nir_register to struct qreg 
*.2175*2176* Each nir_register gets a struct qreg per 32-bit component being stored.2177*/2178static void2179ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)2180{2181foreach_list_typed(nir_register, nir_reg, node, list) {2182unsigned array_len = MAX2(nir_reg->num_array_elems, 1);2183struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,2184array_len *2185nir_reg->num_components);21862187_mesa_hash_table_insert(c->def_ht, nir_reg, qregs);21882189for (int i = 0; i < array_len * nir_reg->num_components; i++)2190qregs[i] = vir_get_temp(c);2191}2192}21932194static void2195ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)2196{2197/* XXX perf: Experiment with using immediate loads to avoid having2198* these end up in the uniform stream. Watch out for breaking the2199* small immediates optimization in the process!2200*/2201struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);2202for (int i = 0; i < instr->def.num_components; i++)2203qregs[i] = vir_uniform_ui(c, instr->value[i].u32);22042205_mesa_hash_table_insert(c->def_ht, &instr->def, qregs);2206}22072208static void2209ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)2210{2211unsigned image_index = nir_src_as_uint(instr->src[0]);2212bool is_array = nir_intrinsic_image_array(instr);22132214assert(nir_src_as_uint(instr->src[1]) == 0);22152216ntq_store_dest(c, &instr->dest, 0,2217vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));2218if (instr->num_components > 1) {2219ntq_store_dest(c, &instr->dest, 1,2220vir_uniform(c,2221instr->num_components == 2 && is_array ?2222QUNIFORM_IMAGE_ARRAY_SIZE :2223QUNIFORM_IMAGE_HEIGHT,2224image_index));2225}2226if (instr->num_components > 2) {2227ntq_store_dest(c, &instr->dest, 2,2228vir_uniform(c,2229is_array ?2230QUNIFORM_IMAGE_ARRAY_SIZE :2231QUNIFORM_IMAGE_DEPTH,2232image_index));2233}2234}22352236static void2237vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)2238{2239assert(c->s->info.stage == MESA_SHADER_FRAGMENT);22402241int rt = nir_src_as_uint(instr->src[0]);2242assert(rt < V3D_MAX_DRAW_BUFFERS);22432244int sample_index = nir_intrinsic_base(instr) ;2245assert(sample_index < V3D_MAX_SAMPLES);22462247int component = nir_intrinsic_component(instr);2248assert(component < 4);22492250/* We need to emit our TLB reads after we have acquired the scoreboard2251* lock, or the GPU will hang. Usually, we do our scoreboard locking on2252* the last thread switch to improve parallelism, however, that is only2253* guaranteed to happen before the tlb color writes.2254*2255* To fix that, we make sure we always emit a thread switch before the2256* first tlb color read. 
If that happens to be the last thread switch
         * we emit, then everything is fine, but otherwise, if any code after
         * this point needs to emit additional thread switches, then we will
         * switch the strategy to locking the scoreboard on the first thread
         * switch instead -- see vir_emit_thrsw().
         */
        if (!c->emitted_tlb_load) {
                if (!c->last_thrsw_at_top_level) {
                        assert(c->devinfo->ver >= 41);
                        vir_emit_thrsw(c);
                }

                c->emitted_tlb_load = true;
        }

        struct qreg *color_reads_for_sample =
                &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];

        if (color_reads_for_sample[component].file == QFILE_NULL) {
                enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
                int num_components =
                        util_format_get_nr_components(rt_format);

                const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
                if (swap_rb)
                        num_components = MAX2(num_components, 3);

                nir_variable *var = c->output_color_var[rt];
                enum glsl_base_type type = glsl_get_base_type(var->type);

                bool is_int_format = type == GLSL_TYPE_INT ||
                                     type == GLSL_TYPE_UINT;

                bool is_32b_tlb_format = is_int_format ||
                                         (c->fs_key->f32_color_rb & (1 << rt));

                int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;

                uint32_t conf = 0xffffff00;
                conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
                                          TLB_SAMPLE_MODE_PER_PIXEL;
                conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;

                if (is_32b_tlb_format) {
                        /* The F32 vs I32 distinction was dropped in 4.2. */
                        conf |= (c->devinfo->ver < 42 && is_int_format) ?
                                TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;

                        conf |= ((num_components - 1) <<
                                 TLB_VEC_SIZE_MINUS_1_SHIFT);
                } else {
                        conf |= TLB_TYPE_F16_COLOR;
                        conf |= TLB_F16_SWAP_HI_LO;

                        if (num_components >= 3)
                                conf |= TLB_VEC_SIZE_4_F16;
                        else
                                conf |= TLB_VEC_SIZE_2_F16;
                }

                for (int i = 0; i < num_samples; i++) {
                        struct qreg r, g, b, a;
                        if (is_32b_tlb_format) {
                                r = conf != 0xffffffff && i == 0 ?
                                        vir_TLBU_COLOR_READ(c, conf) :
                                        vir_TLB_COLOR_READ(c);
                                if (num_components >= 2)
                                        g = vir_TLB_COLOR_READ(c);
                                if (num_components >= 3)
                                        b = vir_TLB_COLOR_READ(c);
                                if (num_components >= 4)
                                        a = vir_TLB_COLOR_READ(c);
                        } else {
                                struct qreg rg = conf != 0xffffffff && i == 0 ?
                                        vir_TLBU_COLOR_READ(c, conf) :
                                        vir_TLB_COLOR_READ(c);
                                r = vir_FMOV(c, rg);
                                vir_set_unpack(c->defs[r.index], 0,
                                               V3D_QPU_UNPACK_L);
                                g = vir_FMOV(c, rg);
                                vir_set_unpack(c->defs[g.index], 0,
                                               V3D_QPU_UNPACK_H);

                                if (num_components > 2) {
                                        struct qreg ba = vir_TLB_COLOR_READ(c);
                                        b = vir_FMOV(c, ba);
                                        vir_set_unpack(c->defs[b.index], 0,
                                                       V3D_QPU_UNPACK_L);
                                        a = vir_FMOV(c, ba);
                                        vir_set_unpack(c->defs[a.index], 0,
                                                       V3D_QPU_UNPACK_H);
                                }
                        }

                        struct qreg *color_reads =
                                &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];

                        color_reads[0] = swap_rb ? b : r;
                        if (num_components >= 2)
                                color_reads[1] = g;
                        if (num_components >= 3)
                                color_reads[2] = swap_rb ?
r : b;2359if (num_components >= 4)2360color_reads[3] = a;2361}2362}23632364assert(color_reads_for_sample[component].file != QFILE_NULL);2365ntq_store_dest(c, &instr->dest, 0,2366vir_MOV(c, color_reads_for_sample[component]));2367}23682369static void2370ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)2371{2372if (nir_src_is_const(instr->src[0])) {2373int offset = (nir_intrinsic_base(instr) +2374nir_src_as_uint(instr->src[0]));2375assert(offset % 4 == 0);2376/* We need dwords */2377offset = offset / 4;2378for (int i = 0; i < instr->num_components; i++) {2379ntq_store_dest(c, &instr->dest, i,2380vir_uniform(c, QUNIFORM_UNIFORM,2381offset + i));2382}2383} else {2384ntq_emit_tmu_general(c, instr, false);2385}2386}23872388static void2389ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)2390{2391/* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset).2392*2393* Right now the driver sets PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR even2394* if we don't support non-uniform offsets because we also set the2395* lower_all_io_to_temps option in the NIR compiler. This ensures that2396* any indirect indexing on in/out variables is turned into indirect2397* indexing on temporary variables instead, that we handle by lowering2398* to scratch. If we implement non-uniform offset here we might be able2399* to avoid the temp and scratch lowering, which involves copying from2400* the input to the temp variable, possibly making code more optimal.2401*/2402unsigned offset =2403nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);24042405if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) {2406/* Emit the LDVPM directly now, rather than at the top2407* of the shader like we did for V3D 3.x (which needs2408* vpmsetup when not just taking the next offset).2409*2410* Note that delaying like this may introduce stalls,2411* as LDVPMV takes a minimum of 1 instruction but may2412* be slower if the VPM unit is busy with another QPU.2413*/2414int index = 0;2415if (BITSET_TEST(c->s->info.system_values_read,2416SYSTEM_VALUE_INSTANCE_ID)) {2417index++;2418}2419if (BITSET_TEST(c->s->info.system_values_read,2420SYSTEM_VALUE_BASE_INSTANCE)) {2421index++;2422}2423if (BITSET_TEST(c->s->info.system_values_read,2424SYSTEM_VALUE_VERTEX_ID)) {2425index++;2426}2427for (int i = 0; i < offset; i++)2428index += c->vattr_sizes[i];2429index += nir_intrinsic_component(instr);2430for (int i = 0; i < instr->num_components; i++) {2431struct qreg vpm_offset = vir_uniform_ui(c, index++);2432ntq_store_dest(c, &instr->dest, i,2433vir_LDVPMV_IN(c, vpm_offset));2434}2435} else {2436for (int i = 0; i < instr->num_components; i++) {2437int comp = nir_intrinsic_component(instr) + i;2438ntq_store_dest(c, &instr->dest, i,2439vir_MOV(c, c->inputs[offset * 4 + comp]));2440}2441}2442}24432444static void2445ntq_emit_per_sample_color_write(struct v3d_compile *c,2446nir_intrinsic_instr *instr)2447{2448assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);24492450unsigned rt = nir_src_as_uint(instr->src[1]);2451assert(rt < V3D_MAX_DRAW_BUFFERS);24522453unsigned sample_idx = nir_intrinsic_base(instr);2454assert(sample_idx < V3D_MAX_SAMPLES);24552456unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;2457for (int i = 0; i < instr->num_components; i++) {2458c->sample_colors[offset + i] =2459vir_MOV(c, ntq_get_src(c, instr->src[0], i));2460}2461}24622463static void2464ntq_emit_color_write(struct v3d_compile *c,2465nir_intrinsic_instr *instr)2466{2467unsigned offset = 
(nir_intrinsic_base(instr) +2468nir_src_as_uint(instr->src[1])) * 4 +2469nir_intrinsic_component(instr);2470for (int i = 0; i < instr->num_components; i++) {2471c->outputs[offset + i] =2472vir_MOV(c, ntq_get_src(c, instr->src[0], i));2473}2474}24752476static void2477emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)2478{2479assert(instr->num_components == 1);24802481struct qreg offset = ntq_get_src(c, instr->src[1], 0);24822483uint32_t base_offset = nir_intrinsic_base(instr);24842485if (base_offset)2486offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset);24872488/* Usually, for VS or FS, we only emit outputs once at program end so2489* our VPM writes are never in non-uniform control flow, but this2490* is not true for GS, where we are emitting multiple vertices.2491*/2492if (vir_in_nonuniform_control_flow(c)) {2493vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),2494V3D_QPU_PF_PUSHZ);2495}24962497struct qreg val = ntq_get_src(c, instr->src[0], 0);24982499/* The offset isn’t necessarily dynamically uniform for a geometry2500* shader. This can happen if the shader sometimes doesn’t emit one of2501* the vertices. In that case subsequent vertices will be written to2502* different offsets in the VPM and we need to use the scatter write2503* instruction to have a different offset for each lane.2504*/2505bool is_uniform_offset =2506!vir_in_nonuniform_control_flow(c) &&2507!nir_src_is_divergent(instr->src[1]);2508vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);25092510if (vir_in_nonuniform_control_flow(c)) {2511struct qinst *last_inst =2512(struct qinst *)c->cur_block->instructions.prev;2513vir_set_cond(last_inst, V3D_QPU_COND_IFA);2514}2515}25162517static void2518emit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr)2519{2520assert(c->s->info.stage == MESA_SHADER_VERTEX);2521assert(instr->num_components == 1);25222523uint32_t base = nir_intrinsic_base(instr);2524struct qreg val = ntq_get_src(c, instr->src[0], 0);25252526if (nir_src_is_const(instr->src[1])) {2527vir_VPM_WRITE(c, val,2528base + nir_src_as_uint(instr->src[1]));2529} else {2530struct qreg offset = vir_ADD(c,2531ntq_get_src(c, instr->src[1], 1),2532vir_uniform_ui(c, base));2533bool is_uniform_offset =2534!vir_in_nonuniform_control_flow(c) &&2535!nir_src_is_divergent(instr->src[1]);2536vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);2537}2538}25392540static void2541ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)2542{2543if (c->s->info.stage == MESA_SHADER_FRAGMENT)2544ntq_emit_color_write(c, instr);2545else if (c->s->info.stage == MESA_SHADER_GEOMETRY)2546emit_store_output_gs(c, instr);2547else2548emit_store_output_vs(c, instr);2549}25502551/**2552* This implementation is based on v3d_sample_{x,y}_offset() from2553* v3d_sample_offset.h.2554*/2555static void2556ntq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx,2557struct qreg *sx, struct qreg *sy)2558{2559sample_idx = vir_ITOF(c, sample_idx);25602561struct qreg offset_x =2562vir_FADD(c, vir_uniform_f(c, -0.125f),2563vir_FMUL(c, sample_idx,2564vir_uniform_f(c, 0.5f)));2565vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(),2566vir_uniform_f(c, 2.0f), sample_idx),2567V3D_QPU_PF_PUSHC);2568offset_x = vir_SEL(c, V3D_QPU_COND_IFA,2569vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)),2570offset_x);25712572struct qreg offset_y =2573vir_FADD(c, vir_uniform_f(c, -0.375f),2574vir_FMUL(c, sample_idx,2575vir_uniform_f(c, 0.25f)));2576*sx = offset_x;2577*sy = offset_y;2578}25792580/**2581* This 
implementation is based on get_centroid_offset() from fep.c.2582*/2583static void2584ntq_get_barycentric_centroid(struct v3d_compile *c,2585struct qreg *out_x,2586struct qreg *out_y)2587{2588struct qreg sample_mask;2589if (c->output_sample_mask_index != -1)2590sample_mask = c->outputs[c->output_sample_mask_index];2591else2592sample_mask = vir_MSF(c);25932594struct qreg i0 = vir_uniform_ui(c, 0);2595struct qreg i1 = vir_uniform_ui(c, 1);2596struct qreg i2 = vir_uniform_ui(c, 2);2597struct qreg i3 = vir_uniform_ui(c, 3);2598struct qreg i4 = vir_uniform_ui(c, 4);2599struct qreg i8 = vir_uniform_ui(c, 8);26002601/* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */2602struct qreg F = vir_uniform_ui(c, 0);2603struct qreg T = vir_uniform_ui(c, ~0);2604struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1);2605vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);2606s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);2607struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2);2608vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);2609s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);2610struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4);2611vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);2612s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);2613struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8);2614vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ);2615s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);26162617/* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */2618struct qreg sample_idx = i3;2619vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);2620sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx);2621vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);2622sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx);2623vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);2624sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx);26252626/* Get offset at selected sample index */2627struct qreg offset_x, offset_y;2628ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);26292630/* Select pixel center [offset=(0,0)] if two opposing samples (or none)2631* are selected.2632*/2633struct qreg s0_and_s3 = vir_AND(c, s0, s3);2634struct qreg s1_and_s2 = vir_AND(c, s1, s2);26352636struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0));2637vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);2638use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F);2639use_center = vir_OR(c, use_center, s0_and_s3);2640use_center = vir_OR(c, use_center, s1_and_s2);26412642struct qreg zero = vir_uniform_f(c, 0.0f);2643vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);2644offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x);2645offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y);26462647*out_x = offset_x;2648*out_y = offset_y;2649}26502651static struct qreg2652ntq_emit_load_interpolated_input(struct v3d_compile *c,2653struct qreg p,2654struct qreg C,2655struct qreg offset_x,2656struct qreg offset_y,2657unsigned mode)2658{2659if (mode == INTERP_MODE_FLAT)2660return C;26612662struct qreg sample_offset_x =2663vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));2664struct qreg sample_offset_y =2665vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));26662667struct qreg scaleX =2668vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x),2669offset_x);2670struct qreg scaleY =2671vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), 
sample_offset_y),2672offset_y);26732674struct qreg pInterp =2675vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX),2676vir_FMUL(c, vir_FDY(c, p), scaleY)));26772678if (mode == INTERP_MODE_NOPERSPECTIVE)2679return vir_FADD(c, pInterp, C);26802681struct qreg w = c->payload_w;2682struct qreg wInterp =2683vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX),2684vir_FMUL(c, vir_FDY(c, w), scaleY)));26852686return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C);2687}26882689static void2690emit_ldunifa(struct v3d_compile *c, struct qreg *result)2691{2692struct qinst *ldunifa =2693vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);2694ldunifa->qpu.sig.ldunifa = true;2695if (result)2696*result = vir_emit_def(c, ldunifa);2697else2698vir_emit_nondef(c, ldunifa);2699c->current_unifa_offset += 4;2700}27012702static void2703ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)2704{2705/* Every ldunifa auto-increments the unifa address by 4 bytes, so our2706* current unifa offset is 4 bytes ahead of the offset of the last load.2707*/2708static const int32_t max_unifa_skip_dist =2709MAX_UNIFA_SKIP_DISTANCE - 4;27102711bool dynamic_src = !nir_src_is_const(instr->src[1]);2712uint32_t const_offset =2713dynamic_src ? 0 : nir_src_as_uint(instr->src[1]);27142715/* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index2716* shifted up by 1 (0 is gallium's constant buffer 0).2717*/2718uint32_t index = nir_src_as_uint(instr->src[0]);2719if (c->key->environment == V3D_ENVIRONMENT_OPENGL)2720index++;27212722/* We can only keep track of the last unifa address we used with2723* constant offset loads. If the new load targets the same UBO and2724* is close enough to the previous load, we can skip the unifa register2725* write by emitting dummy ldunifa instructions to update the unifa2726* address.2727*/2728bool skip_unifa = false;2729uint32_t ldunifa_skips = 0;2730if (dynamic_src) {2731c->current_unifa_block = NULL;2732} else if (c->cur_block == c->current_unifa_block &&2733c->current_unifa_index == index &&2734c->current_unifa_offset <= const_offset &&2735c->current_unifa_offset + max_unifa_skip_dist >= const_offset) {2736skip_unifa = true;2737ldunifa_skips = (const_offset - c->current_unifa_offset) / 4;2738} else {2739c->current_unifa_block = c->cur_block;2740c->current_unifa_index = index;2741c->current_unifa_offset = const_offset;2742}27432744if (!skip_unifa) {2745struct qreg base_offset =2746vir_uniform(c, QUNIFORM_UBO_ADDR,2747v3d_unit_data_create(index, const_offset));27482749struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);2750if (!dynamic_src) {2751vir_MOV_dest(c, unifa, base_offset);2752} else {2753vir_ADD_dest(c, unifa, base_offset,2754ntq_get_src(c, instr->src[1], 0));2755}2756} else {2757for (int i = 0; i < ldunifa_skips; i++)2758emit_ldunifa(c, NULL);2759}27602761for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) {2762struct qreg data;2763emit_ldunifa(c, &data);2764ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data));2765}2766}27672768static inline struct qreg2769emit_load_local_invocation_index(struct v3d_compile *c)2770{2771return vir_SHR(c, c->cs_payload[1],2772vir_uniform_ui(c, 32 - c->local_invocation_index_bits));2773}27742775/* Various subgroup operations rely on the A flags, so this helper ensures that2776* A flags represents currently active lanes in the subgroup.2777*/2778static void2779set_a_flags_for_subgroup(struct v3d_compile *c)2780{2781/* MSF returns 0 for disabled lanes in compute shaders so2782* PUSHZ will set A=1 for 
disabled lanes. We want the inverse2783* of this but we don't have any means to negate the A flags2784* directly, but we can do it by repeating the same operation2785* with NORZ (A = ~A & ~Z).2786*/2787assert(c->s->info.stage == MESA_SHADER_COMPUTE);2788vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);2789vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ);27902791/* If we are under non-uniform control flow we also need to2792* AND the A flags with the current execute mask.2793*/2794if (vir_in_nonuniform_control_flow(c)) {2795const uint32_t bidx = c->cur_block->index;2796vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(),2797c->execute,2798vir_uniform_ui(c, bidx)),2799V3D_QPU_UF_ANDZ);2800}2801}28022803static void2804ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)2805{2806switch (instr->intrinsic) {2807case nir_intrinsic_load_uniform:2808ntq_emit_load_uniform(c, instr);2809break;28102811case nir_intrinsic_load_ubo:2812if (!nir_src_is_divergent(instr->src[1]))2813ntq_emit_load_ubo_unifa(c, instr);2814else2815ntq_emit_tmu_general(c, instr, false);2816break;28172818case nir_intrinsic_ssbo_atomic_add:2819case nir_intrinsic_ssbo_atomic_imin:2820case nir_intrinsic_ssbo_atomic_umin:2821case nir_intrinsic_ssbo_atomic_imax:2822case nir_intrinsic_ssbo_atomic_umax:2823case nir_intrinsic_ssbo_atomic_and:2824case nir_intrinsic_ssbo_atomic_or:2825case nir_intrinsic_ssbo_atomic_xor:2826case nir_intrinsic_ssbo_atomic_exchange:2827case nir_intrinsic_ssbo_atomic_comp_swap:2828case nir_intrinsic_load_ssbo:2829case nir_intrinsic_store_ssbo:2830ntq_emit_tmu_general(c, instr, false);2831break;28322833case nir_intrinsic_shared_atomic_add:2834case nir_intrinsic_shared_atomic_imin:2835case nir_intrinsic_shared_atomic_umin:2836case nir_intrinsic_shared_atomic_imax:2837case nir_intrinsic_shared_atomic_umax:2838case nir_intrinsic_shared_atomic_and:2839case nir_intrinsic_shared_atomic_or:2840case nir_intrinsic_shared_atomic_xor:2841case nir_intrinsic_shared_atomic_exchange:2842case nir_intrinsic_shared_atomic_comp_swap:2843case nir_intrinsic_load_shared:2844case nir_intrinsic_store_shared:2845case nir_intrinsic_load_scratch:2846case nir_intrinsic_store_scratch:2847ntq_emit_tmu_general(c, instr, true);2848break;28492850case nir_intrinsic_image_load:2851case nir_intrinsic_image_store:2852case nir_intrinsic_image_atomic_add:2853case nir_intrinsic_image_atomic_imin:2854case nir_intrinsic_image_atomic_umin:2855case nir_intrinsic_image_atomic_imax:2856case nir_intrinsic_image_atomic_umax:2857case nir_intrinsic_image_atomic_and:2858case nir_intrinsic_image_atomic_or:2859case nir_intrinsic_image_atomic_xor:2860case nir_intrinsic_image_atomic_exchange:2861case nir_intrinsic_image_atomic_comp_swap:2862v3d40_vir_emit_image_load_store(c, instr);2863break;28642865case nir_intrinsic_get_ssbo_size:2866ntq_store_dest(c, &instr->dest, 0,2867vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,2868nir_src_comp_as_uint(instr->src[0], 0)));2869break;28702871case nir_intrinsic_get_ubo_size:2872ntq_store_dest(c, &instr->dest, 0,2873vir_uniform(c, QUNIFORM_GET_UBO_SIZE,2874nir_src_comp_as_uint(instr->src[0], 0)));2875break;28762877case nir_intrinsic_load_user_clip_plane:2878for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {2879ntq_store_dest(c, &instr->dest, i,2880vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,2881nir_intrinsic_ucp_id(instr) *28824 + i));2883}2884break;28852886case nir_intrinsic_load_viewport_x_scale:2887ntq_store_dest(c, &instr->dest, 0,2888vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 
0));2889break;28902891case nir_intrinsic_load_viewport_y_scale:2892ntq_store_dest(c, &instr->dest, 0,2893vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));2894break;28952896case nir_intrinsic_load_viewport_z_scale:2897ntq_store_dest(c, &instr->dest, 0,2898vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));2899break;29002901case nir_intrinsic_load_viewport_z_offset:2902ntq_store_dest(c, &instr->dest, 0,2903vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));2904break;29052906case nir_intrinsic_load_line_coord:2907ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x));2908break;29092910case nir_intrinsic_load_line_width:2911ntq_store_dest(c, &instr->dest, 0,2912vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));2913break;29142915case nir_intrinsic_load_aa_line_width:2916ntq_store_dest(c, &instr->dest, 0,2917vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));2918break;29192920case nir_intrinsic_load_sample_mask_in:2921ntq_store_dest(c, &instr->dest, 0, vir_MSF(c));2922break;29232924case nir_intrinsic_load_helper_invocation:2925vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);2926struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);2927ntq_store_dest(c, &instr->dest, 0, qdest);2928break;29292930case nir_intrinsic_load_front_face:2931/* The register contains 0 (front) or 1 (back), and we need to2932* turn it into a NIR bool where true means front.2933*/2934ntq_store_dest(c, &instr->dest, 0,2935vir_ADD(c,2936vir_uniform_ui(c, -1),2937vir_REVF(c)));2938break;29392940case nir_intrinsic_load_base_instance:2941ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid));2942break;29432944case nir_intrinsic_load_instance_id:2945ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));2946break;29472948case nir_intrinsic_load_vertex_id:2949ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));2950break;29512952case nir_intrinsic_load_tlb_color_v3d:2953vir_emit_tlb_color_read(c, instr);2954break;29552956case nir_intrinsic_load_input:2957ntq_emit_load_input(c, instr);2958break;29592960case nir_intrinsic_store_tlb_sample_color_v3d:2961ntq_emit_per_sample_color_write(c, instr);2962break;29632964case nir_intrinsic_store_output:2965ntq_emit_store_output(c, instr);2966break;29672968case nir_intrinsic_image_size:2969ntq_emit_image_size(c, instr);2970break;29712972case nir_intrinsic_discard:2973ntq_flush_tmu(c);29742975if (vir_in_nonuniform_control_flow(c)) {2976vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),2977V3D_QPU_PF_PUSHZ);2978vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),2979vir_uniform_ui(c, 0)),2980V3D_QPU_COND_IFA);2981} else {2982vir_SETMSF_dest(c, vir_nop_reg(),2983vir_uniform_ui(c, 0));2984}2985break;29862987case nir_intrinsic_discard_if: {2988ntq_flush_tmu(c);29892990enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);29912992if (vir_in_nonuniform_control_flow(c)) {2993struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),2994c->execute);2995if (cond == V3D_QPU_COND_IFA) {2996vir_set_uf(c, exec_flag, V3D_QPU_UF_ANDZ);2997} else {2998vir_set_uf(c, exec_flag, V3D_QPU_UF_NORNZ);2999cond = V3D_QPU_COND_IFA;3000}3001}30023003vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),3004vir_uniform_ui(c, 0)), cond);30053006break;3007}30083009case nir_intrinsic_memory_barrier:3010case nir_intrinsic_memory_barrier_buffer:3011case nir_intrinsic_memory_barrier_image:3012case nir_intrinsic_memory_barrier_shared:3013case nir_intrinsic_memory_barrier_tcs_patch:3014case nir_intrinsic_group_memory_barrier:3015/* We don't do any instruction scheduling of these NIR3016* instructions between each other, so we 
just need to make3017* sure that the TMU operations before the barrier are flushed3018* before the ones after the barrier.3019*/3020ntq_flush_tmu(c);3021break;30223023case nir_intrinsic_control_barrier:3024/* Emit a TSY op to get all invocations in the workgroup3025* (actually supergroup) to block until the last invocation3026* reaches the TSY op.3027*/3028ntq_flush_tmu(c);30293030if (c->devinfo->ver >= 42) {3031vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,3032V3D_QPU_WADDR_SYNCB));3033} else {3034struct qinst *sync =3035vir_BARRIERID_dest(c,3036vir_reg(QFILE_MAGIC,3037V3D_QPU_WADDR_SYNCU));3038sync->uniform =3039vir_get_uniform_index(c, QUNIFORM_CONSTANT,30400xffffff00 |3041V3D_TSY_WAIT_INC_CHECK);30423043}30443045/* The blocking of a TSY op only happens at the next thread3046* switch. No texturing may be outstanding at the time of a3047* TSY blocking operation.3048*/3049vir_emit_thrsw(c);3050break;30513052case nir_intrinsic_load_num_workgroups:3053for (int i = 0; i < 3; i++) {3054ntq_store_dest(c, &instr->dest, i,3055vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,3056i));3057}3058break;30593060case nir_intrinsic_load_workgroup_id: {3061struct qreg x = vir_AND(c, c->cs_payload[0],3062vir_uniform_ui(c, 0xffff));30633064struct qreg y = vir_SHR(c, c->cs_payload[0],3065vir_uniform_ui(c, 16));30663067struct qreg z = vir_AND(c, c->cs_payload[1],3068vir_uniform_ui(c, 0xffff));30693070/* We only support dispatch base in Vulkan */3071if (c->key->environment == V3D_ENVIRONMENT_VULKAN) {3072x = vir_ADD(c, x,3073vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0));3074y = vir_ADD(c, y,3075vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1));3076z = vir_ADD(c, z,3077vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2));3078}30793080ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x));3081ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y));3082ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z));3083break;3084}30853086case nir_intrinsic_load_local_invocation_index:3087ntq_store_dest(c, &instr->dest, 0,3088emit_load_local_invocation_index(c));3089break;30903091case nir_intrinsic_load_subgroup_id: {3092/* This is basically the batch index, which is the Local3093* Invocation Index divided by the SIMD width).3094*/3095STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS));3096const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;3097struct qreg lii = emit_load_local_invocation_index(c);3098ntq_store_dest(c, &instr->dest, 0,3099vir_SHR(c, lii,3100vir_uniform_ui(c, divide_shift)));3101break;3102}31033104case nir_intrinsic_load_per_vertex_input: {3105/* The vertex shader writes all its used outputs into3106* consecutive VPM offsets, so if any output component is3107* unused, its VPM offset is used by the next used3108* component. 
This means that we can't assume that each3109* location will use 4 consecutive scalar offsets in the VPM3110* and we need to compute the VPM offset for each input by3111* going through the inputs and finding the one that matches3112* our location and component.3113*3114* col: vertex index, row = varying index3115*/3116assert(nir_src_is_const(instr->src[1]));3117uint32_t location =3118nir_intrinsic_io_semantics(instr).location +3119nir_src_as_uint(instr->src[1]);3120uint32_t component = nir_intrinsic_component(instr);31213122int32_t row_idx = -1;3123for (int i = 0; i < c->num_inputs; i++) {3124struct v3d_varying_slot slot = c->input_slots[i];3125if (v3d_slot_get_slot(slot) == location &&3126v3d_slot_get_component(slot) == component) {3127row_idx = i;3128break;3129}3130}31313132assert(row_idx != -1);31333134struct qreg col = ntq_get_src(c, instr->src[0], 0);3135for (int i = 0; i < instr->num_components; i++) {3136struct qreg row = vir_uniform_ui(c, row_idx++);3137ntq_store_dest(c, &instr->dest, i,3138vir_LDVPMG_IN(c, row, col));3139}3140break;3141}31423143case nir_intrinsic_emit_vertex:3144case nir_intrinsic_end_primitive:3145unreachable("Should have been lowered in v3d_nir_lower_io");3146break;31473148case nir_intrinsic_load_primitive_id: {3149/* gl_PrimitiveIdIn is written by the GBG in the first word of3150* VPM output header. According to docs, we should read this3151* using ldvpm(v,d)_in (See Table 71).3152*/3153assert(c->s->info.stage == MESA_SHADER_GEOMETRY);3154ntq_store_dest(c, &instr->dest, 0,3155vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));3156break;3157}31583159case nir_intrinsic_load_invocation_id:3160ntq_store_dest(c, &instr->dest, 0, vir_IID(c));3161break;31623163case nir_intrinsic_load_fb_layers_v3d:3164ntq_store_dest(c, &instr->dest, 0,3165vir_uniform(c, QUNIFORM_FB_LAYERS, 0));3166break;31673168case nir_intrinsic_load_sample_id:3169ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c));3170break;31713172case nir_intrinsic_load_sample_pos:3173ntq_store_dest(c, &instr->dest, 0,3174vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));3175ntq_store_dest(c, &instr->dest, 1,3176vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));3177break;31783179case nir_intrinsic_load_barycentric_at_offset:3180ntq_store_dest(c, &instr->dest, 0,3181vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));3182ntq_store_dest(c, &instr->dest, 1,3183vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));3184break;31853186case nir_intrinsic_load_barycentric_pixel:3187ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));3188ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));3189break;31903191case nir_intrinsic_load_barycentric_at_sample: {3192if (!c->fs_key->msaa) {3193ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));3194ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));3195return;3196}31973198struct qreg offset_x, offset_y;3199struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0);3200ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);32013202ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));3203ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));3204break;3205}32063207case nir_intrinsic_load_barycentric_sample: {3208struct qreg offset_x =3209vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));3210struct qreg offset_y =3211vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));32123213ntq_store_dest(c, &instr->dest, 0,3214vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));3215ntq_store_dest(c, &instr->dest, 1,3216vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));3217break;3218}32193220case 
nir_intrinsic_load_barycentric_centroid: {3221struct qreg offset_x, offset_y;3222ntq_get_barycentric_centroid(c, &offset_x, &offset_y);3223ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));3224ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));3225break;3226}32273228case nir_intrinsic_load_interpolated_input: {3229assert(nir_src_is_const(instr->src[1]));3230const uint32_t offset = nir_src_as_uint(instr->src[1]);32313232for (int i = 0; i < instr->num_components; i++) {3233const uint32_t input_idx =3234(nir_intrinsic_base(instr) + offset) * 4 +3235nir_intrinsic_component(instr) + i;32363237/* If we are not in MSAA or if we are not interpolating3238* a user varying, just return the pre-computed3239* interpolated input.3240*/3241if (!c->fs_key->msaa ||3242c->interp[input_idx].vp.file == QFILE_NULL) {3243ntq_store_dest(c, &instr->dest, i,3244vir_MOV(c, c->inputs[input_idx]));3245continue;3246}32473248/* Otherwise compute interpolation at the specified3249* offset.3250*/3251struct qreg p = c->interp[input_idx].vp;3252struct qreg C = c->interp[input_idx].C;3253unsigned interp_mode = c->interp[input_idx].mode;32543255struct qreg offset_x = ntq_get_src(c, instr->src[0], 0);3256struct qreg offset_y = ntq_get_src(c, instr->src[0], 1);32573258struct qreg result =3259ntq_emit_load_interpolated_input(c, p, C,3260offset_x, offset_y,3261interp_mode);3262ntq_store_dest(c, &instr->dest, i, result);3263}3264break;3265}32663267case nir_intrinsic_load_subgroup_size:3268ntq_store_dest(c, &instr->dest, 0,3269vir_uniform_ui(c, V3D_CHANNELS));3270break;32713272case nir_intrinsic_load_subgroup_invocation:3273ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));3274break;32753276case nir_intrinsic_elect: {3277set_a_flags_for_subgroup(c);3278struct qreg first = vir_FLAFIRST(c);32793280/* Produce a boolean result from Flafirst */3281vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),3282first, vir_uniform_ui(c, 1)),3283V3D_QPU_PF_PUSHZ);3284struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);3285ntq_store_dest(c, &instr->dest, 0, result);3286break;3287}32883289case nir_intrinsic_load_num_subgroups:3290unreachable("Should have been lowered");3291break;32923293default:3294fprintf(stderr, "Unknown intrinsic: ");3295nir_print_instr(&instr->instr, stderr);3296fprintf(stderr, "\n");3297break;3298}3299}33003301/* Clears (activates) the execute flags for any channels whose jump target3302* matches this block.3303*3304* XXX perf: Could we be using flpush/flpop somehow for our execution channel3305* enabling?3306*3307*/3308static void3309ntq_activate_execute_for_block(struct v3d_compile *c)3310{3311vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),3312c->execute, vir_uniform_ui(c, c->cur_block->index)),3313V3D_QPU_PF_PUSHZ);33143315vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));3316}33173318static void3319ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)3320{3321nir_block *nir_else_block = nir_if_first_else_block(if_stmt);3322bool empty_else_block =3323(nir_else_block == nir_if_last_else_block(if_stmt) &&3324exec_list_is_empty(&nir_else_block->instr_list));33253326struct qblock *then_block = vir_new_block(c);3327struct qblock *after_block = vir_new_block(c);3328struct qblock *else_block;3329if (empty_else_block)3330else_block = after_block;3331else3332else_block = vir_new_block(c);33333334/* Check if this if statement is really just a conditional jump with3335* the form:3336*3337* if (cond) {3338* break/continue;3339* } else {3340* }3341*3342* In which case we can skip the jump to ELSE we 
emit before the THEN3343* block and instead just emit the break/continue directly.3344*/3345nir_jump_instr *conditional_jump = NULL;3346if (empty_else_block) {3347nir_block *nir_then_block = nir_if_first_then_block(if_stmt);3348struct nir_instr *inst = nir_block_first_instr(nir_then_block);3349if (inst && inst->type == nir_instr_type_jump)3350conditional_jump = nir_instr_as_jump(inst);3351}33523353/* Set up the flags for the IF condition (taking the THEN branch). */3354enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);33553356if (!conditional_jump) {3357/* Jump to ELSE. */3358struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?3359V3D_QPU_BRANCH_COND_ANYNA :3360V3D_QPU_BRANCH_COND_ANYA);3361/* Pixels that were not dispatched or have been discarded3362* should not contribute to the ANYA/ANYNA condition.3363*/3364branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;33653366vir_link_blocks(c->cur_block, else_block);3367vir_link_blocks(c->cur_block, then_block);33683369/* Process the THEN block. */3370vir_set_emit_block(c, then_block);3371ntq_emit_cf_list(c, &if_stmt->then_list);33723373if (!empty_else_block) {3374/* At the end of the THEN block, jump to ENDIF, unless3375* the block ended in a break or continue.3376*/3377if (!c->cur_block->branch_emitted) {3378vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);3379vir_link_blocks(c->cur_block, after_block);3380}33813382/* Emit the else block. */3383vir_set_emit_block(c, else_block);3384ntq_emit_cf_list(c, &if_stmt->else_list);3385}3386} else {3387/* Emit the conditional jump directly.3388*3389* Use ALL with breaks and ANY with continues to ensure that3390* we always break and never continue when all lanes have been3391* disabled (for example because of discards) to prevent3392* infinite loops.3393*/3394assert(conditional_jump &&3395(conditional_jump->type == nir_jump_continue ||3396conditional_jump->type == nir_jump_break));33973398struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?3399(conditional_jump->type == nir_jump_break ?3400V3D_QPU_BRANCH_COND_ALLA :3401V3D_QPU_BRANCH_COND_ANYA) :3402(conditional_jump->type == nir_jump_break ?3403V3D_QPU_BRANCH_COND_ALLNA :3404V3D_QPU_BRANCH_COND_ANYNA));3405branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;34063407vir_link_blocks(c->cur_block,3408conditional_jump->type == nir_jump_break ?3409c->loop_break_block :3410c->loop_cont_block);3411}34123413vir_link_blocks(c->cur_block, after_block);34143415vir_set_emit_block(c, after_block);3416}34173418static void3419ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)3420{3421nir_block *nir_else_block = nir_if_first_else_block(if_stmt);3422bool empty_else_block =3423(nir_else_block == nir_if_last_else_block(if_stmt) &&3424exec_list_is_empty(&nir_else_block->instr_list));34253426struct qblock *then_block = vir_new_block(c);3427struct qblock *after_block = vir_new_block(c);3428struct qblock *else_block;3429if (empty_else_block)3430else_block = after_block;3431else3432else_block = vir_new_block(c);34333434bool was_uniform_control_flow = false;3435if (!vir_in_nonuniform_control_flow(c)) {3436c->execute = vir_MOV(c, vir_uniform_ui(c, 0));3437was_uniform_control_flow = true;3438}34393440/* Set up the flags for the IF condition (taking the THEN branch). 
*/3441enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);34423443/* Update the flags+cond to mean "Taking the ELSE branch (!cond) and3444* was previously active (execute Z) for updating the exec flags.3445*/3446if (was_uniform_control_flow) {3447cond = v3d_qpu_cond_invert(cond);3448} else {3449struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);3450if (cond == V3D_QPU_COND_IFA) {3451vir_set_uf(c, inst, V3D_QPU_UF_NORNZ);3452} else {3453vir_set_uf(c, inst, V3D_QPU_UF_ANDZ);3454cond = V3D_QPU_COND_IFA;3455}3456}34573458vir_MOV_cond(c, cond,3459c->execute,3460vir_uniform_ui(c, else_block->index));34613462/* Jump to ELSE if nothing is active for THEN, otherwise fall3463* through.3464*/3465vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);3466vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);3467vir_link_blocks(c->cur_block, else_block);3468vir_link_blocks(c->cur_block, then_block);34693470/* Process the THEN block. */3471vir_set_emit_block(c, then_block);3472ntq_emit_cf_list(c, &if_stmt->then_list);34733474if (!empty_else_block) {3475/* Handle the end of the THEN block. First, all currently3476* active channels update their execute flags to point to3477* ENDIF3478*/3479vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),3480V3D_QPU_PF_PUSHZ);3481vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,3482vir_uniform_ui(c, after_block->index));34833484/* If everything points at ENDIF, then jump there immediately. */3485vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),3486c->execute,3487vir_uniform_ui(c, after_block->index)),3488V3D_QPU_PF_PUSHZ);3489vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);3490vir_link_blocks(c->cur_block, after_block);3491vir_link_blocks(c->cur_block, else_block);34923493vir_set_emit_block(c, else_block);3494ntq_activate_execute_for_block(c);3495ntq_emit_cf_list(c, &if_stmt->else_list);3496}34973498vir_link_blocks(c->cur_block, after_block);34993500vir_set_emit_block(c, after_block);3501if (was_uniform_control_flow)3502c->execute = c->undef;3503else3504ntq_activate_execute_for_block(c);3505}35063507static void3508ntq_emit_if(struct v3d_compile *c, nir_if *nif)3509{3510bool was_in_control_flow = c->in_control_flow;3511c->in_control_flow = true;3512if (!vir_in_nonuniform_control_flow(c) &&3513!nir_src_is_divergent(nif->condition)) {3514ntq_emit_uniform_if(c, nif);3515} else {3516ntq_emit_nonuniform_if(c, nif);3517}3518c->in_control_flow = was_in_control_flow;3519}35203521static void3522ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)3523{3524switch (jump->type) {3525case nir_jump_break:3526vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),3527V3D_QPU_PF_PUSHZ);3528vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,3529vir_uniform_ui(c, c->loop_break_block->index));3530break;35313532case nir_jump_continue:3533vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),3534V3D_QPU_PF_PUSHZ);3535vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,3536vir_uniform_ui(c, c->loop_cont_block->index));3537break;35383539case nir_jump_return:3540unreachable("All returns should be lowered\n");3541break;35423543case nir_jump_halt:3544case nir_jump_goto:3545case nir_jump_goto_if:3546unreachable("not supported\n");3547break;3548}3549}35503551static void3552ntq_emit_uniform_jump(struct v3d_compile *c, nir_jump_instr *jump)3553{3554switch (jump->type) {3555case nir_jump_break:3556vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);3557vir_link_blocks(c->cur_block, c->loop_break_block);3558c->cur_block->branch_emitted = true;3559break;3560case 
nir_jump_continue:3561vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);3562vir_link_blocks(c->cur_block, c->loop_cont_block);3563c->cur_block->branch_emitted = true;3564break;35653566case nir_jump_return:3567unreachable("All returns should be lowered\n");3568break;35693570case nir_jump_halt:3571case nir_jump_goto:3572case nir_jump_goto_if:3573unreachable("not supported\n");3574break;3575}3576}35773578static void3579ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)3580{3581switch (instr->type) {3582case nir_instr_type_alu:3583ntq_emit_alu(c, nir_instr_as_alu(instr));3584break;35853586case nir_instr_type_intrinsic:3587ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));3588break;35893590case nir_instr_type_load_const:3591ntq_emit_load_const(c, nir_instr_as_load_const(instr));3592break;35933594case nir_instr_type_ssa_undef:3595unreachable("Should've been lowered by nir_lower_undef_to_zero");3596break;35973598case nir_instr_type_tex:3599ntq_emit_tex(c, nir_instr_as_tex(instr));3600break;36013602case nir_instr_type_jump:3603/* Always flush TMU before jumping to another block, for the3604* same reasons as in ntq_emit_block.3605*/3606ntq_flush_tmu(c);3607if (vir_in_nonuniform_control_flow(c))3608ntq_emit_jump(c, nir_instr_as_jump(instr));3609else3610ntq_emit_uniform_jump(c, nir_instr_as_jump(instr));3611break;36123613default:3614fprintf(stderr, "Unknown NIR instr type: ");3615nir_print_instr(instr, stderr);3616fprintf(stderr, "\n");3617abort();3618}3619}36203621static void3622ntq_emit_block(struct v3d_compile *c, nir_block *block)3623{3624nir_foreach_instr(instr, block) {3625ntq_emit_instr(c, instr);3626}36273628/* Always process pending TMU operations in the same block they were3629* emitted: we can't emit TMU operations in a block and then emit a3630* thread switch and LDTMU/TMUWT for them in another block, possibly3631* under control flow.3632*/3633ntq_flush_tmu(c);3634}36353636static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);36373638static void3639ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop)3640{3641bool was_uniform_control_flow = false;3642if (!vir_in_nonuniform_control_flow(c)) {3643c->execute = vir_MOV(c, vir_uniform_ui(c, 0));3644was_uniform_control_flow = true;3645}36463647c->loop_cont_block = vir_new_block(c);3648c->loop_break_block = vir_new_block(c);36493650vir_link_blocks(c->cur_block, c->loop_cont_block);3651vir_set_emit_block(c, c->loop_cont_block);3652ntq_activate_execute_for_block(c);36533654ntq_emit_cf_list(c, &loop->body);36553656/* Re-enable any previous continues now, so our ANYA check below3657* works.3658*3659* XXX: Use the .ORZ flags update, instead.3660*/3661vir_set_pf(c, vir_XOR_dest(c,3662vir_nop_reg(),3663c->execute,3664vir_uniform_ui(c, c->loop_cont_block->index)),3665V3D_QPU_PF_PUSHZ);3666vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));36673668vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);36693670struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);3671/* Pixels that were not dispatched or have been discarded should not3672* contribute to looping again.3673*/3674branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;3675vir_link_blocks(c->cur_block, c->loop_cont_block);3676vir_link_blocks(c->cur_block, c->loop_break_block);36773678vir_set_emit_block(c, c->loop_break_block);3679if (was_uniform_control_flow)3680c->execute = c->undef;3681else3682ntq_activate_execute_for_block(c);3683}36843685static void3686ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop 
*loop)3687{36883689c->loop_cont_block = vir_new_block(c);3690c->loop_break_block = vir_new_block(c);36913692vir_link_blocks(c->cur_block, c->loop_cont_block);3693vir_set_emit_block(c, c->loop_cont_block);36943695ntq_emit_cf_list(c, &loop->body);36963697if (!c->cur_block->branch_emitted) {3698vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);3699vir_link_blocks(c->cur_block, c->loop_cont_block);3700}37013702vir_set_emit_block(c, c->loop_break_block);3703}37043705static void3706ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)3707{3708bool was_in_control_flow = c->in_control_flow;3709c->in_control_flow = true;37103711struct qblock *save_loop_cont_block = c->loop_cont_block;3712struct qblock *save_loop_break_block = c->loop_break_block;37133714if (vir_in_nonuniform_control_flow(c) || loop->divergent) {3715ntq_emit_nonuniform_loop(c, loop);3716} else {3717ntq_emit_uniform_loop(c, loop);3718}37193720c->loop_break_block = save_loop_break_block;3721c->loop_cont_block = save_loop_cont_block;37223723c->loops++;37243725c->in_control_flow = was_in_control_flow;3726}37273728static void3729ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)3730{3731fprintf(stderr, "FUNCTIONS not handled.\n");3732abort();3733}37343735static void3736ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)3737{3738foreach_list_typed(nir_cf_node, node, node, list) {3739switch (node->type) {3740case nir_cf_node_block:3741ntq_emit_block(c, nir_cf_node_as_block(node));3742break;37433744case nir_cf_node_if:3745ntq_emit_if(c, nir_cf_node_as_if(node));3746break;37473748case nir_cf_node_loop:3749ntq_emit_loop(c, nir_cf_node_as_loop(node));3750break;37513752case nir_cf_node_function:3753ntq_emit_function(c, nir_cf_node_as_function(node));3754break;37553756default:3757fprintf(stderr, "Unknown NIR node type\n");3758abort();3759}3760}3761}37623763static void3764ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)3765{3766ntq_setup_registers(c, &impl->registers);3767ntq_emit_cf_list(c, &impl->body);3768}37693770static void3771nir_to_vir(struct v3d_compile *c)3772{3773switch (c->s->info.stage) {3774case MESA_SHADER_FRAGMENT:3775c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));3776c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));3777c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));37783779/* V3D 4.x can disable implicit varyings if they are not used */3780c->fs_uses_primitive_id =3781nir_find_variable_with_location(c->s, nir_var_shader_in,3782VARYING_SLOT_PRIMITIVE_ID);3783if (c->fs_uses_primitive_id && !c->fs_key->has_gs) {3784c->primitive_id =3785emit_fragment_varying(c, NULL, -1, 0, 0);3786}37873788if (c->fs_key->is_points &&3789(c->devinfo->ver < 40 || program_reads_point_coord(c))) {3790c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);3791c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);3792c->uses_implicit_point_line_varyings = true;3793} else if (c->fs_key->is_lines &&3794(c->devinfo->ver < 40 ||3795BITSET_TEST(c->s->info.system_values_read,3796SYSTEM_VALUE_LINE_COORD))) {3797c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);3798c->uses_implicit_point_line_varyings = true;3799}38003801c->force_per_sample_msaa =3802c->s->info.fs.uses_sample_qualifier ||3803BITSET_TEST(c->s->info.system_values_read,3804SYSTEM_VALUE_SAMPLE_ID) ||3805BITSET_TEST(c->s->info.system_values_read,3806SYSTEM_VALUE_SAMPLE_POS);3807break;3808case MESA_SHADER_COMPUTE:3809/* Set up the TSO for barriers, assuming we do some. 
*/3810if (c->devinfo->ver < 42) {3811vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,3812V3D_QPU_WADDR_SYNC));3813}38143815c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));3816c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));38173818/* Set up the division between gl_LocalInvocationIndex and3819* wg_in_mem in the payload reg.3820*/3821int wg_size = (c->s->info.workgroup_size[0] *3822c->s->info.workgroup_size[1] *3823c->s->info.workgroup_size[2]);3824c->local_invocation_index_bits =3825ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;3826assert(c->local_invocation_index_bits <= 8);38273828if (c->s->info.shared_size) {3829struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],3830vir_uniform_ui(c, 16));3831if (c->s->info.workgroup_size[0] != 1 ||3832c->s->info.workgroup_size[1] != 1 ||3833c->s->info.workgroup_size[2] != 1) {3834int wg_bits = (16 -3835c->local_invocation_index_bits);3836int wg_mask = (1 << wg_bits) - 1;3837wg_in_mem = vir_AND(c, wg_in_mem,3838vir_uniform_ui(c, wg_mask));3839}3840struct qreg shared_per_wg =3841vir_uniform_ui(c, c->s->info.shared_size);38423843c->cs_shared_offset =3844vir_ADD(c,3845vir_uniform(c, QUNIFORM_SHARED_OFFSET,0),3846vir_UMUL(c, wg_in_mem, shared_per_wg));3847}3848break;3849default:3850break;3851}38523853if (c->s->scratch_size) {3854v3d_setup_spill_base(c);3855c->spill_size += V3D_CHANNELS * c->s->scratch_size;3856}38573858switch (c->s->info.stage) {3859case MESA_SHADER_VERTEX:3860ntq_setup_vs_inputs(c);3861break;3862case MESA_SHADER_GEOMETRY:3863ntq_setup_gs_inputs(c);3864break;3865case MESA_SHADER_FRAGMENT:3866ntq_setup_fs_inputs(c);3867break;3868case MESA_SHADER_COMPUTE:3869break;3870default:3871unreachable("unsupported shader stage");3872}38733874ntq_setup_outputs(c);38753876/* Find the main function and emit the body. 
*/3877nir_foreach_function(function, c->s) {3878assert(strcmp(function->name, "main") == 0);3879assert(function->impl);3880ntq_emit_impl(c, function->impl);3881}3882}38833884/**3885* When demoting a shader down to single-threaded, removes the THRSW3886* instructions (one will still be inserted at v3d_vir_to_qpu() for the3887* program end).3888*/3889static void3890vir_remove_thrsw(struct v3d_compile *c)3891{3892vir_for_each_block(block, c) {3893vir_for_each_inst_safe(inst, block) {3894if (inst->qpu.sig.thrsw)3895vir_remove_instruction(c, inst);3896}3897}38983899c->last_thrsw = NULL;3900}39013902void3903vir_emit_last_thrsw(struct v3d_compile *c)3904{3905/* On V3D before 4.1, we need a TMU op to be outstanding when thread3906* switching, so disable threads if we didn't do any TMU ops (each of3907* which would have emitted a THRSW).3908*/3909if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {3910c->threads = 1;3911if (c->last_thrsw)3912vir_remove_thrsw(c);3913return;3914}39153916/* If we're threaded and the last THRSW was in conditional code, then3917* we need to emit another one so that we can flag it as the last3918* thrsw.3919*/3920if (c->last_thrsw && !c->last_thrsw_at_top_level) {3921assert(c->devinfo->ver >= 41);3922vir_emit_thrsw(c);3923}39243925/* If we're threaded, then we need to mark the last THRSW instruction3926* so we can emit a pair of them at QPU emit time.3927*3928* For V3D 4.x, we can spawn the non-fragment shaders already in the3929* post-last-THRSW state, so we can skip this.3930*/3931if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {3932assert(c->devinfo->ver >= 41);3933vir_emit_thrsw(c);3934}39353936if (c->last_thrsw)3937c->last_thrsw->is_last_thrsw = true;3938}39393940/* There's a flag in the shader for "center W is needed for reasons other than3941* non-centroid varyings", so we just walk the program after VIR optimization3942* to see if it's used. It should be harmless to set even if we only use3943* center W for varyings.3944*/3945static void3946vir_check_payload_w(struct v3d_compile *c)3947{3948if (c->s->info.stage != MESA_SHADER_FRAGMENT)3949return;39503951vir_for_each_inst_inorder(inst, c) {3952for (int i = 0; i < vir_get_nsrc(inst); i++) {3953if (inst->src[i].file == QFILE_REG &&3954inst->src[i].index == 0) {3955c->uses_center_w = true;3956return;3957}3958}3959}3960}39613962void3963v3d_nir_to_vir(struct v3d_compile *c)3964{3965if (V3D_DEBUG & (V3D_DEBUG_NIR |3966v3d_debug_flag_for_shader_stage(c->s->info.stage))) {3967fprintf(stderr, "%s prog %d/%d NIR:\n",3968vir_get_stage_name(c),3969c->program_id, c->variant_id);3970nir_print_shader(c->s, stderr);3971}39723973nir_to_vir(c);39743975/* Emit the last THRSW before STVPM and TLB writes. 
         */
        vir_emit_last_thrsw(c);

        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
                emit_frag_end(c);
                break;
        case MESA_SHADER_GEOMETRY:
                emit_geom_end(c);
                break;
        case MESA_SHADER_VERTEX:
                emit_vert_end(c);
                break;
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("bad stage");
        }

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        vir_optimize(c);

        vir_check_payload_w(c);

        /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
         * We used that on that platform to pipeline TMU writes and reduce the
         * number of thread switches, as well as try (mostly successfully) to
         * reduce maximum register pressure to allow more threads. We should
         * do something of that sort for V3D -- either instruction scheduling
         * here, or delay the THRSW and LDTMUs from our texture instructions
         * until the results are needed.
         */

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        /* Attempt to allocate registers for the temporaries. If we fail,
         * reduce thread count and try again.
         */
        int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
        struct qpu_reg *temp_registers;
        while (true) {
                bool spilled;
                temp_registers = v3d_register_allocate(c, &spilled);
                if (spilled)
                        continue;

                if (temp_registers)
                        break;

                if (c->threads == min_threads &&
                    (V3D_DEBUG & V3D_DEBUG_RA)) {
                        fprintf(stderr,
                                "Failed to register allocate using %s\n",
                                c->fallback_scheduler ? "the fallback scheduler:" :
                                "the normal scheduler: \n");

                        vir_dump(c);

                        char *shaderdb;
                        int ret = v3d_shaderdb_dump(c, &shaderdb);
                        if (ret > 0) {
                                fprintf(stderr, "%s\n", shaderdb);
                                free(shaderdb);
                        }
                }

                if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) {
                        if (V3D_DEBUG & V3D_DEBUG_PERF) {
                                fprintf(stderr,
                                        "Failed to register allocate %s at "
                                        "%d threads.\n", vir_get_stage_name(c),
                                        c->threads);
                        }
                        c->compilation_result =
                                V3D_COMPILATION_FAILED_REGISTER_ALLOCATION;
                        return;
                }

                c->spill_count = 0;
                c->threads /= 2;

                if (c->threads == 1)
                        vir_remove_thrsw(c);
        }

        if (c->spills &&
            (V3D_DEBUG & (V3D_DEBUG_VIR |
                          v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
                fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        v3d_vir_to_qpu(c, temp_registers);
}
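
/* Editor's illustration: the compute-shader setup in nir_to_vir() above
 * derives local_invocation_index_bits from the workgroup size and then
 * unpacks c->cs_payload[1] into the local invocation index (top bits),
 * the workgroup-in-memory slot (bits above 16) and the workgroup Z id
 * (low 16 bits).  The standalone sketch below is a minimal, hypothetical
 * host-side replay of that arithmetic: it is not driver code, the payload
 * value is made up, it assumes the GCC/Clang __builtin_ffs, and it is
 * guarded out of the build.
 */
#if 0
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Simple stand-in for util_next_power_of_two(). */
static uint32_t
next_power_of_two(uint32_t v)
{
        uint32_t p = 1;
        while (p < v)
                p <<= 1;
        return p;
}

int
main(void)
{
        /* Hypothetical 8x8x1 workgroup: 64 invocations. */
        uint32_t wg_size = 8 * 8 * 1;

        /* Mirrors ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1. */
        uint32_t clamped = wg_size < 64 ? 64 : wg_size;
        uint32_t index_bits = __builtin_ffs(next_power_of_two(clamped)) - 1;
        assert(index_bits <= 8);

        /* Made-up payload word: local invocation index 13 in the top
         * index_bits bits, workgroup-in-memory slot 5 just above bit 16,
         * and workgroup Z id 2 in the low 16 bits.
         */
        uint32_t payload1 = (13u << (32 - index_bits)) | (5u << 16) | 2u;

        /* Same shifts/masks as emit_load_local_invocation_index() and the
         * shared-memory offset setup in nir_to_vir().
         */
        uint32_t lii = payload1 >> (32 - index_bits);
        uint32_t wg_in_mem = (payload1 >> 16) &
                             ((1u << (16 - index_bits)) - 1);
        uint32_t wg_id_z = payload1 & 0xffff;

        /* Prints: index bits 6: lii 13, wg_in_mem 5, wg z 2 */
        printf("index bits %u: lii %u, wg_in_mem %u, wg z %u\n",
               index_bits, lii, wg_in_mem, wg_id_z);
        return 0;
}
#endif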