Path: blob/21.2-virgl/src/gallium/auxiliary/nir/nir_to_tgsi.c
/*
 * Copyright © 2014-2015 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/nir/nir.h"
#include "compiler/nir/nir_deref.h"
#include "nir/nir_to_tgsi.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_from_mesa.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_ureg.h"
#include "util/debug.h"
#include "util/u_math.h"
#include "util/u_memory.h"

struct ntt_compile {
   nir_shader *s;
   nir_function_impl *impl;
   struct pipe_screen *screen;
   struct ureg_program *ureg;

   bool needs_texcoord_semantic;
   bool any_reg_as_address;
   bool native_integers;
   bool has_txf_lz;

   int next_addr_reg;
   bool addr_declared[2];
   struct ureg_dst addr_reg[2];

   /* if condition set up at the end of a block, for ntt_emit_if(). */
   struct ureg_src if_cond;

   /* TGSI temps for our NIR SSA and register values. */
   struct ureg_dst *reg_temp;
   struct ureg_dst *ssa_temp;

   nir_instr_liveness *liveness;

   /* Mappings from driver_location to TGSI input/output number.
    *
    * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
    * their numbers assigned incrementally, unlike inputs or constants.
    */
   struct ureg_src *input_index_map;
   uint64_t centroid_inputs;

   struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
};

static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);

/**
 * Interprets a nir_load_const used as a NIR src as a uint.
 *
 * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
 * instruction (or in a phi-web used by an integer ALU instruction) were
 * converted to floats and the ALU instruction swapped to the float equivalent.
 * However, this means that integer load_consts used by intrinsics (which don't
 * normally get that conversion) may have been reformatted to be floats. Given
 * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
 * we can just look and see if they look like floats and convert them back to
 * ints.
 */
static uint32_t
ntt_src_as_uint(struct ntt_compile *c, nir_src src)
{
   uint32_t val = nir_src_as_uint(src);
   if (!c->native_integers && val >= fui(1.0))
      val = (uint32_t)uif(val);
   return val;
}

static unsigned
ntt_64bit_write_mask(unsigned write_mask)
{
   return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
}
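
/* For illustration (this comment is not in the original source): a 64-bit
 * value occupies a pair of 32-bit channels, so a NIR write mask over 64-bit
 * components expands to cover both halves, e.g.:
 *
 *    ntt_64bit_write_mask(0x1) == 0x3   (X  -> XY)
 *    ntt_64bit_write_mask(0x2) == 0xc   (Y  -> ZW)
 *    ntt_64bit_write_mask(0x3) == 0xf   (XY -> XYZW)
 */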

static struct ureg_src
ntt_64bit_1f(struct ntt_compile *c)
{
   return ureg_imm4u(c->ureg,
                     0x00000000, 0x3ff00000,
                     0x00000000, 0x3ff00000);
}

static const struct glsl_type *
ntt_shader_input_type(struct ntt_compile *c,
                      struct nir_variable *var)
{
   switch (c->s->info.stage) {
   case MESA_SHADER_GEOMETRY:
   case MESA_SHADER_TESS_EVAL:
   case MESA_SHADER_TESS_CTRL:
      if (glsl_type_is_array(var->type))
         return glsl_get_array_element(var->type);
      else
         return var->type;
   default:
      return var->type;
   }
}

static void
ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
                            unsigned *semantic_name, unsigned *semantic_index)
{
   /* We want to use most of tgsi_get_gl_varying_semantic(), but the
    * !texcoord shifting has already been applied, so avoid that.
    */
   if (!c->needs_texcoord_semantic &&
       (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
      *semantic_name = TGSI_SEMANTIC_GENERIC;
      *semantic_index = location - VARYING_SLOT_VAR0;
      return;
   }

   tgsi_get_gl_varying_semantic(location, true,
                                semantic_name, semantic_index);
}

/* TGSI varying declarations have a component usage mask associated (used by
 * r600 and svga).
 */
static uint32_t
ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
                    bool is_64)
{
   uint32_t usage_mask =
      u_bit_consecutive(start_component, num_components);

   if (is_64) {
      if (start_component >= 2)
         usage_mask >>= 2;

      uint32_t tgsi_usage_mask = 0;

      if (usage_mask & TGSI_WRITEMASK_X)
         tgsi_usage_mask |= TGSI_WRITEMASK_XY;
      if (usage_mask & TGSI_WRITEMASK_Y)
         tgsi_usage_mask |= TGSI_WRITEMASK_ZW;

      return tgsi_usage_mask;
   } else {
      return usage_mask;
   }
}

/* TGSI varying declarations have a component usage mask associated (used by
 * r600 and svga).
 */
static uint32_t
ntt_tgsi_var_usage_mask(const struct nir_variable *var)
{
   const struct glsl_type *type_without_array =
      glsl_without_array(var->type);
   unsigned num_components = glsl_get_vector_elements(type_without_array);
   if (num_components == 0) /* structs */
      num_components = 4;

   return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
                              glsl_type_is_64bit(type_without_array));
}
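
/* A worked example of the 64-bit usage-mask expansion above (illustration,
 * not from the original source): a dvec2 at location_frac 0 starts as usage
 * mask XY and widens to XYZW, while a lone double at location_frac 2 (mask Z)
 * is first shifted down to X and then widened to XY.
 */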

static struct ureg_dst
ntt_store_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
{
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
   int base = nir_intrinsic_base(instr);
   *frac = nir_intrinsic_component(instr);
   bool is_64 = nir_src_bit_size(instr->src[0]) == 64;

   struct ureg_dst out;
   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
      if (semantics.location == FRAG_RESULT_COLOR)
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);

      unsigned semantic_name, semantic_index;
      tgsi_get_gl_frag_result_semantic(semantics.location,
                                       &semantic_name, &semantic_index);
      semantic_index += semantics.dual_source_blend_index;

      switch (semantics.location) {
      case FRAG_RESULT_DEPTH:
         *frac = 2; /* the z write goes to the .z channel in TGSI */
         break;
      case FRAG_RESULT_STENCIL:
         *frac = 1;
         break;
      default:
         break;
      }

      out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
   } else {
      unsigned semantic_name, semantic_index;

      ntt_get_gl_varying_semantic(c, semantics.location,
                                  &semantic_name, &semantic_index);

      uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
                                                instr->num_components,
                                                is_64);
      uint32_t gs_streams = semantics.gs_streams;
      for (int i = 0; i < 4; i++) {
         if (!(usage_mask & (1 << i)))
            gs_streams &= ~(0x3 << 2 * i);
      }

      /* No driver appears to use array_id of outputs. */
      unsigned array_id = 0;

      /* This bit is lost in the i/o semantics, but it's unused in in-tree
       * drivers.
       */
      bool invariant = false;

      out = ureg_DECL_output_layout(c->ureg,
                                    semantic_name, semantic_index,
                                    gs_streams,
                                    base,
                                    usage_mask,
                                    array_id,
                                    semantics.num_slots,
                                    invariant);
   }

   unsigned write_mask = nir_intrinsic_write_mask(instr);

   if (is_64) {
      write_mask = ntt_64bit_write_mask(write_mask);
      if (*frac >= 2)
         write_mask = write_mask << 2;
   } else {
      write_mask = write_mask << *frac;
   }
   return ureg_writemask(out, write_mask);
}

/* If this reg or SSA def is used only for storing an output, then in the simple
 * cases we can write directly to the TGSI output instead of having store_output
 * emit its own MOV.
 */
static bool
ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
                             struct list_head *uses, struct list_head *if_uses)
{
   *dst = ureg_dst_undef();

   switch (c->s->info.stage) {
   case MESA_SHADER_FRAGMENT:
   case MESA_SHADER_VERTEX:
      break;
   default:
      /* tgsi_exec (at least) requires that output stores happen per vertex
       * emitted, you don't get to reuse a previous output value for the next
       * vertex.
       */
      return false;
   }

   if (!list_is_empty(if_uses) || !list_is_singular(uses))
      return false;

   nir_src *src = list_first_entry(uses, nir_src, use_link);

   if (src->parent_instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src->parent_instr);
   if (intr->intrinsic != nir_intrinsic_store_output ||
       !nir_src_is_const(intr->src[1])) {
      return false;
   }

   uint32_t frac;
   *dst = ntt_store_output_decl(c, intr, &frac);
   dst->Index += ntt_src_as_uint(c, intr->src[1]);

   return frac == 0;
}
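
/* For illustration (not in the original source): this catches the common NIR
 * pattern, roughly
 *
 *    vec1 32 ssa_2 = fadd ssa_0, ssa_1
 *    intrinsic store_output (ssa_2, 0) ...
 *
 * and lets the ADD write the TGSI OUT register directly, instead of writing a
 * TEMP that store_output then has to MOV to the output.
 */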

static void
ntt_setup_inputs(struct ntt_compile *c)
{
   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
      return;

   unsigned num_inputs = 0;
   int num_input_arrays = 0;

   nir_foreach_shader_in_variable(var, c->s) {
      const struct glsl_type *type = ntt_shader_input_type(c, var);
      unsigned array_len =
         glsl_count_attribute_slots(type, false);

      num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
   }

   c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);

   nir_foreach_shader_in_variable(var, c->s) {
      const struct glsl_type *type = ntt_shader_input_type(c, var);
      unsigned array_len =
         glsl_count_attribute_slots(type, false);

      unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
      unsigned sample_loc;
      struct ureg_src decl;

      if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
         interpolation =
            tgsi_get_interp_mode(var->data.interpolation,
                                 var->data.location == VARYING_SLOT_COL0 ||
                                 var->data.location == VARYING_SLOT_COL1);

         if (var->data.location == VARYING_SLOT_POS)
            interpolation = TGSI_INTERPOLATE_LINEAR;
      }

      unsigned semantic_name, semantic_index;
      ntt_get_gl_varying_semantic(c, var->data.location,
                                  &semantic_name, &semantic_index);

      if (var->data.sample) {
         sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
      } else if (var->data.centroid) {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
         c->centroid_inputs |= (BITSET_MASK(array_len) <<
                                var->data.driver_location);
      } else {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
      }

      unsigned array_id = 0;
      if (glsl_type_is_array(type))
         array_id = ++num_input_arrays;

      uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);

      decl = ureg_DECL_fs_input_cyl_centroid_layout(c->ureg,
                                                    semantic_name,
                                                    semantic_index,
                                                    interpolation,
                                                    0,
                                                    sample_loc,
                                                    var->data.driver_location,
                                                    usage_mask,
                                                    array_id, array_len);

      if (semantic_name == TGSI_SEMANTIC_FACE) {
         struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
         /* NIR is ~0 front and 0 back, while TGSI is +1 front */
         ureg_SGE(c->ureg, temp, decl, ureg_imm1f(c->ureg, 0));
         decl = ureg_src(temp);
      }

      for (unsigned i = 0; i < array_len; i++) {
         c->input_index_map[var->data.driver_location + i] = decl;
         c->input_index_map[var->data.driver_location + i].Index += i;
      }
   }
}

static void
ntt_setup_uniforms(struct ntt_compile *c)
{
   struct pipe_screen *screen = c->screen;
   bool packed = screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS);

   nir_foreach_uniform_variable(var, c->s) {
      if (glsl_type_is_image(var->type)) {
         c->images[var->data.binding] = ureg_DECL_image(c->ureg,
                                                        var->data.binding,
                                                        TGSI_TEXTURE_2D,
                                                        var->data.image.format,
                                                        !var->data.read_only,
                                                        false);
      } else {
         unsigned size;
         if (packed) {
            size = DIV_ROUND_UP(glsl_count_dword_slots(var->type,
                                                       var->data.bindless), 4);
         } else {
            size = glsl_count_vec4_slots(var->type, false, var->data.bindless);
         }

         for (unsigned i = 0; i < size; i++)
            ureg_DECL_constant(c->ureg, var->data.driver_location + i);
      }
   }

   nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
      ureg_DECL_constant2D(c->ureg, 0, 0, var->data.driver_location);
   }

   nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ssbo) {
      /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
       * counters
       */
      bool atomic = false;
      ureg_DECL_buffer(c->ureg, var->data.binding, atomic);
   }

   for (int i = 0; i < PIPE_MAX_SAMPLERS; i++) {
      if (BITSET_TEST(c->s->info.textures_used, i))
         ureg_DECL_sampler(c->ureg, i);
   }
}

static void
ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
{
   foreach_list_typed(nir_register, nir_reg, node, list) {
      struct ureg_dst decl;
      if (nir_reg->num_array_elems == 0) {
         uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);
         if (!ntt_try_store_in_tgsi_output(c, &decl, &nir_reg->uses, &nir_reg->if_uses)) {
            if (nir_reg->bit_size == 64) {
               if (nir_reg->num_components > 2) {
                  fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
                          nir_reg->num_components, nir_reg->index);
               }

               write_mask = ntt_64bit_write_mask(write_mask);
            }

            decl = ureg_writemask(ureg_DECL_temporary(c->ureg), write_mask);
         }
      } else {
         decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems,
                                          true);
      }
      c->reg_temp[nir_reg->index] = decl;
   }
}
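
/* A rough worked example of the packed-uniform sizing above (my reading, not
 * from the original source): with PIPE_CAP_PACKED_UNIFORMS, a float[3]
 * uniform occupies 3 dwords and thus DIV_ROUND_UP(3, 4) == 1 constant slot,
 * while the vec4-aligned path would reserve one vec4 slot per array element,
 * i.e. 3 slots.
 */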

static struct ureg_src
ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (!c->native_integers) {
      float values[4];
      assert(instr->def.bit_size == 32);
      for (int i = 0; i < num_components; i++)
         values[i] = uif(instr->value[i].u32);

      return ureg_DECL_immediate(c->ureg, values, num_components);
   } else {
      uint32_t values[4];

      if (instr->def.bit_size == 32) {
         for (int i = 0; i < num_components; i++)
            values[i] = instr->value[i].u32;
      } else {
         assert(num_components <= 2);
         for (int i = 0; i < num_components; i++) {
            values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
            values[i * 2 + 1] = instr->value[i].u64 >> 32;
         }
         num_components *= 2;
      }

      return ureg_DECL_immediate_uint(c->ureg, values, num_components);
   }
}

static struct ureg_src
ntt_reladdr(struct ntt_compile *c, struct ureg_src addr)
{
   if (c->any_reg_as_address) {
      /* Make sure we're getting the refcounting right even on any_reg
       * drivers.
       */
      c->next_addr_reg++;

      return ureg_scalar(addr, 0);
   }

   assert(c->next_addr_reg < ARRAY_SIZE(c->addr_reg));

   if (!c->addr_declared[c->next_addr_reg]) {
      c->addr_reg[c->next_addr_reg] = ureg_writemask(ureg_DECL_address(c->ureg),
                                                     TGSI_WRITEMASK_X);
      c->addr_declared[c->next_addr_reg] = true;
   }

   if (c->native_integers)
      ureg_UARL(c->ureg, c->addr_reg[c->next_addr_reg], addr);
   else
      ureg_ARL(c->ureg, c->addr_reg[c->next_addr_reg], addr);
   return ureg_scalar(ureg_src(c->addr_reg[c->next_addr_reg++]), 0);
}

static void
ntt_put_reladdr(struct ntt_compile *c)
{
   c->next_addr_reg--;
   assert(c->next_addr_reg >= 0);
}

static void
ntt_reladdr_dst_put(struct ntt_compile *c, struct ureg_dst dst)
{
   if (c->any_reg_as_address)
      return;

   if (dst.Indirect)
      ntt_put_reladdr(c);
   if (dst.DimIndirect)
      ntt_put_reladdr(c);
}

static struct ureg_src
ntt_get_src(struct ntt_compile *c, nir_src src)
{
   if (src.is_ssa) {
      if (src.ssa->parent_instr->type == nir_instr_type_load_const)
         return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));

      return ureg_src(c->ssa_temp[src.ssa->index]);
   } else {
      nir_register *reg = src.reg.reg;
      struct ureg_dst reg_temp = c->reg_temp[reg->index];
      reg_temp.Index += src.reg.base_offset;

      if (src.reg.indirect) {
         struct ureg_src offset = ntt_get_src(c, *src.reg.indirect);
         return ureg_src_indirect(ureg_src(reg_temp),
                                  ntt_reladdr(c, offset));
      } else {
         return ureg_src(reg_temp);
      }
   }
}

static struct ureg_src
ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
{
   nir_alu_src src = instr->src[i];
   struct ureg_src usrc = ntt_get_src(c, src.src);

   if (nir_src_bit_size(src.src) == 64) {
      int chan0 = 0, chan1 = 1;
      if (nir_op_infos[instr->op].input_sizes[i] == 0) {
         chan0 = ffs(instr->dest.write_mask) - 1;
         chan1 = ffs(instr->dest.write_mask & ~(1 << chan0)) - 1;
         if (chan1 == -1)
            chan1 = chan0;
      }
      usrc = ureg_swizzle(usrc,
                          src.swizzle[chan0] * 2,
                          src.swizzle[chan0] * 2 + 1,
                          src.swizzle[chan1] * 2,
                          src.swizzle[chan1] * 2 + 1);
   } else {
      usrc = ureg_swizzle(usrc,
                          src.swizzle[0],
                          src.swizzle[1],
                          src.swizzle[2],
                          src.swizzle[3]);
   }

   if (src.abs)
      usrc = ureg_abs(usrc);
   if (src.negate)
      usrc = ureg_negate(usrc);

   return usrc;
}
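
/* For illustration (not in the original source): 64-bit component n lives in
 * the 32-bit channel pair (2n, 2n+1), so a NIR swizzle of .y on a 64-bit src
 * becomes the 32-bit TGSI swizzle .zwzw above.
 */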

/* Reswizzles a source so that the unset channels in the write mask still refer
 * to one of the channels present in the write mask.
 */
static struct ureg_src
ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
{
   assert(write_mask);
   int first_chan = ffs(write_mask) - 1;
   return ureg_swizzle(src,
                       (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
                       (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
                       (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
                       (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
}

static struct ureg_dst *
ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
{
   uint32_t writemask = BITSET_MASK(ssa->num_components);
   if (ssa->bit_size == 64)
      writemask = ntt_64bit_write_mask(writemask);

   struct ureg_dst dst;
   if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses, &ssa->if_uses))
      dst = ureg_DECL_temporary(c->ureg);

   c->ssa_temp[ssa->index] = ureg_writemask(dst, writemask);

   return &c->ssa_temp[ssa->index];
}

static struct ureg_dst *
ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
{
   if (dest->is_ssa)
      return ntt_get_ssa_def_decl(c, &dest->ssa);
   else
      return &c->reg_temp[dest->reg.reg->index];
}

static struct ureg_dst
ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
{
   struct ureg_dst dst = *ntt_get_dest_decl(c, dest);

   if (!dest->is_ssa) {
      dst.Index += dest->reg.base_offset;

      if (dest->reg.indirect) {
         struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect);
         dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset));
      }
   }

   return dst;
}

/* For an SSA dest being populated by a constant src, replace the storage with
 * a copy of the ureg_src.
 */
static void
ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
{
   if (!src.Negate && !src.Absolute && !src.Indirect && !src.DimIndirect &&
       src.SwizzleX == TGSI_SWIZZLE_X &&
       (src.SwizzleY == TGSI_SWIZZLE_Y || def->num_components < 2) &&
       (src.SwizzleZ == TGSI_SWIZZLE_Z || def->num_components < 3) &&
       (src.SwizzleW == TGSI_SWIZZLE_W || def->num_components < 4)) {
      switch (src.File) {
      case TGSI_FILE_IMMEDIATE:
      case TGSI_FILE_INPUT:
      case TGSI_FILE_CONSTANT:
      case TGSI_FILE_SYSTEM_VALUE:
         c->ssa_temp[def->index] = ureg_dst(src);
         return;
      }
   }

   ureg_MOV(c->ureg, *ntt_get_ssa_def_decl(c, def), src);
}

static void
ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src)
{
   if (dest->is_ssa)
      ntt_store_def(c, &dest->ssa, src);
   else {
      struct ureg_dst dst = ntt_get_dest(c, dest);
      ureg_MOV(c->ureg, dst, src);
   }
}
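
/* For illustration (not in the original source): when the stored value is a
 * plain, unswizzled IMM/IN/CONST/SV reference, ntt_store_def() just aliases
 * the SSA def to that register, so later readers fetch e.g. IN[0] directly
 * and no MOV is emitted at all.
 */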

static void
ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
                struct ureg_dst dst,
                struct ureg_src src0,
                struct ureg_src src1)
{
   unsigned i;
   int num_src;

   /* POW is the only 2-operand scalar op. */
   if (tgsi_op == TGSI_OPCODE_POW) {
      num_src = 2;
   } else {
      num_src = 1;
      src1 = src0;
   }

   for (i = 0; i < 4; i++) {
      if (dst.WriteMask & (1 << i)) {
         struct ureg_dst this_dst = dst;
         struct ureg_src srcs[2] = {
            ureg_scalar(src0, i),
            ureg_scalar(src1, i),
         };
         this_dst.WriteMask = (1 << i);

         ureg_insn(c->ureg, tgsi_op, &this_dst, 1, srcs, num_src, false);
      }
   }
}
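
/* For illustration (not in the original source): TGSI scalar ops replicate
 * one source channel to every written channel, so a NIR frcp with write mask
 * .xz is emitted as two instructions, roughly:
 *
 *    RCP TEMP[0].x, SRC.xxxx
 *    RCP TEMP[0].z, SRC.zzzz
 */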

static void
ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
{
   struct ureg_src src[4];
   struct ureg_dst dst;
   unsigned i;
   int dst_64 = nir_dest_bit_size(instr->dest.dest) == 64;
   int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
   int num_srcs = nir_op_infos[instr->op].num_inputs;

   assert(num_srcs <= ARRAY_SIZE(src));
   for (i = 0; i < num_srcs; i++)
      src[i] = ntt_get_alu_src(c, instr, i);
   dst = ntt_get_dest(c, &instr->dest.dest);

   if (instr->dest.saturate)
      dst.Saturate = true;

   if (dst_64)
      dst = ureg_writemask(dst, ntt_64bit_write_mask(instr->dest.write_mask));
   else
      dst = ureg_writemask(dst, instr->dest.write_mask);

   static enum tgsi_opcode op_map[][2] = {
      [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },

      /* fabs/fneg 32-bit are special-cased below. */
      [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
      [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },

      [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
      [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
      [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
      [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
      [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
      [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
      [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
      [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
      [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
      [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },

      [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
      [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
      [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },

      /* The conversions will have one combination of src and dst bitsize. */
      [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
      [nir_op_f2f64] = { TGSI_OPCODE_F2D },
      [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },

      [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
      [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
      [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
      [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
      [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
      [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
      [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
      [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },

      [nir_op_slt] = { TGSI_OPCODE_SLT },
      [nir_op_sge] = { TGSI_OPCODE_SGE },
      [nir_op_seq] = { TGSI_OPCODE_SEQ },
      [nir_op_sne] = { TGSI_OPCODE_SNE },

      [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
      [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
      [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
      [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },

      [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
      [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
      [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
      [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },

      [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
      [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },

      [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
      [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
      [nir_op_fsign] = { TGSI_OPCODE_SSG },
      [nir_op_isign] = { TGSI_OPCODE_ISSG },
      [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
      [nir_op_fddx] = { TGSI_OPCODE_DDX },
      [nir_op_fddy] = { TGSI_OPCODE_DDY },
      [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
      [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
      [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
      [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
      [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
      [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
      [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
      [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
      [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
      [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
      [nir_op_bit_count] = { TGSI_OPCODE_POPC },
      [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
      [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
      [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
      [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
      [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
      [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
      [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
      [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
      [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
      [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
      [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
      [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
      [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
      [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },

      /* These bitwise ops don't care about 32 vs 64 types, so they have the
       * same TGSI op.
       */
      [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
      [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
      [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
      [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },

      [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
      [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
      [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
      [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
      [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
      [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
      [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
      [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
   };
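
   /* For illustration (not in the original source): entries are indexed by
    * [nir op][src_64], so op_map[nir_op_fadd] yields TGSI_OPCODE_ADD for
    * 32-bit sources and TGSI_OPCODE_DADD for 64-bit ones; a 0 entry means
    * "fall through to the special cases below".
    */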

   /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
    * of .xy.  Store to a temp and move it to the real dst.
    */
   bool tgsi_64bit_compare = src_64 && !dst_64 &&
      (num_srcs == 2 ||
       nir_op_infos[instr->op].output_type == nir_type_bool32) &&
      (dst.WriteMask != TGSI_WRITEMASK_X);

   /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
    * channels and will need to get fixed up.
    */
   bool tgsi_64bit_downconvert = (src_64 && !dst_64 &&
                                  num_srcs == 1 && !tgsi_64bit_compare &&
                                  (dst.WriteMask & ~TGSI_WRITEMASK_XY));

   struct ureg_dst real_dst = ureg_dst_undef();
   if (tgsi_64bit_compare || tgsi_64bit_downconvert) {
      real_dst = dst;
      dst = ureg_DECL_temporary(c->ureg);
   }

   bool table_op64 = src_64;
   if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
      /* The normal path for NIR to TGSI ALU op translation */
      ureg_insn(c->ureg, op_map[instr->op][table_op64],
                &dst, 1, src, num_srcs, false);
   } else {
      /* Special cases for NIR to TGSI ALU op translation. */

      /* TODO: Use something like the ntt_store() path for the MOV calls so we
       * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
       */

      switch (instr->op) {
      case nir_op_u2u64:
         ureg_AND(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                  ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
         break;

      case nir_op_i2i32:
      case nir_op_u2u32:
         assert(src_64);
         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
         break;

      case nir_op_fabs:
         ureg_MOV(c->ureg, dst, ureg_abs(src[0]));
         break;

      case nir_op_fsat:
         if (dst_64) {
            ureg_MIN(c->ureg, dst, src[0], ntt_64bit_1f(c));
            ureg_MAX(c->ureg, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
         } else {
            ureg_MOV(c->ureg, ureg_saturate(dst), src[0]);
         }
         break;

      case nir_op_fneg:
         ureg_MOV(c->ureg, dst, ureg_negate(src[0]));
         break;

      /* NOTE: TGSI 32-bit math ops have the old "one source channel
       * replicated to all dst channels" behavior, while 64 is normal mapping
       * of src channels to dst.
       */
      case nir_op_frcp:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], src[1]);
         break;

      case nir_op_frsq:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], src[1]);
         break;

      case nir_op_fsqrt:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], src[1]);
         break;

      case nir_op_fexp2:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], src[1]);
         break;

      case nir_op_flog2:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], src[1]);
         break;

      case nir_op_b2f32:
         ureg_AND(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 1.0));
         break;

      case nir_op_b2f64:
         ureg_AND(c->ureg, dst,
                  ureg_swizzle(src[0],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                  ntt_64bit_1f(c));
         break;

      case nir_op_f2b32:
         if (src_64)
            ureg_DSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
         else
            ureg_FSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
         break;

      case nir_op_i2b32:
         if (src_64) {
            ureg_U64SNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
         } else
            ureg_USNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
         break;

      case nir_op_b2i32:
         ureg_AND(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 1));
         break;

      case nir_op_b2i64:
         ureg_AND(c->ureg, dst,
                  ureg_swizzle(src[0],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                  ureg_imm4u(c->ureg, 1, 0, 1, 0));
         break;

      case nir_op_fsin:
         ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], src[1]);
         break;

      case nir_op_fcos:
         ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], src[1]);
         break;

      case nir_op_fsub:
         assert(!dst_64);
         ureg_ADD(c->ureg, dst, src[0], ureg_negate(src[1]));
         break;

      case nir_op_isub:
         assert(!dst_64);
         ureg_UADD(c->ureg, dst, src[0], ureg_negate(src[1]));
         break;

      case nir_op_fmod:
         unreachable("should be handled by .lower_fmod = true");
         break;

      case nir_op_fpow:
         ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
         break;

      case nir_op_flrp:
         ureg_LRP(c->ureg, dst, src[2], src[1], src[0]);
         break;

      case nir_op_pack_64_2x32_split:
         ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
                  ureg_swizzle(src[0],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_YW),
                  ureg_swizzle(src[1],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         break;

      case nir_op_unpack_64_2x32_split_x:
         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
         break;

      case nir_op_unpack_64_2x32_split_y:
         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
         break;

      case nir_op_b32csel:
         if (nir_src_bit_size(instr->src[1].src) == 64) {
            ureg_UCMP(c->ureg, dst, ureg_swizzle(src[0],
                                                 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                                 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                      src[1], src[2]);
         } else {
            ureg_UCMP(c->ureg, dst, src[0], src[1], src[2]);
         }
         break;

      case nir_op_fcsel:
         /* NIR is src0 != 0 ? src1 : src2.
          * TGSI is src0 < 0 ? src1 : src2.
          *
          * However, fcsel so far as I can find only appears on
          * bools-as-floats (1.0 or 0.0), so we can negate it for the TGSI op.
          */
         ureg_CMP(c->ureg, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]);
         break;

      /* It would be nice if we could get this left as scalar in NIR, since
       * the TGSI op is scalar.
       */
      case nir_op_frexp_sig:
      case nir_op_frexp_exp: {
         assert(src_64);
         struct ureg_dst temp = ureg_DECL_temporary(c->ureg);

         for (int chan = 0; chan < 2; chan++) {
            int wm = 1 << chan;

            if (!(instr->dest.write_mask & wm))
               continue;

            struct ureg_dst dsts[2] = { temp, temp };
            if (instr->op == nir_op_frexp_sig) {
               dsts[0] = ureg_writemask(dst, ntt_64bit_write_mask(wm));
            } else {
               dsts[1] = ureg_writemask(dst, wm);
            }

            struct ureg_src chan_src = ureg_swizzle(src[0],
                                                    chan * 2, chan * 2 + 1,
                                                    chan * 2, chan * 2 + 1);

            ureg_insn(c->ureg, TGSI_OPCODE_DFRACEXP,
                      dsts, 2,
                      &chan_src, 1, false);
         }

         ureg_release_temporary(c->ureg, temp);
         break;
      }
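
      /* For illustration (not in the original source): DFRACEXP is a
       * two-destination TGSI opcode (significand, exponent).  The loop above
       * keeps whichever destination the NIR op asked for and aims the other
       * one at a scratch temp, one 64-bit channel at a time.
       */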

      case nir_op_ldexp:
         assert(dst_64); /* 32bit handled in table. */
         ureg_DLDEXP(c->ureg, dst, src[0],
                     ureg_swizzle(src[1],
                                  TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                  TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         break;

      case nir_op_vec4:
      case nir_op_vec3:
      case nir_op_vec2:
         unreachable("covered by nir_lower_vec_to_movs()");

      default:
         fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
         unreachable("Unknown NIR opcode");
      }
   }

   /* 64-bit op fixup movs */
   if (!ureg_dst_is_undef(real_dst)) {
      if (tgsi_64bit_compare) {
         ureg_MOV(c->ureg, real_dst,
                  ureg_swizzle(ureg_src(dst), 0, 2, 0, 2));
      } else {
         assert(tgsi_64bit_downconvert);
         uint8_t swizzle[] = {0, 0, 0, 0};
         uint32_t second_bit = real_dst.WriteMask & ~(1 << (ffs(real_dst.WriteMask) - 1));
         if (second_bit)
            swizzle[ffs(second_bit) - 1] = 1;
         ureg_MOV(c->ureg, real_dst, ureg_swizzle(ureg_src(dst),
                                                  swizzle[0],
                                                  swizzle[1],
                                                  swizzle[2],
                                                  swizzle[3]));
      }
      ureg_release_temporary(c->ureg, dst);
   }
}

static struct ureg_src
ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
                      nir_src src)
{
   if (nir_src_is_const(src)) {
      usrc.Index += ntt_src_as_uint(c, src);
      return usrc;
   } else {
      return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src)));
   }
}

static struct ureg_dst
ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
                      nir_src src)
{
   if (nir_src_is_const(src)) {
      dst.Index += ntt_src_as_uint(c, src);
      return dst;
   } else {
      return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src)));
   }
}

static struct ureg_src
ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
                                nir_src src)
{
   if (nir_src_is_const(src)) {
      return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
   } else {
      return ureg_src_dimension_indirect(usrc,
                                         ntt_reladdr(c, ntt_get_src(c, src)),
                                         0);
   }
}

static struct ureg_dst
ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
                                nir_src src)
{
   if (nir_src_is_const(src)) {
      return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
   } else {
      return ureg_dst_dimension_indirect(udst,
                                         ntt_reladdr(c, ntt_get_src(c, src)),
                                         0);
   }
}

/* Some load operations in NIR will have a fractional offset that we need to
 * swizzle down before storing to the result register.
 */
static struct ureg_src
ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
{
   return ureg_swizzle(src,
                       frac,
                       frac + MIN2(num_components - 1, 1),
                       frac + MIN2(num_components - 1, 2),
                       frac + MIN2(num_components - 1, 3));
}
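
/* For illustration (not in the original source): with frac == 2 and
 * num_components == 2, a vec2 living in channels .zw gets the swizzle .zwww,
 * so the value lands in the destination's .xy channels.
 */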

static void
ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   int bit_size = nir_dest_bit_size(instr->dest);
   assert(bit_size == 32 || instr->num_components <= 2);

   struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);

   src = ntt_ureg_src_dimension_indirect(c, src, instr->src[0]);

   if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
      /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
       * file.
       */

      if (nir_src_is_const(instr->src[1])) {
         src.Index += ntt_src_as_uint(c, instr->src[1]);
      } else {
         src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1])));
      }

      int start_component = nir_intrinsic_component(instr);
      if (bit_size == 64)
         start_component *= 2;

      src = ntt_shift_by_frac(src, start_component,
                              instr->num_components * bit_size / 32);

      ntt_store(c, &instr->dest, src);
   } else {
      /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
       * TGSI_OPCODE_LOAD instruction from the const file.
       */
      struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
      struct ureg_src srcs[2] = {
         src,
         ntt_get_src(c, instr->src[1]),
      };
      ureg_memory_insn(c->ureg, TGSI_OPCODE_LOAD,
                       &dst, 1,
                       srcs, ARRAY_SIZE(srcs),
                       0 /* qualifier */,
                       0 /* tex target */,
                       0 /* format: unused */
                       );
   }
}
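
/* For illustration (not in the original source): on the load_ubo_vec4 path
 * the offset src is in vec4 slots, so a constant access like u[2].y in UBO 1
 * becomes roughly CONST[1][2].yyyy, while the TGSI_OPCODE_LOAD path instead
 * passes the (as I read it, byte-based) offset as a second operand.
 */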

static unsigned
ntt_get_access_qualifier(nir_intrinsic_instr *instr)
{
   enum gl_access_qualifier access = nir_intrinsic_access(instr);
   unsigned qualifier = 0;

   if (access & ACCESS_COHERENT)
      qualifier |= TGSI_MEMORY_COHERENT;
   if (access & ACCESS_VOLATILE)
      qualifier |= TGSI_MEMORY_VOLATILE;
   if (access & ACCESS_RESTRICT)
      qualifier |= TGSI_MEMORY_RESTRICT;

   return qualifier;
}

static void
ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
             nir_variable_mode mode)
{
   bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
                    instr->intrinsic == nir_intrinsic_store_shared);
   bool is_load = (instr->intrinsic == nir_intrinsic_load_ssbo ||
                   instr->intrinsic == nir_intrinsic_load_shared);
   unsigned opcode;
   struct ureg_src src[4];
   int num_src = 0;
   int nir_src;

   struct ureg_src memory;
   switch (mode) {
   case nir_var_mem_ssbo:
      memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, 0),
                                     instr->src[is_store ? 1 : 0]);
      nir_src = 1;
      break;
   case nir_var_mem_shared:
      memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
      nir_src = 0;
      break;
   default:
      unreachable("unknown memory type");
   }

   if (is_store) {
      src[num_src++] = ntt_get_src(c, instr->src[nir_src + 1]); /* offset */
      src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
   } else {
      src[num_src++] = memory;
      if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
         src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* offset */
         if (!is_load)
            src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* value */
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_shared_atomic_add:
      opcode = TGSI_OPCODE_ATOMUADD;
      break;
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_shared_atomic_fadd:
      opcode = TGSI_OPCODE_ATOMFADD;
      break;
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_shared_atomic_imin:
      opcode = TGSI_OPCODE_ATOMIMIN;
      break;
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_shared_atomic_imax:
      opcode = TGSI_OPCODE_ATOMIMAX;
      break;
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_shared_atomic_umin:
      opcode = TGSI_OPCODE_ATOMUMIN;
      break;
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_shared_atomic_umax:
      opcode = TGSI_OPCODE_ATOMUMAX;
      break;
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_shared_atomic_and:
      opcode = TGSI_OPCODE_ATOMAND;
      break;
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_shared_atomic_or:
      opcode = TGSI_OPCODE_ATOMOR;
      break;
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_shared_atomic_xor:
      opcode = TGSI_OPCODE_ATOMXOR;
      break;
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_shared_atomic_exchange:
      opcode = TGSI_OPCODE_ATOMXCHG;
      break;
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_shared_atomic_comp_swap:
      opcode = TGSI_OPCODE_ATOMCAS;
      src[num_src++] = ntt_get_src(c, instr->src[nir_src++]);
      break;
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_shared:
      opcode = TGSI_OPCODE_LOAD;
      break;
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_store_shared:
      opcode = TGSI_OPCODE_STORE;
      break;
   case nir_intrinsic_get_ssbo_size:
      opcode = TGSI_OPCODE_RESQ;
      break;
   default:
      unreachable("unknown memory op");
   }

   unsigned qualifier = 0;
   if (mode == nir_var_mem_ssbo &&
       instr->intrinsic != nir_intrinsic_get_ssbo_size) {
      qualifier = ntt_get_access_qualifier(instr);
   }

   struct ureg_dst dst;
   if (is_store) {
      dst = ureg_dst(memory);

      unsigned write_mask = nir_intrinsic_write_mask(instr);
      if (nir_src_bit_size(instr->src[0]) == 64)
         write_mask = ntt_64bit_write_mask(write_mask);
      dst = ureg_writemask(dst, write_mask);
   } else {
      dst = ntt_get_dest(c, &instr->dest);
   }

   ureg_memory_insn(c->ureg, opcode,
                    &dst, 1,
                    src, num_src,
                    qualifier,
                    TGSI_TEXTURE_BUFFER,
                    0 /* format: unused */);
}
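
/* For illustration (not in the original source): NIR and TGSI put the
 * resource on opposite sides for stores.  nir_intrinsic_store_ssbo's srcs are
 * (value, block, offset), while the TGSI form is "STORE BUFFER[n], offset,
 * value", which is why the is_store path above reorders the operands and
 * makes the buffer the instruction's destination.
 */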

static enum tgsi_texture_type
tgsi_target_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array)
{
   switch (dim) {
   case GLSL_SAMPLER_DIM_1D:
      return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
   case GLSL_SAMPLER_DIM_2D:
      return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
   case GLSL_SAMPLER_DIM_3D:
      return TGSI_TEXTURE_3D;
   case GLSL_SAMPLER_DIM_CUBE:
      return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
   case GLSL_SAMPLER_DIM_RECT:
      return TGSI_TEXTURE_RECT;
   case GLSL_SAMPLER_DIM_BUF:
      return TGSI_TEXTURE_BUFFER;
   default:
      unreachable("unknown sampler dim");
   }
}

static void
ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   unsigned op;
   struct ureg_src srcs[4];
   int num_src = 0;
   enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
   bool is_array = nir_intrinsic_image_array(instr);

   struct ureg_dst temp = ureg_dst_undef();

   enum tgsi_texture_type target = tgsi_target_from_sampler_dim(dim, is_array);

   struct ureg_src resource =
      ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
                            instr->src[0]);

   struct ureg_dst dst;
   if (instr->intrinsic == nir_intrinsic_image_store) {
      dst = ureg_dst(resource);
   } else {
      srcs[num_src++] = resource;
      dst = ntt_get_dest(c, &instr->dest);
   }

   if (instr->intrinsic != nir_intrinsic_image_size) {
      struct ureg_src coord = ntt_get_src(c, instr->src[1]);

      if (dim == GLSL_SAMPLER_DIM_MS) {
         temp = ureg_DECL_temporary(c->ureg);
         ureg_MOV(c->ureg, temp, coord);
         ureg_MOV(c->ureg, ureg_writemask(temp, 1 << (is_array ? 3 : 2)),
                  ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X));
         coord = ureg_src(temp);
      }
      srcs[num_src++] = coord;

      if (instr->intrinsic != nir_intrinsic_image_load) {
         srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
         if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
            srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_image_load:
      op = TGSI_OPCODE_LOAD;
      break;
   case nir_intrinsic_image_store:
      op = TGSI_OPCODE_STORE;
      break;
   case nir_intrinsic_image_size:
      op = TGSI_OPCODE_RESQ;
      break;
   case nir_intrinsic_image_atomic_add:
      op = TGSI_OPCODE_ATOMUADD;
      break;
   case nir_intrinsic_image_atomic_fadd:
      op = TGSI_OPCODE_ATOMFADD;
      break;
   case nir_intrinsic_image_atomic_imin:
      op = TGSI_OPCODE_ATOMIMIN;
      break;
   case nir_intrinsic_image_atomic_umin:
      op = TGSI_OPCODE_ATOMUMIN;
      break;
   case nir_intrinsic_image_atomic_imax:
      op = TGSI_OPCODE_ATOMIMAX;
      break;
   case nir_intrinsic_image_atomic_umax:
      op = TGSI_OPCODE_ATOMUMAX;
      break;
   case nir_intrinsic_image_atomic_and:
      op = TGSI_OPCODE_ATOMAND;
      break;
   case nir_intrinsic_image_atomic_or:
      op = TGSI_OPCODE_ATOMOR;
      break;
   case nir_intrinsic_image_atomic_xor:
      op = TGSI_OPCODE_ATOMXOR;
      break;
   case nir_intrinsic_image_atomic_exchange:
      op = TGSI_OPCODE_ATOMXCHG;
      break;
   case nir_intrinsic_image_atomic_comp_swap:
      op = TGSI_OPCODE_ATOMCAS;
      break;
   default:
      unreachable("bad op");
   }

   ureg_memory_insn(c->ureg, op, &dst, 1, srcs, num_src,
                    ntt_get_access_qualifier(instr),
                    target,
                    nir_intrinsic_format(instr));

   if (!ureg_dst_is_undef(temp))
      ureg_release_temporary(c->ureg, temp);
}
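
/* For illustration (not in the original source): NIR passes the MSAA sample
 * index as a separate src, but TGSI folds it into the coordinate (.z for
 * 2D_MSAA, .w for 2D_ARRAY_MSAA), hence the temp shuffle above, roughly:
 *
 *    MOV TEMP[0], coord
 *    MOV TEMP[0].z, sample.xxxx
 */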

static void
ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   uint32_t frac = nir_intrinsic_component(instr);
   uint32_t num_components = instr->num_components;
   unsigned base = nir_intrinsic_base(instr);
   struct ureg_src input;
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
   bool is_64 = nir_dest_bit_size(instr->dest) == 64;

   if (c->s->info.stage == MESA_SHADER_VERTEX) {
      input = ureg_DECL_vs_input(c->ureg, base);
      for (int i = 1; i < semantics.num_slots; i++)
         ureg_DECL_vs_input(c->ureg, base + i);
   } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
      unsigned semantic_name, semantic_index;
      ntt_get_gl_varying_semantic(c, semantics.location,
                                  &semantic_name, &semantic_index);

      /* XXX: ArrayID is used in r600 gs inputs */
      uint32_t array_id = 0;

      input = ureg_DECL_input_layout(c->ureg,
                                     semantic_name,
                                     semantic_index,
                                     base,
                                     ntt_tgsi_usage_mask(frac,
                                                         instr->num_components,
                                                         is_64),
                                     array_id,
                                     semantics.num_slots);
   } else {
      input = c->input_index_map[base];
   }

   if (is_64)
      num_components *= 2;

   input = ntt_shift_by_frac(input, frac, num_components);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_input:
      input = ntt_ureg_src_indirect(c, input, instr->src[0]);
      ntt_store(c, &instr->dest, input);
      break;

   case nir_intrinsic_load_per_vertex_input:
      input = ntt_ureg_src_indirect(c, input, instr->src[1]);
      input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
      ntt_store(c, &instr->dest, input);
      break;

   case nir_intrinsic_load_interpolated_input: {
      input = ntt_ureg_src_indirect(c, input, instr->src[1]);

      nir_intrinsic_instr *bary_instr =
         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);

      switch (bary_instr->intrinsic) {
      case nir_intrinsic_load_barycentric_pixel:
         ntt_store(c, &instr->dest, input);
         break;

      case nir_intrinsic_load_barycentric_centroid:
         /* If the input was declared centroid, then there's no need to
          * emit the extra TGSI interp instruction, we can just read the
          * input.
          */
         if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
            ntt_store(c, &instr->dest, input);
         } else {
            ureg_INTERP_CENTROID(c->ureg, ntt_get_dest(c, &instr->dest),
                                 input);
         }
         break;

      case nir_intrinsic_load_barycentric_at_sample:
         ureg_INTERP_SAMPLE(c->ureg, ntt_get_dest(c, &instr->dest), input,
                            ureg_imm1u(c->ureg,
                                       ntt_src_as_uint(c, bary_instr->src[0])));
         break;

      case nir_intrinsic_load_barycentric_at_offset:
         /* We stored the offset in the fake "bary" dest. */
         ureg_INTERP_OFFSET(c->ureg, ntt_get_dest(c, &instr->dest), input,
                            ntt_get_src(c, instr->src[0]));
         break;

      default:
         unreachable("bad barycentric interp intrinsic\n");
      }
      break;
   }

   default:
      unreachable("bad load input intrinsic\n");
   }
}

static void
ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   struct ureg_src src = ntt_get_src(c, instr->src[0]);

   if (src.File == TGSI_FILE_OUTPUT) {
      /* If our src is the output file, that's an indication that we were able
       * to emit the output stores in the generating instructions and we have
       * nothing to do here.
       */
      return;
   }

   uint32_t frac;
   struct ureg_dst out = ntt_store_output_decl(c, instr, &frac);

   if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
      out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
      out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
   } else {
      out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
   }

   uint8_t swizzle[4] = { 0, 0, 0, 0 };
   for (int i = frac; i < 4; i++) {
      if (out.WriteMask & (1 << i))
         swizzle[i] = i - frac;
   }

   src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);

   ureg_MOV(c->ureg, out, src);
   ntt_reladdr_dst_put(c, out);
}
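
/* For illustration (not in the original source): with frac == 2 the output
 * writemask is .zw, so the loop above builds swizzle[] = {0, 0, 0, 1}, i.e.
 * OUT.z reads src.x and OUT.w reads src.y in the final MOV.
 */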

static void
ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
   enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
   struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);

   /* virglrenderer doesn't like references to channels of the sysval that
    * aren't defined, even if they aren't really read.  (GLSL compile fails on
    * gl_NumWorkGroups.w, for example).
    */
   uint32_t write_mask = BITSET_MASK(nir_dest_num_components(instr->dest));
   sv = ntt_swizzle_for_write_mask(sv, write_mask);

   /* TGSI and NIR define these intrinsics as always loading ints, but they can
    * still appear on hardware with non-native-integers fragment shaders using
    * the draw path (i915g).  In that case, having called nir_lower_int_to_float
    * means that we actually want floats instead.
    */
   if (!c->native_integers) {
      switch (instr->intrinsic) {
      case nir_intrinsic_load_vertex_id:
      case nir_intrinsic_load_instance_id:
         ureg_U2F(c->ureg, ntt_get_dest(c, &instr->dest), sv);
         return;

      default:
         break;
      }
   }

   ntt_store(c, &instr->dest, sv);
}
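
/* For illustration (not in the original source): gl_NumWorkGroups is a
 * uvec3, so write_mask is XYZ and ntt_swizzle_for_write_mask() turns the
 * sysval read into .xyzx, keeping the undefined .w channel from ever being
 * referenced.
 */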

static void
ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
      ntt_emit_load_ubo(c, instr);
      break;

   /* Vertex */
   case nir_intrinsic_load_vertex_id:
   case nir_intrinsic_load_vertex_id_zero_base:
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_instance_id:
   case nir_intrinsic_load_draw_id:
   case nir_intrinsic_load_invocation_id:
   case nir_intrinsic_load_frag_coord:
   case nir_intrinsic_load_point_coord:
   case nir_intrinsic_load_front_face:
   case nir_intrinsic_load_sample_id:
   case nir_intrinsic_load_sample_mask_in:
   case nir_intrinsic_load_helper_invocation:
   case nir_intrinsic_load_tess_coord:
   case nir_intrinsic_load_patch_vertices_in:
   case nir_intrinsic_load_primitive_id:
   case nir_intrinsic_load_tess_level_outer:
   case nir_intrinsic_load_tess_level_inner:
   case nir_intrinsic_load_local_invocation_id:
   case nir_intrinsic_load_workgroup_id:
   case nir_intrinsic_load_num_workgroups:
   case nir_intrinsic_load_workgroup_size:
   case nir_intrinsic_load_subgroup_size:
   case nir_intrinsic_load_subgroup_invocation:
   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_lt_mask:
      ntt_emit_load_sysval(c, instr);
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_load_interpolated_input:
      ntt_emit_load_input(c, instr);
      break;

   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
      ntt_emit_store_output(c, instr);
      break;

   case nir_intrinsic_discard:
      ureg_KILL(c->ureg);
      break;

   case nir_intrinsic_discard_if: {
      struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);

      if (c->native_integers) {
         struct ureg_dst temp = ureg_writemask(ureg_DECL_temporary(c->ureg), 1);
         ureg_AND(c->ureg, temp, cond, ureg_imm1f(c->ureg, 1.0));
         ureg_KILL_IF(c->ureg, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
         ureg_release_temporary(c->ureg, temp);
      } else {
         /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
         ureg_KILL_IF(c->ureg, ureg_negate(cond));
      }
      break;
   }

   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_get_ssbo_size:
      ntt_emit_mem(c, instr, nir_var_mem_ssbo);
      break;

   case nir_intrinsic_load_shared:
   case nir_intrinsic_store_shared:
   case nir_intrinsic_shared_atomic_add:
   case nir_intrinsic_shared_atomic_fadd:
   case nir_intrinsic_shared_atomic_imin:
   case nir_intrinsic_shared_atomic_imax:
   case nir_intrinsic_shared_atomic_umin:
   case nir_intrinsic_shared_atomic_umax:
   case nir_intrinsic_shared_atomic_and:
   case nir_intrinsic_shared_atomic_or:
   case nir_intrinsic_shared_atomic_xor:
   case nir_intrinsic_shared_atomic_exchange:
   case nir_intrinsic_shared_atomic_comp_swap:
      ntt_emit_mem(c, instr, nir_var_mem_shared);
      break;

   case nir_intrinsic_image_load:
   case nir_intrinsic_image_store:
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_atomic_add:
   case nir_intrinsic_image_atomic_fadd:
   case nir_intrinsic_image_atomic_imin:
   case nir_intrinsic_image_atomic_umin:
   case nir_intrinsic_image_atomic_imax:
   case nir_intrinsic_image_atomic_umax:
   case nir_intrinsic_image_atomic_and:
   case nir_intrinsic_image_atomic_or:
   case nir_intrinsic_image_atomic_xor:
   case nir_intrinsic_image_atomic_exchange:
   case nir_intrinsic_image_atomic_comp_swap:
      ntt_emit_image_load_store(c, instr);
      break;

   case nir_intrinsic_control_barrier:
      ureg_BARRIER(c->ureg);
      break;

   case nir_intrinsic_memory_barrier:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
                                      TGSI_MEMBAR_SHADER_BUFFER |
                                      TGSI_MEMBAR_ATOMIC_BUFFER |
                                      TGSI_MEMBAR_SHADER_IMAGE |
                                      TGSI_MEMBAR_SHARED));
      break;

   case nir_intrinsic_memory_barrier_atomic_counter:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_ATOMIC_BUFFER));
      break;

   case nir_intrinsic_memory_barrier_buffer:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_BUFFER));
      break;

   case nir_intrinsic_memory_barrier_image:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_IMAGE));
      break;

   case nir_intrinsic_memory_barrier_shared:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHARED));
      break;

   case nir_intrinsic_group_memory_barrier:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
                                      TGSI_MEMBAR_SHADER_BUFFER |
                                      TGSI_MEMBAR_ATOMIC_BUFFER |
                                      TGSI_MEMBAR_SHADER_IMAGE |
                                      TGSI_MEMBAR_SHARED |
                                      TGSI_MEMBAR_THREAD_GROUP));
      break;

   case nir_intrinsic_end_primitive:
      ureg_ENDPRIM(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
      break;

   case nir_intrinsic_emit_vertex:
      ureg_EMIT(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
      break;

   /* In TGSI we don't actually generate the barycentric coords, and emit
    * interp intrinsics later.  However, we do need to store the _at_offset
    * argument so that we can use it at that point.
    */
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_at_sample:
      break;

   case nir_intrinsic_load_barycentric_at_offset:
      ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0]));
      break;

   default:
      fprintf(stderr, "Unknown intrinsic: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
      break;
   }
}

struct ntt_tex_operand_state {
   struct ureg_src srcs[4];
   unsigned i;
   unsigned chan;
   bool is_temp[4];
};

static void
ntt_push_tex_arg(struct ntt_compile *c,
                 nir_tex_instr *instr,
                 nir_tex_src_type tex_src_type,
                 struct ntt_tex_operand_state *s)
{
   int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
   if (tex_src < 0)
      return;

   struct ureg_src src = ntt_get_src(c, instr->src[tex_src].src);
   int num_components = nir_tex_instr_src_size(instr, tex_src);

   /* Find which src in the tex args we'll fit in. */
   if (s->chan + num_components > 4) {
      s->chan = 0;
      s->i++;
   }

   /* Would need to fix up swizzling up to the writemask channel here. */
   assert(num_components == 1 || s->chan == 0);
   if (num_components == 1)
      src = ureg_scalar(src, 0);

   if (ureg_src_is_undef(s->srcs[s->i])) {
      /* First emit of a tex operand's components, no need for a mov. */
      s->srcs[s->i] = src;
   } else {
      /* Otherwise, we need to have a temporary for all the components that go
       * in this operand.
       */
      if (!s->is_temp[s->i]) {
         struct ureg_src prev_src = s->srcs[s->i];
         s->srcs[s->i] = ureg_src(ureg_DECL_temporary(c->ureg));
         s->is_temp[s->i] = true;

         ureg_MOV(c->ureg,
                  ureg_writemask(ureg_dst(s->srcs[s->i]),
                                 BITFIELD_MASK(s->chan)), prev_src);
      }

      ureg_MOV(c->ureg,
               ureg_writemask(ureg_dst(s->srcs[s->i]),
                              BITFIELD_RANGE(s->chan, num_components)),
               src);
   }

   s->chan += num_components;
}
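
/* For illustration (not in the original source): a shadow sampler2D lookup
 * packs coord.xy and the comparator into one operand, so after
 * ntt_push_tex_arg(coord) and ntt_push_tex_arg(comparator) the emitted op is
 * roughly "TEX dst, TEMP[n], SAMP[0], SHADOW2D" with the comparator in
 * TEMP[n].z.
 */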
static void
ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
{
   struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
   unsigned target;
   unsigned tex_opcode;

   struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
   int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
   if (sampler_src >= 0) {
      struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
      sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr));
   }

   switch (instr->op) {
   case nir_texop_tex:
      tex_opcode = TGSI_OPCODE_TEX;
      break;
   case nir_texop_txf:
   case nir_texop_txf_ms:
      tex_opcode = TGSI_OPCODE_TXF;

      if (c->has_txf_lz) {
         int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
         if (lod_src >= 0 &&
             nir_src_is_const(instr->src[lod_src].src) &&
             ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
            tex_opcode = TGSI_OPCODE_TXF_LZ;
         }
      }
      break;
   case nir_texop_txl:
      tex_opcode = TGSI_OPCODE_TXL;
      break;
   case nir_texop_txb:
      tex_opcode = TGSI_OPCODE_TXB;
      break;
   case nir_texop_txd:
      tex_opcode = TGSI_OPCODE_TXD;
      break;
   case nir_texop_txs:
      tex_opcode = TGSI_OPCODE_TXQ;
      break;
   case nir_texop_tg4:
      tex_opcode = TGSI_OPCODE_TG4;
      break;
   case nir_texop_query_levels:
      tex_opcode = TGSI_OPCODE_TXQ;
      break;
   case nir_texop_lod:
      tex_opcode = TGSI_OPCODE_LODQ;
      break;
   case nir_texop_texture_samples:
      tex_opcode = TGSI_OPCODE_TXQS;
      break;
   default:
      unreachable("unsupported tex op");
   }

   struct ntt_tex_operand_state s = { .i = 0 };
   ntt_push_tex_arg(c, instr, nir_tex_src_coord, &s);
   /* We always have at least two slots for the coordinate, even on 1D. */
   s.chan = MAX2(s.chan, 2);

   ntt_push_tex_arg(c, instr, nir_tex_src_comparator, &s);
   s.chan = MAX2(s.chan, 3);

   ntt_push_tex_arg(c, instr, nir_tex_src_bias, &s);
   if (tex_opcode != TGSI_OPCODE_TXF_LZ)
      ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);

   /* End of packed src setup, everything that follows gets its own operand. */
   if (s.chan)
      s.i++;

   switch (instr->sampler_dim) {
   case GLSL_SAMPLER_DIM_1D:
      if (instr->is_array) {
         if (instr->is_shadow) {
            target = TGSI_TEXTURE_SHADOW1D_ARRAY;
         } else {
            target = TGSI_TEXTURE_1D_ARRAY;
         }
      } else {
         if (instr->is_shadow) {
            target = TGSI_TEXTURE_SHADOW1D;
         } else {
            target = TGSI_TEXTURE_1D;
         }
      }
      break;
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      if (instr->is_array) {
         if (instr->is_shadow) {
            target = TGSI_TEXTURE_SHADOW2D_ARRAY;
         } else {
            target = TGSI_TEXTURE_2D_ARRAY;
         }
      } else {
         if (instr->is_shadow) {
            target = TGSI_TEXTURE_SHADOW2D;
         } else {
            target = TGSI_TEXTURE_2D;
         }
      }
      break;
   case GLSL_SAMPLER_DIM_MS:
      if (instr->is_array) {
         target = TGSI_TEXTURE_2D_ARRAY_MSAA;
      } else {
         target = TGSI_TEXTURE_2D_MSAA;
      }
      break;
   case GLSL_SAMPLER_DIM_3D:
      assert(!instr->is_shadow);
      target = TGSI_TEXTURE_3D;
      break;
   case GLSL_SAMPLER_DIM_RECT:
      if (instr->is_shadow) {
         target = TGSI_TEXTURE_SHADOWRECT;
      } else {
         target = TGSI_TEXTURE_RECT;
      }
      break;
   case GLSL_SAMPLER_DIM_CUBE:
      if (instr->is_array) {
         if (instr->is_shadow) {
            target = TGSI_TEXTURE_SHADOWCUBE_ARRAY;
         } else {
            target = TGSI_TEXTURE_CUBE_ARRAY;
         }
      } else {
         if (instr->is_shadow) {
            target = TGSI_TEXTURE_SHADOWCUBE;
         } else {
            target = TGSI_TEXTURE_CUBE;
         }
      }
      break;
   case GLSL_SAMPLER_DIM_BUF:
      target = TGSI_TEXTURE_BUFFER;
      break;
   default:
      fprintf(stderr, "Unknown sampler dimensions: %d\n", instr->sampler_dim);
      abort();
   }

   if (s.i > 1) {
      if (tex_opcode == TGSI_OPCODE_TEX)
         tex_opcode = TGSI_OPCODE_TEX2;
      if (tex_opcode == TGSI_OPCODE_TXB)
         tex_opcode = TGSI_OPCODE_TXB2;
      if (tex_opcode == TGSI_OPCODE_TXL)
         tex_opcode = TGSI_OPCODE_TXL2;
   }

   if (instr->op == nir_texop_txd) {
      /* Derivs appear in their own src args */
      int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
      int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
      s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
      s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
   }

   if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
      if (c->screen->get_param(c->screen,
                               PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
         sampler = ureg_scalar(sampler, instr->component);
         s.srcs[s.i++] = ureg_src_undef();
      } else {
         s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
      }
   }

   s.srcs[s.i++] = sampler;

   enum tgsi_return_type tex_type;
   switch (instr->dest_type) {
   case nir_type_float32:
      tex_type = TGSI_RETURN_TYPE_FLOAT;
      break;
   case nir_type_int32:
      tex_type = TGSI_RETURN_TYPE_SINT;
      break;
   case nir_type_uint32:
      tex_type = TGSI_RETURN_TYPE_UINT;
      break;
   default:
      unreachable("unknown texture type");
   }
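
   /* TGSI carries texel offsets (e.g. texelFetchOffset()) as a separate
    * tgsi_texture_offset array on the instruction rather than as an extra
    * source register, so translate the NIR offset src into that form.
    */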
   struct tgsi_texture_offset tex_offsets[4];
   unsigned num_tex_offsets = 0;
   int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
   if (tex_offset_src >= 0) {
      struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);

      tex_offsets[0].File = offset.File;
      tex_offsets[0].Index = offset.Index;
      tex_offsets[0].SwizzleX = offset.SwizzleX;
      tex_offsets[0].SwizzleY = offset.SwizzleY;
      tex_offsets[0].SwizzleZ = offset.SwizzleZ;
      tex_offsets[0].Padding = 0;

      num_tex_offsets = 1;
   }

   struct ureg_dst tex_dst;
   if (instr->op == nir_texop_query_levels)
      tex_dst = ureg_writemask(ureg_DECL_temporary(c->ureg), TGSI_WRITEMASK_W);
   else
      tex_dst = dst;

   ureg_tex_insn(c->ureg, tex_opcode,
                 &tex_dst, 1,
                 target,
                 tex_type,
                 tex_offsets, num_tex_offsets,
                 s.srcs, s.i);

   if (instr->op == nir_texop_query_levels) {
      ureg_MOV(c->ureg, dst, ureg_scalar(ureg_src(tex_dst), 3));
      ureg_release_temporary(c->ureg, tex_dst);
   }

   for (int i = 0; i < s.i; i++) {
      if (s.is_temp[i])
         ureg_release_temporary(c->ureg, ureg_dst(s.srcs[i]));
   }
}

static void
ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
{
   switch (jump->type) {
   case nir_jump_break:
      ureg_BRK(c->ureg);
      break;

   case nir_jump_continue:
      ureg_CONT(c->ureg);
      break;

   default:
      fprintf(stderr, "Unknown jump instruction: ");
      nir_print_instr(&jump->instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }
}

static void
ntt_emit_ssa_undef(struct ntt_compile *c, nir_ssa_undef_instr *instr)
{
   /* Nothing to do but make sure that we have some storage to deref. */
   (void)ntt_get_ssa_def_decl(c, &instr->def);
}

static void
ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
{
   /* There is no addr reg in use before we start emitting an instr. */
   c->next_addr_reg = 0;

   switch (instr->type) {
   case nir_instr_type_deref:
      /* ignored, will be walked by nir_intrinsic_image_*_deref. */
      break;

   case nir_instr_type_alu:
      ntt_emit_alu(c, nir_instr_as_alu(instr));
      break;

   case nir_instr_type_intrinsic:
      ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
      break;

   case nir_instr_type_load_const:
      /* Nothing to do here, as load consts are done directly from
       * ntt_get_src() (since many constant NIR srcs will often get folded
       * directly into a register file index instead of as a TGSI src).
       */
      break;

   case nir_instr_type_tex:
      ntt_emit_texture(c, nir_instr_as_tex(instr));
      break;

   case nir_instr_type_jump:
      ntt_emit_jump(c, nir_instr_as_jump(instr));
      break;

   case nir_instr_type_ssa_undef:
      ntt_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
      break;

   default:
      fprintf(stderr, "Unknown NIR instr type: ");
      nir_print_instr(instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }
}
static void
ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
{
   unsigned label;
   ureg_UIF(c->ureg, c->if_cond, &label);
   ntt_emit_cf_list(c, &if_stmt->then_list);

   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
      ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
      ureg_ELSE(c->ureg, &label);
      ntt_emit_cf_list(c, &if_stmt->else_list);
   }

   ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
   ureg_ENDIF(c->ureg);
}

static void
ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
{
   /* GLSL-to-TGSI never set the begin/end labels to anything, even though
    * nvfx does reference BGNLOOP's label.  Follow GLSL-to-TGSI's behavior
    * unless something comes up with a need.
    */
   unsigned begin_label;
   ureg_BGNLOOP(c->ureg, &begin_label);
   ntt_emit_cf_list(c, &loop->body);

   unsigned end_label;
   ureg_ENDLOOP(c->ureg, &end_label);
}

static void
ntt_free_ssa_temp_by_index(struct ntt_compile *c, int index)
{
   /* We do store CONST/IMM/INPUT/etc. in ssa_temp[] */
   if (c->ssa_temp[index].File != TGSI_FILE_TEMPORARY)
      return;

   ureg_release_temporary(c->ureg, c->ssa_temp[index]);
   memset(&c->ssa_temp[index], 0, sizeof(c->ssa_temp[index]));
}

/* Releases any temporaries for SSA defs with a live interval ending at this
 * instruction.
 */
static bool
ntt_src_live_interval_end_cb(nir_src *src, void *state)
{
   struct ntt_compile *c = state;

   if (src->is_ssa) {
      nir_ssa_def *def = src->ssa;

      if (c->liveness->defs[def->index].end == src->parent_instr->index)
         ntt_free_ssa_temp_by_index(c, def->index);
   }

   return true;
}

static void
ntt_emit_block(struct ntt_compile *c, nir_block *block)
{
   nir_foreach_instr(instr, block) {
      ntt_emit_instr(c, instr);

      nir_foreach_src(instr, ntt_src_live_interval_end_cb, c);
   }

   /* Set up the if condition for ntt_emit_if(), which we have to do before
    * freeing up the temps (the "if" is treated as inside the block for
    * liveness purposes, despite not being an instruction)
    *
    * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
    * looks at all of .xyzw.  No harm in working around the bug.
    */
   nir_if *nif = nir_block_get_following_if(block);
   if (nif)
      c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X);

   /* Free up any SSA temps that are unused at the end of the block. */
   unsigned index;
   BITSET_FOREACH_SET(index, block->live_out, BITSET_WORDS(c->impl->ssa_alloc)) {
      unsigned def_end_ip = c->liveness->defs[index].end;
      if (def_end_ip == block->end_ip)
         ntt_free_ssa_temp_by_index(c, index);
   }
}

static void
ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
{
   /* There is no addr reg in use before we start emitting any part of a CF
    * node (such as an if condition)
    */
   c->next_addr_reg = 0;

   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block:
         ntt_emit_block(c, nir_cf_node_as_block(node));
         break;

      case nir_cf_node_if:
         ntt_emit_if(c, nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         ntt_emit_loop(c, nir_cf_node_as_loop(node));
         break;

      default:
         unreachable("unknown CF type");
      }
   }
}

static void
ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
{
   c->impl = impl;
   c->liveness = nir_live_ssa_defs_per_instr(impl);

   c->ssa_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
   c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);

   ntt_setup_registers(c, &impl->registers);
   ntt_emit_cf_list(c, &impl->body);

   ralloc_free(c->liveness);
   c->liveness = NULL;
}

static int
type_size(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}

/* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
 * can handle for 64-bit values in TGSI.
 */
static bool
ntt_should_vectorize_instr(const nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);

   switch (alu->op) {
   case nir_op_ibitfield_extract:
   case nir_op_ubitfield_extract:
   case nir_op_bitfield_insert:
      /* virglrenderer only looks at the .x channel of the offset/bits operands
       * when translating to GLSL.  tgsi.rst doesn't seem to require scalar
       * offset/bits operands.
       *
       * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195
       */
      return false;

   default:
      break;
   }

   unsigned num_components = alu->dest.dest.ssa.num_components;

   int src_bit_size = nir_src_bit_size(alu->src[0].src);
   int dst_bit_size = nir_dest_bit_size(alu->dest.dest);

   if (src_bit_size == 64 || dst_bit_size == 64) {
      if (num_components > 1)
         return false;
   }

   return true;
}
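
/* Worked example of the wrap check below (illustrative): with align == 4 the
 * merged access could start at any component, so the worst case is starting
 * at .w and only a single-component access passes; with align == 8 the worst
 * case start is .z, leaving room for two components.
 */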
static bool
ntt_should_vectorize_io(unsigned align, unsigned bit_size,
                        unsigned num_components, unsigned high_offset,
                        nir_intrinsic_instr *low, nir_intrinsic_instr *high,
                        void *data)
{
   if (bit_size != 32)
      return false;

   /* Our offset alignment should always be at least 4 bytes */
   if (align < 4)
      return false;

   /* No wrapping off the end of a TGSI reg.  We could do a bit better by
    * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
    * need this restriction.
    */
   unsigned worst_start_component = align == 4 ? 3 : align / 4;
   if (worst_start_component + num_components > 4)
      return false;

   return true;
}

static nir_variable_mode
ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
{
   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
   unsigned indirect_mask = 0;

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
      indirect_mask |= nir_var_shader_in;
   }

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
      indirect_mask |= nir_var_shader_out;
   }

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
      indirect_mask |= nir_var_function_temp;
   }

   return indirect_mask;
}

static void
ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
{
   bool progress;
   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
   unsigned control_flow_depth =
      screen->get_shader_param(screen, pipe_stage,
                               PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
   do {
      progress = false;

      NIR_PASS_V(s, nir_lower_vars_to_ssa);

      NIR_PASS(progress, s, nir_copy_prop);
      NIR_PASS(progress, s, nir_opt_algebraic);
      NIR_PASS(progress, s, nir_opt_constant_folding);
      NIR_PASS(progress, s, nir_opt_remove_phis);
      NIR_PASS(progress, s, nir_opt_conditional_discard);
      NIR_PASS(progress, s, nir_opt_dce);
      NIR_PASS(progress, s, nir_opt_dead_cf);
      NIR_PASS(progress, s, nir_opt_cse);
      NIR_PASS(progress, s, nir_opt_find_array_copies);
      NIR_PASS(progress, s, nir_opt_if, true);
      NIR_PASS(progress, s, nir_opt_peephole_select,
               control_flow_depth == 0 ? ~0 : 8, true, true);
      NIR_PASS(progress, s, nir_opt_algebraic);
      NIR_PASS(progress, s, nir_opt_constant_folding);
      nir_load_store_vectorize_options vectorize_opts = {
         .modes = nir_var_mem_ubo,
         .callback = ntt_should_vectorize_io,
         .robust_modes = 0,
      };
      NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
      NIR_PASS(progress, s, nir_opt_shrink_vectors, true);
      NIR_PASS(progress, s, nir_opt_trivial_continues);
      NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
      NIR_PASS(progress, s, nir_opt_undef);
      NIR_PASS(progress, s, nir_opt_loop_unroll, no_indirects_mask);

   } while (progress);
}
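
/* 64-bit lowering: a TGSI register slot holds at most a vec2 of 64-bit
 * values (two channels each), so the passes below split 64-bit ALU ops,
 * intrinsics, and load_consts into vec2-sized pieces before translation.
 */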

/* Scalarizes all 64-bit ALU ops.  Note that we only actually need to
 * scalarize vec3/vec4s; we should probably fix that.
 */
static bool
scalarize_64bit(const nir_instr *instr, const void *data)
{
   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   return (nir_dest_bit_size(alu->dest.dest) == 64 ||
           nir_src_bit_size(alu->src[0].src) == 64);
}

static bool
nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
{
   b->cursor = nir_after_instr(&instr->instr);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_interpolated_input:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_ssbo:
      break;
   default:
      return false;
   }

   if (instr->num_components <= 2)
      return false;

   bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
   if (has_dest) {
      if (nir_dest_bit_size(instr->dest) != 64)
         return false;
   } else {
      if (nir_src_bit_size(instr->src[0]) != 64)
         return false;
   }

   nir_intrinsic_instr *first =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
   nir_intrinsic_instr *second =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_ssbo:
      break;

   default: {
      nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
      semantics.location++;
      semantics.num_slots--;
      nir_intrinsic_set_io_semantics(second, semantics);

      nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
      break;
   }
   }

   first->num_components = 2;
   second->num_components -= 2;
   if (has_dest) {
      first->dest.ssa.num_components = 2;
      second->dest.ssa.num_components -= 2;
   }

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   if (has_dest) {
      /* Merge the two loads' results back into a vector. */
      nir_ssa_def *channels[4] = {
         nir_channel(b, &first->dest.ssa, 0),
         nir_channel(b, &first->dest.ssa, 1),
         nir_channel(b, &second->dest.ssa, 0),
         second->num_components > 1 ? nir_channel(b, &second->dest.ssa, 1) : NULL,
      };
      nir_ssa_def *new = nir_vec(b, channels, instr->num_components);
      nir_ssa_def_rewrite_uses(&instr->dest.ssa, new);
   } else {
      /* Split the src value across the two stores. */
      b->cursor = nir_before_instr(&instr->instr);

      nir_ssa_def *src0 = instr->src[0].ssa;
      nir_ssa_def *channels[4] = { 0 };
      for (int i = 0; i < instr->num_components; i++)
         channels[i] = nir_channel(b, src0, i);

      nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
      nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);

      nir_instr_rewrite_src(&first->instr, &first->src[0],
                            nir_src_for_ssa(nir_vec(b, channels, 2)));
      nir_instr_rewrite_src(&second->instr, &second->src[0],
                            nir_src_for_ssa(nir_vec(b, &channels[2],
                                                    second->num_components)));
   }

   int offset_src = -1;
   uint32_t offset_amount = 16;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ubo:
      offset_src = 1;
      break;
   case nir_intrinsic_load_ubo_vec4:
      offset_src = 1;
      offset_amount = 1;
      break;
   case nir_intrinsic_store_ssbo:
      offset_src = 2;
      break;
   default:
      break;
   }
   if (offset_src != -1) {
      b->cursor = nir_before_instr(&second->instr);
      nir_ssa_def *second_offset =
         nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
      nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
                            nir_src_for_ssa(second_offset));
   }

   /* DCE stores we generated with no writemask (nothing else does this
    * currently).
    */
   if (!has_dest) {
      if (nir_intrinsic_write_mask(first) == 0)
         nir_instr_remove(&first->instr);
      if (nir_intrinsic_write_mask(second) == 0)
         nir_instr_remove(&second->instr);
   }

   nir_instr_remove(&instr->instr);

   return true;
}
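
/* Example of the intrinsic split above (illustrative NIR, not an exact
 * printout): a 64-bit vec4 UBO load
 *
 *    vec4 64 ssa_2 = intrinsic load_ubo (ssa_0, ssa_1)
 *
 * becomes two vec2 loads, with the second one's byte offset bumped by 16,
 * merged back together for the original load's users:
 *
 *    vec2 64 ssa_3 = intrinsic load_ubo (ssa_0, ssa_1)
 *    vec2 64 ssa_4 = intrinsic load_ubo (ssa_0, ssa_1 + 16)
 *    vec4 64 ssa_5 = vec4 ssa_3.x, ssa_3.y, ssa_4.x, ssa_4.y
 */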

static bool
nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (instr->def.bit_size != 64 || num_components <= 2)
      return false;

   b->cursor = nir_before_instr(&instr->instr);

   nir_load_const_instr *first =
      nir_load_const_instr_create(b->shader, 2, 64);
   nir_load_const_instr *second =
      nir_load_const_instr_create(b->shader, num_components - 2, 64);

   first->value[0] = instr->value[0];
   first->value[1] = instr->value[1];
   second->value[0] = instr->value[2];
   if (num_components == 4)
      second->value[1] = instr->value[3];

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   nir_ssa_def *channels[4] = {
      nir_channel(b, &first->def, 0),
      nir_channel(b, &first->def, 1),
      nir_channel(b, &second->def, 0),
      num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
   };
   nir_ssa_def *new = nir_vec(b, channels, num_components);
   nir_ssa_def_rewrite_uses(&instr->def, new);
   nir_instr_remove(&instr->instr);

   return true;
}

static bool
nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
                                      void *data)
{
   switch (instr->type) {
   case nir_instr_type_load_const:
      return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));

   case nir_instr_type_intrinsic:
      return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
   default:
      return false;
   }
}

static bool
nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
{
   return nir_shader_instructions_pass(s,
                                       nir_to_tgsi_lower_64bit_to_vec2_instr,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance,
                                       NULL);
}

static void
ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s)
{
   const struct nir_shader_compiler_options *options = s->options;
   bool lower_fsqrt =
      !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
                                PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);

   if (!options->lower_extract_byte ||
       !options->lower_extract_word ||
       !options->lower_insert_byte ||
       !options->lower_insert_word ||
       !options->lower_fdph ||
       !options->lower_flrp64 ||
       !options->lower_fmod ||
       !options->lower_rotate ||
       !options->lower_uniforms_to_ubo ||
       !options->lower_vector_cmp ||
       options->lower_fsqrt != lower_fsqrt) {
      nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
      *new_options = *s->options;

      new_options->lower_extract_byte = true;
      new_options->lower_extract_word = true;
      new_options->lower_insert_byte = true;
      new_options->lower_insert_word = true;
      new_options->lower_fdph = true;
      new_options->lower_flrp64 = true;
      new_options->lower_fmod = true;
      new_options->lower_rotate = true;
      new_options->lower_uniforms_to_ubo = true;
      new_options->lower_vector_cmp = true;
      new_options->lower_fsqrt = lower_fsqrt;

      s->options = new_options;
   }
}

/**
 * Translates the NIR shader to TGSI.
 *
 * This requires some lowering of the NIR shader to prepare it for translation.
 * We take ownership of the NIR shader passed, returning a reference to the new
 * TGSI tokens instead.  If you need to keep the NIR, then pass us a clone.
 */
const void *
nir_to_tgsi(struct nir_shader *s,
            struct pipe_screen *screen)
{
   struct ntt_compile *c;
   const void *tgsi_tokens;
   bool debug = env_var_as_boolean("NIR_TO_TGSI_DEBUG", false);
   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
   bool native_integers = screen->get_shader_param(screen,
                                                   pipe_shader_type_from_mesa(s->info.stage),
                                                   PIPE_SHADER_CAP_INTEGERS);
   const struct nir_shader_compiler_options *original_options = s->options;

   ntt_fix_nir_options(screen, s);

   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
              type_size, (nir_lower_io_options)0);
   NIR_PASS_V(s, nir_lower_regs_to_ssa);

   const nir_lower_tex_options lower_tex_options = {
      /* XXX: We could skip lowering of TXP for TEX with <= 3 coord_components.
       */
      .lower_txp = ~0,
   };
   NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);

   if (!original_options->lower_uniforms_to_ubo) {
      NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
                 screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
                 !native_integers);
   }

   /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
    * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
    * duplication logic we just make it so that we only see vec2s.
    */
   NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
   NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);

   if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
      NIR_PASS_V(s, nir_lower_ubo_vec4);

   ntt_optimize_nir(s, screen);

   NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);

   bool progress;
   do {
      progress = false;
      NIR_PASS(progress, s, nir_opt_algebraic_late);
      if (progress) {
         NIR_PASS_V(s, nir_copy_prop);
         NIR_PASS_V(s, nir_opt_dce);
         NIR_PASS_V(s, nir_opt_cse);
      }
   } while (progress);

   if (screen->get_shader_param(screen,
                                pipe_shader_type_from_mesa(s->info.stage),
                                PIPE_SHADER_CAP_INTEGERS)) {
      NIR_PASS_V(s, nir_lower_bool_to_int32);
   } else {
      NIR_PASS_V(s, nir_lower_int_to_float);
      NIR_PASS_V(s, nir_lower_bool_to_float);
      /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
      NIR_PASS_V(s, nir_copy_prop);
      NIR_PASS_V(s, nir_opt_dce);
   }

   /* Only lower 32-bit floats.  The only other modifier type officially
    * supported by TGSI is 32-bit integer negates, but even those are broken on
    * virglrenderer, so skip lowering all integer and f64 float mods.
    */
   NIR_PASS_V(s, nir_lower_to_source_mods, nir_lower_float_source_mods);
   NIR_PASS_V(s, nir_convert_from_ssa, true);
   NIR_PASS_V(s, nir_lower_vec_to_movs, NULL, NULL);

   /* locals_to_regs will leave dead derefs that are good to clean up. */
   NIR_PASS_V(s, nir_lower_locals_to_regs);
   NIR_PASS_V(s, nir_opt_dce);

   if (debug) {
      fprintf(stderr, "NIR before translation to TGSI:\n");
      nir_print_shader(s, stderr);
   }

   c = rzalloc(NULL, struct ntt_compile);
   c->screen = screen;

   c->needs_texcoord_semantic =
      screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
   c->any_reg_as_address =
      screen->get_param(screen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS);
   c->has_txf_lz =
      screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);

   c->s = s;
   c->native_integers = native_integers;
   c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
   ureg_setup_shader_info(c->ureg, &s->info);

   ntt_setup_inputs(c);
   ntt_setup_uniforms(c);

   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      /* The draw module's polygon stipple layer doesn't respect the chosen
       * coordinate mode, so leave it as unspecified unless we're actually
       * reading the position in the shader already.  See
       * gl-2.1-polygon-stipple-fs on softpipe.
       */
      if ((s->info.inputs_read & VARYING_BIT_POS) ||
          BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
                       s->info.fs.origin_upper_left ?
                       TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
                       TGSI_FS_COORD_ORIGIN_LOWER_LEFT);

         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
                       s->info.fs.pixel_center_integer ?
                       TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
                       TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
      }
   }

   /* Emit the main function */
   nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
   ntt_emit_impl(c, impl);
   ureg_END(c->ureg);

   tgsi_tokens = ureg_get_tokens(c->ureg, NULL);

   if (debug) {
      fprintf(stderr, "TGSI after translation from NIR:\n");
      tgsi_dump(tgsi_tokens, 0);
   }

   ureg_destroy(c->ureg);

   ralloc_free(c);
   ralloc_free(s);

   return tgsi_tokens;
}
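
/* Typical use by a driver's shader-state create hook (sketch; the state
 * names are illustrative):
 *
 *    if (cso->type == PIPE_SHADER_IR_NIR)
 *       tokens = nir_to_tgsi(cso->ir.nir, pscreen);
 *
 * Note that the NIR shader is consumed: callers that need to keep it around
 * must pass in a clone, per the comment above.
 */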

static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_fdph = true,
   .lower_flrp64 = true,
   .lower_fmod = true,
   .lower_rotate = true,
   .lower_uniforms_to_ubo = true,
   .lower_vector_cmp = true,
   .use_interpolated_input_intrinsics = true,
};

/* Returns default compiler options for drivers with only nir-to-tgsi-based
 * NIR support.
 */
const void *
nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
                                 enum pipe_shader_ir ir,
                                 unsigned shader)
{
   assert(ir == PIPE_SHADER_IR_NIR);
   return &nir_to_tgsi_compiler_options;
}
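
/* Example wiring in a driver's screen setup (sketch; assumes a gallium driver
 * whose compiler consumes TGSI only):
 *
 *    pscreen->get_compiler_options = nir_to_tgsi_get_compiler_options;
 */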