Path: blob/21.2-virgl/src/amd/compiler/aco_instruction_selection.cpp
7097 views
/*1* Copyright © 2018 Valve Corporation2* Copyright © 2018 Google3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* the rights to use, copy, modify, merge, publish, distribute, sublicense,8* and/or sell copies of the Software, and to permit persons to whom the9* Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS21* IN THE SOFTWARE.22*23*/2425#include "aco_instruction_selection.h"2627#include "aco_builder.h"28#include "aco_ir.h"2930#include "common/ac_exp_param.h"31#include "common/sid.h"32#include "vulkan/radv_descriptor_set.h"3334#include "util/fast_idiv_by_const.h"35#include "util/memstream.h"3637#include <array>38#include <functional>39#include <map>40#include <numeric>41#include <stack>42#include <vector>4344namespace aco {45namespace {4647#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)4849static void50_isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,51const char* msg)52{53char* out;54size_t outsize;55struct u_memstream mem;56u_memstream_open(&mem, &out, &outsize);57FILE* const memf = u_memstream_get(&mem);5859fprintf(memf, "%s: ", msg);60nir_print_instr(instr, memf);61u_memstream_close(&mem);6263_aco_err(ctx->program, file, line, out);64free(out);65}6667struct if_context {68Temp cond;6970bool divergent_old;71bool exec_potentially_empty_discard_old;72bool exec_potentially_empty_break_old;73uint16_t exec_potentially_empty_break_depth_old;7475unsigned BB_if_idx;76unsigned invert_idx;77bool uniform_has_then_branch;78bool then_branch_divergent;79Block BB_invert;80Block BB_endif;81};8283struct loop_context {84Block loop_exit;8586unsigned header_idx_old;87Block* exit_old;88bool divergent_cont_old;89bool divergent_branch_old;90bool divergent_if_old;91};9293static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list);9495static void96add_logical_edge(unsigned pred_idx, Block* succ)97{98succ->logical_preds.emplace_back(pred_idx);99}100101static void102add_linear_edge(unsigned pred_idx, Block* succ)103{104succ->linear_preds.emplace_back(pred_idx);105}106107static void108add_edge(unsigned pred_idx, Block* succ)109{110add_logical_edge(pred_idx, succ);111add_linear_edge(pred_idx, succ);112}113114static void115append_logical_start(Block* b)116{117Builder(NULL, b).pseudo(aco_opcode::p_logical_start);118}119120static void121append_logical_end(Block* b)122{123Builder(NULL, b).pseudo(aco_opcode::p_logical_end);124}125126Temp127get_ssa_temp(struct isel_context* ctx, nir_ssa_def* def)128{129uint32_t id = ctx->first_temp_id + def->index;130return Temp(id, ctx->program->temp_rc[id]);131}132133Temp134emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())135{136Builder bld(ctx->program, ctx->block);137assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));138assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());139140if (ctx->program->wave_size == 32) {141Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;142return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);143}144145Operand mask_lo = Operand::c32(-1u);146Operand mask_hi = Operand::c32(-1u);147148if (mask.isTemp()) {149RegClass rc = RegClass(mask.regClass().type(), 1);150Builder::Result mask_split =151bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);152mask_lo = Operand(mask_split.def(0).getTemp());153mask_hi = Operand(mask_split.def(1).getTemp());154} else if (mask.physReg() == exec) {155mask_lo = Operand(exec_lo, s1);156mask_hi = Operand(exec_hi, s1);157}158159Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);160161if (ctx->program->chip_class <= GFX7)162return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);163else164return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);165}166167Temp168emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false)169{170if (!dst.id())171dst = bld.tmp(src.regClass());172173assert(src.size() == dst.size());174175if (bld.program->stage != fragment_fs) {176if (!dst.id())177return src;178179bld.copy(Definition(dst), src);180return dst;181}182183bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);184bld.program->needs_wqm |= program_needs_wqm;185return dst;186}187188static Temp189emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)190{191if (index.regClass() == s1)192return bld.readlane(bld.def(s1), data, index);193194if (ctx->options->chip_class <= GFX7) {195/* GFX6-7: there is no bpermute instruction */196Operand index_op(index);197Operand input_data(data);198index_op.setLateKill(true);199input_data.setLateKill(true);200201return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc),202index_op, input_data);203} else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {204205/* GFX10 wave64 mode: emulate full-wave bpermute */206Temp index_is_lo =207bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);208Builder::Result index_is_lo_split =209bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);210Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),211index_is_lo_split.def(1).getTemp());212Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),213index_is_lo_split.def(0).getTemp(), index_is_lo_n1);214Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);215Operand input_data(data);216217index_x4.setLateKill(true);218input_data.setLateKill(true);219same_half.setLateKill(true);220221/* We need one pair of shared VGPRs:222* Note, that these have twice the allocation granularity of normal VGPRs */223ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;224225return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),226index_x4, input_data, same_half);227} else {228/* GFX8-9 or GFX10 wave32: bpermute works normally */229Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);230return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);231}232}233234static Temp235emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)236{237if (ctx->options->chip_class >= GFX8) {238unsigned and_mask = mask & 0x1f;239unsigned or_mask = (mask >> 5) & 0x1f;240unsigned xor_mask = (mask >> 10) & 0x1f;241242uint16_t dpp_ctrl = 0xffff;243244// TODO: we could use DPP8 for some swizzles245if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {246unsigned res[4] = {0, 1, 2, 3};247for (unsigned i = 0; i < 4; i++)248res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;249dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);250} else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {251dpp_ctrl = dpp_row_rr(8);252} else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {253dpp_ctrl = dpp_row_mirror;254} else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {255dpp_ctrl = dpp_row_half_mirror;256}257258if (dpp_ctrl != 0xffff)259return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);260}261262return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);263}264265Temp266as_vgpr(isel_context* ctx, Temp val)267{268if (val.type() == RegType::sgpr) {269Builder bld(ctx->program, ctx->block);270return bld.copy(bld.def(RegType::vgpr, val.size()), val);271}272assert(val.type() == RegType::vgpr);273return val;274}275276// assumes a != 0xffffffff277void278emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b)279{280assert(b != 0);281Builder bld(ctx->program, ctx->block);282283if (util_is_power_of_two_or_zero(b)) {284bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(util_logbase2(b)), a);285return;286}287288util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);289290assert(info.multiplier <= 0xffffffff);291292bool pre_shift = info.pre_shift != 0;293bool increment = info.increment != 0;294bool multiply = true;295bool post_shift = info.post_shift != 0;296297if (!pre_shift && !increment && !multiply && !post_shift) {298bld.copy(Definition(dst), a);299return;300}301302Temp pre_shift_dst = a;303if (pre_shift) {304pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;305bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand::c32(info.pre_shift),306a);307}308309Temp increment_dst = pre_shift_dst;310if (increment) {311increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;312bld.vadd32(Definition(increment_dst), Operand::c32(info.increment), pre_shift_dst);313}314315Temp multiply_dst = increment_dst;316if (multiply) {317multiply_dst = post_shift ? bld.tmp(v1) : dst;318bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,319bld.copy(bld.def(v1), Operand::c32(info.multiplier)));320}321322if (post_shift) {323bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(info.post_shift),324multiply_dst);325}326}327328void329emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)330{331Builder bld(ctx->program, ctx->block);332bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));333}334335Temp336emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)337{338/* no need to extract the whole vector */339if (src.regClass() == dst_rc) {340assert(idx == 0);341return src;342}343344assert(src.bytes() > (idx * dst_rc.bytes()));345Builder bld(ctx->program, ctx->block);346auto it = ctx->allocated_vec.find(src.id());347if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {348if (it->second[idx].regClass() == dst_rc) {349return it->second[idx];350} else {351assert(!dst_rc.is_subdword());352assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);353return bld.copy(bld.def(dst_rc), it->second[idx]);354}355}356357if (dst_rc.is_subdword())358src = as_vgpr(ctx, src);359360if (src.bytes() == dst_rc.bytes()) {361assert(idx == 0);362return bld.copy(bld.def(dst_rc), src);363} else {364Temp dst = bld.tmp(dst_rc);365emit_extract_vector(ctx, src, idx, dst);366return dst;367}368}369370void371emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)372{373if (num_components == 1)374return;375if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())376return;377RegClass rc;378if (num_components > vec_src.size()) {379if (vec_src.type() == RegType::sgpr) {380/* should still help get_alu_src() */381emit_split_vector(ctx, vec_src, vec_src.size());382return;383}384/* sub-dword split */385rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();386} else {387rc = RegClass(vec_src.type(), vec_src.size() / num_components);388}389aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(390aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};391split->operands[0] = Operand(vec_src);392std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;393for (unsigned i = 0; i < num_components; i++) {394elems[i] = ctx->program->allocateTmp(rc);395split->definitions[i] = Definition(elems[i]);396}397ctx->block->instructions.emplace_back(std::move(split));398ctx->allocated_vec.emplace(vec_src.id(), elems);399}400401/* This vector expansion uses a mask to determine which elements in the new vector402* come from the original vector. The other elements are undefined. */403void404expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)405{406emit_split_vector(ctx, vec_src, util_bitcount(mask));407408if (vec_src == dst)409return;410411Builder bld(ctx->program, ctx->block);412if (num_components == 1) {413if (dst.type() == RegType::sgpr)414bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);415else416bld.copy(Definition(dst), vec_src);417return;418}419420unsigned component_size = dst.size() / num_components;421std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;422423aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(424aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};425vec->definitions[0] = Definition(dst);426unsigned k = 0;427for (unsigned i = 0; i < num_components; i++) {428if (mask & (1 << i)) {429Temp src =430emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));431if (dst.type() == RegType::sgpr)432src = bld.as_uniform(src);433vec->operands[i] = Operand(src);434} else {435vec->operands[i] = Operand::zero(component_size == 2 ? 8 : 4);436}437elems[i] = vec->operands[i].getTemp();438}439ctx->block->instructions.emplace_back(std::move(vec));440ctx->allocated_vec.emplace(dst.id(), elems);441}442443/* adjust misaligned small bit size loads */444void445byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)446{447Builder bld(ctx->program, ctx->block);448Operand shift;449Temp select = Temp();450if (offset.isConstant()) {451assert(offset.constantValue() && offset.constantValue() < 4);452shift = Operand::c32(offset.constantValue() * 8);453} else {454/* bit_offset = 8 * (offset & 0x3) */455Temp tmp =456bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));457select = bld.tmp(s1);458shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,459Operand::c32(3u));460}461462if (vec.size() == 1) {463bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);464} else if (vec.size() == 2) {465Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);466bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);467if (tmp == dst)468emit_split_vector(ctx, dst, 2);469else470emit_extract_vector(ctx, tmp, 0, dst);471} else if (vec.size() == 3 || vec.size() == 4) {472Temp lo = bld.tmp(s2), hi;473if (vec.size() == 3) {474/* this can happen if we use VMEM for a uniform load */475hi = bld.tmp(s1);476bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);477} else {478hi = bld.tmp(s2);479bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);480hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());481}482if (select != Temp())483hi =484bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));485lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);486Temp mid = bld.tmp(s1);487lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);488hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);489mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);490bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);491emit_split_vector(ctx, dst, 2);492}493}494495void496byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)497{498Builder bld(ctx->program, ctx->block);499if (offset.isTemp()) {500Temp tmp[4] = {vec, vec, vec, vec};501502if (vec.size() == 4) {503tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);504bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),505Definition(tmp[2]), Definition(tmp[3]), vec);506} else if (vec.size() == 3) {507tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);508bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),509Definition(tmp[2]), vec);510} else if (vec.size() == 2) {511tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];512bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);513}514for (unsigned i = 0; i < dst.size(); i++)515tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);516517vec = tmp[0];518if (dst.size() == 2)519vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);520521offset = Operand::zero();522}523524unsigned num_components = vec.bytes() / component_size;525if (vec.regClass() == dst.regClass()) {526assert(offset.constantValue() == 0);527bld.copy(Definition(dst), vec);528emit_split_vector(ctx, dst, num_components);529return;530}531532emit_split_vector(ctx, vec, num_components);533std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;534RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();535536assert(offset.constantValue() % component_size == 0);537unsigned skip = offset.constantValue() / component_size;538for (unsigned i = skip; i < num_components; i++)539elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);540541if (dst.type() == RegType::vgpr) {542/* if dst is vgpr - split the src and create a shrunk version according to the mask. */543num_components = dst.bytes() / component_size;544aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(545aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};546for (unsigned i = 0; i < num_components; i++)547create_vec->operands[i] = Operand(elems[i]);548create_vec->definitions[0] = Definition(dst);549bld.insert(std::move(create_vec));550551} else if (skip) {552/* if dst is sgpr - split the src, but move the original to sgpr. */553vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);554byte_align_scalar(ctx, vec, offset, dst);555} else {556assert(dst.size() == vec.size());557bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);558}559560ctx->allocated_vec.emplace(dst.id(), elems);561}562563Temp564bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))565{566Builder bld(ctx->program, ctx->block);567if (!dst.id())568dst = bld.tmp(bld.lm);569570assert(val.regClass() == s1);571assert(dst.regClass() == bld.lm);572573return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),574bld.scc(val));575}576577Temp578bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))579{580Builder bld(ctx->program, ctx->block);581if (!dst.id())582dst = bld.tmp(s1);583584assert(val.regClass() == bld.lm);585assert(dst.regClass() == s1);586587/* if we're currently in WQM mode, ensure that the source is also computed in WQM */588Temp tmp = bld.tmp(s1);589bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));590return emit_wqm(bld, tmp, dst);591}592593/**594* Copies the first src_bits of the input to the output Temp. Input bits at positions larger than595* src_bits and dst_bits are truncated.596*597* Sign extension may be applied using the sign_extend parameter. The position of the input sign598* bit is indicated by src_bits in this case.599*600* If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.601*/602Temp603convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,604bool sign_extend, Temp dst = Temp())605{606assert(!(sign_extend && dst_bits < src_bits) &&607"Shrinking integers is not supported for signed inputs");608609if (!dst.id()) {610if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)611dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));612else613dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());614}615616assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);617assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);618619if (dst.bytes() == src.bytes() && dst_bits < src_bits) {620/* Copy the raw value, leaving an undefined value in the upper bits for621* the caller to handle appropriately */622return bld.copy(Definition(dst), src);623} else if (dst.bytes() < src.bytes()) {624return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());625}626627Temp tmp = dst;628if (dst_bits == 64)629tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);630631if (tmp == src) {632} else if (src.regClass() == s1) {633assert(src_bits < 32);634bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),635Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));636} else if (ctx->options->chip_class >= GFX8) {637assert(src_bits < 32);638assert(src_bits != 8 || src.regClass() == v1b);639assert(src_bits != 16 || src.regClass() == v2b);640assert(dst_bits >= 16);641aco_ptr<SDWA_instruction> sdwa{642create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};643sdwa->operands[0] = Operand(src);644sdwa->definitions[0] = Definition(tmp);645if (sign_extend)646sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;647else648sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;649sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;650bld.insert(std::move(sdwa));651} else {652assert(src_bits < 32);653assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);654aco_opcode opcode = sign_extend ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;655bld.vop3(opcode, Definition(tmp), src, Operand::zero(),656Operand::c32(src_bits == 8 ? 8u : 16u));657}658659if (dst_bits == 64) {660if (sign_extend && dst.regClass() == s2) {661Temp high =662bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));663bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);664} else if (sign_extend && dst.regClass() == v2) {665Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);666bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);667} else {668bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());669}670}671672return dst;673}674675enum sgpr_extract_mode {676sgpr_extract_sext,677sgpr_extract_zext,678sgpr_extract_undef,679};680681Temp682extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)683{684Temp vec = get_ssa_temp(ctx, src->src.ssa);685unsigned src_size = src->src.ssa->bit_size;686unsigned swizzle = src->swizzle[0];687688if (vec.size() > 1) {689assert(src_size == 16);690vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);691swizzle = swizzle & 1;692}693694Builder bld(ctx->program, ctx->block);695Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;696697if (mode == sgpr_extract_undef && swizzle == 0)698bld.copy(Definition(tmp), vec);699else700bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),701Operand::c32(swizzle), Operand::c32(src_size),702Operand::c32((mode == sgpr_extract_sext)));703704if (dst.regClass() == s2)705convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);706707return dst;708}709710Temp711get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)712{713if (src.src.ssa->num_components == 1 && size == 1)714return get_ssa_temp(ctx, src.src.ssa);715716Temp vec = get_ssa_temp(ctx, src.src.ssa);717unsigned elem_size = vec.bytes() / src.src.ssa->num_components;718bool identity_swizzle = true;719720for (unsigned i = 0; identity_swizzle && i < size; i++) {721if (src.swizzle[i] != i)722identity_swizzle = false;723}724if (identity_swizzle)725return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));726727assert(elem_size > 0);728assert(vec.bytes() % elem_size == 0);729730if (elem_size < 4 && vec.type() == RegType::sgpr) {731assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);732assert(size == 1);733return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,734sgpr_extract_undef);735}736737RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()738: RegClass(vec.type(), elem_size / 4);739if (size == 1) {740return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);741} else {742assert(size <= 4);743std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;744aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(745aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};746for (unsigned i = 0; i < size; ++i) {747elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);748vec_instr->operands[i] = Operand{elems[i]};749}750Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));751vec_instr->definitions[0] = Definition(dst);752ctx->block->instructions.emplace_back(std::move(vec_instr));753ctx->allocated_vec.emplace(dst.id(), elems);754return dst;755}756}757758Temp759get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)760{761/* returns v2b or v1 for vop3p usage.762* The source expects exactly 2 16bit components763* which are within the same dword764*/765assert(src.src.ssa->bit_size == 16);766assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);767768Temp tmp = get_ssa_temp(ctx, src.src.ssa);769if (tmp.size() == 1)770return tmp;771772/* the size is larger than 1 dword: check the swizzle */773unsigned dword = src.swizzle[0] >> 1;774775/* extract a full dword if possible */776if (tmp.bytes() >= (dword + 1) * 4) {777return emit_extract_vector(ctx, tmp, dword, RegClass(tmp.type(), 1));778} else {779/* This must be a swizzled access to %a.zz where %a is v6b */780assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);781assert(tmp.regClass() == v6b && dword == 1);782return emit_extract_vector(ctx, tmp, dword * 2, v2b);783}784}785786uint32_t787get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)788{789nir_ssa_scalar scalar =790nir_ssa_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};791return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);792}793794Temp795convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)796{797if (ptr.size() == 2)798return ptr;799Builder bld(ctx->program, ctx->block);800if (ptr.type() == RegType::vgpr && !non_uniform)801ptr = bld.as_uniform(ptr);802return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,803Operand::c32((unsigned)ctx->options->address32_hi));804}805806void807emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,808bool writes_scc, uint8_t uses_ub = 0)809{810aco_ptr<SOP2_instruction> sop2{811create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};812sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));813sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));814sop2->definitions[0] = Definition(dst);815if (instr->no_unsigned_wrap)816sop2->definitions[0].setNUW(true);817if (writes_scc)818sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);819820for (int i = 0; i < 2; i++) {821if (uses_ub & (1 << i)) {822uint32_t src_ub = get_alu_src_ub(ctx, instr, i);823if (src_ub <= 0xffff)824sop2->operands[i].set16bit(true);825else if (src_ub <= 0xffffff)826sop2->operands[i].set24bit(true);827}828}829830ctx->block->instructions.emplace_back(std::move(sop2));831}832833void834emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,835bool commutative, bool swap_srcs = false, bool flush_denorms = false,836bool nuw = false, uint8_t uses_ub = 0)837{838Builder bld(ctx->program, ctx->block);839bld.is_precise = instr->exact;840841Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);842Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);843if (src1.type() == RegType::sgpr) {844if (commutative && src0.type() == RegType::vgpr) {845Temp t = src0;846src0 = src1;847src1 = t;848} else {849src1 = as_vgpr(ctx, src1);850}851}852853Operand op0(src0);854Operand op1(src1);855856for (int i = 0; i < 2; i++) {857if (uses_ub & (1 << i)) {858uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);859if (src_ub <= 0xffff)860bld.set16bit(i ? op1 : op0);861else if (src_ub <= 0xffffff)862bld.set24bit(i ? op1 : op0);863}864}865866if (flush_denorms && ctx->program->chip_class < GFX9) {867assert(dst.size() == 1);868Temp tmp = bld.vop2(op, bld.def(v1), op0, op1);869bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);870} else {871if (nuw) {872bld.nuw().vop2(op, Definition(dst), op0, op1);873} else {874bld.vop2(op, Definition(dst), op0, op1);875}876}877}878879void880emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)881{882Builder bld(ctx->program, ctx->block);883bld.is_precise = instr->exact;884885Temp src0 = get_alu_src(ctx, instr->src[0]);886Temp src1 = get_alu_src(ctx, instr->src[1]);887888if (src1.type() == RegType::sgpr) {889assert(src0.type() == RegType::vgpr);890std::swap(src0, src1);891}892893Temp src00 = bld.tmp(src0.type(), 1);894Temp src01 = bld.tmp(src0.type(), 1);895bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);896Temp src10 = bld.tmp(v1);897Temp src11 = bld.tmp(v1);898bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);899Temp lo = bld.vop2(op, bld.def(v1), src00, src10);900Temp hi = bld.vop2(op, bld.def(v1), src01, src11);901bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);902}903904void905emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,906bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)907{908assert(num_sources == 2 || num_sources == 3);909Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};910bool has_sgpr = false;911for (unsigned i = 0; i < num_sources; i++) {912src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 1 - i : i]);913if (has_sgpr)914src[i] = as_vgpr(ctx, src[i]);915else916has_sgpr = src[i].type() == RegType::sgpr;917}918919Builder bld(ctx->program, ctx->block);920bld.is_precise = instr->exact;921if (flush_denorms && ctx->program->chip_class < GFX9) {922Temp tmp;923if (num_sources == 3)924tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);925else926tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);927if (dst.size() == 1)928bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);929else930bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp);931} else if (num_sources == 3) {932bld.vop3(op, Definition(dst), src[0], src[1], src[2]);933} else {934bld.vop3(op, Definition(dst), src[0], src[1]);935}936}937938Builder::Result939emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,940bool swap_srcs = false)941{942Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);943Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);944if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)945src1 = as_vgpr(ctx, src1);946assert(instr->dest.dest.ssa.num_components == 2);947948/* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */949unsigned opsel_lo =950(instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);951unsigned opsel_hi =952(instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);953954Builder bld(ctx->program, ctx->block);955bld.is_precise = instr->exact;956Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);957emit_split_vector(ctx, dst, 2);958return res;959}960961void962emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)963{964Builder bld(ctx->program, ctx->block);965bld.is_precise = instr->exact;966if (dst.type() == RegType::sgpr)967bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),968bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));969else970bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));971}972973void974emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)975{976Temp src0 = get_alu_src(ctx, instr->src[0]);977Temp src1 = get_alu_src(ctx, instr->src[1]);978assert(src0.size() == src1.size());979980aco_ptr<Instruction> vopc;981if (src1.type() == RegType::sgpr) {982if (src0.type() == RegType::vgpr) {983/* to swap the operands, we might also have to change the opcode */984switch (op) {985case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break;986case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break;987case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break;988case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break;989case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break;990case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break;991case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break;992case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break;993case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break;994case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break;995case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break;996case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break;997case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break;998case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break;999case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break;1000case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break;1001case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break;1002case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break;1003default: /* eq and ne are commutative */ break;1004}1005Temp t = src0;1006src0 = src1;1007src1 = t;1008} else {1009src1 = as_vgpr(ctx, src1);1010}1011}10121013Builder bld(ctx->program, ctx->block);1014bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);1015}10161017void1018emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)1019{1020Temp src0 = get_alu_src(ctx, instr->src[0]);1021Temp src1 = get_alu_src(ctx, instr->src[1]);1022Builder bld(ctx->program, ctx->block);10231024assert(dst.regClass() == bld.lm);1025assert(src0.type() == RegType::sgpr);1026assert(src1.type() == RegType::sgpr);1027assert(src0.regClass() == src1.regClass());10281029/* Emit the SALU comparison instruction */1030Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);1031/* Turn the result into a per-lane bool */1032bool_to_vector_condition(ctx, cmp, dst);1033}10341035void1036emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,1037aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes,1038aco_opcode s64_op = aco_opcode::num_opcodes)1039{1040aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op1041: instr->src[0].src.ssa->bit_size == 32 ? s32_op1042: aco_opcode::num_opcodes;1043aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op1044: instr->src[0].src.ssa->bit_size == 32 ? v32_op1045: v16_op;1046bool use_valu = s_op == aco_opcode::num_opcodes || nir_dest_is_divergent(instr->dest.dest) ||1047get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||1048get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;1049aco_opcode op = use_valu ? v_op : s_op;1050assert(op != aco_opcode::num_opcodes);1051assert(dst.regClass() == ctx->program->lane_mask);10521053if (use_valu)1054emit_vopc_instruction(ctx, instr, op, dst);1055else1056emit_sopc_instruction(ctx, instr, op, dst);1057}10581059void1060emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,1061Temp dst)1062{1063Builder bld(ctx->program, ctx->block);1064Temp src0 = get_alu_src(ctx, instr->src[0]);1065Temp src1 = get_alu_src(ctx, instr->src[1]);10661067assert(dst.regClass() == bld.lm);1068assert(src0.regClass() == bld.lm);1069assert(src1.regClass() == bld.lm);10701071bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);1072}10731074void1075emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)1076{1077Builder bld(ctx->program, ctx->block);1078Temp cond = get_alu_src(ctx, instr->src[0]);1079Temp then = get_alu_src(ctx, instr->src[1]);1080Temp els = get_alu_src(ctx, instr->src[2]);10811082assert(cond.regClass() == bld.lm);10831084if (dst.type() == RegType::vgpr) {1085aco_ptr<Instruction> bcsel;1086if (dst.size() == 1) {1087then = as_vgpr(ctx, then);1088els = as_vgpr(ctx, els);10891090bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);1091} else if (dst.size() == 2) {1092Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);1093bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);1094Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);1095bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);10961097Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);1098Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);10991100bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);1101} else {1102isel_err(&instr->instr, "Unimplemented NIR instr bit size");1103}1104return;1105}11061107if (instr->dest.dest.ssa.bit_size == 1) {1108assert(dst.regClass() == bld.lm);1109assert(then.regClass() == bld.lm);1110assert(els.regClass() == bld.lm);1111}11121113if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */1114if (dst.regClass() == s1 || dst.regClass() == s2) {1115assert((then.regClass() == s1 || then.regClass() == s2) &&1116els.regClass() == then.regClass());1117assert(dst.size() == then.size());1118aco_opcode op =1119dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;1120bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));1121} else {1122isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");1123}1124return;1125}11261127/* divergent boolean bcsel1128* this implements bcsel on bools: dst = s0 ? s1 : s21129* are going to be: dst = (s0 & s1) | (~s0 & s2) */1130assert(instr->dest.dest.ssa.bit_size == 1);11311132if (cond.id() != then.id())1133then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);11341135if (cond.id() == els.id())1136bld.copy(Definition(dst), then);1137else1138bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,1139bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));1140}11411142void1143emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op,1144uint32_t undo)1145{1146/* multiply by 16777216 to handle denormals */1147Temp is_denormal =1148bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), as_vgpr(ctx, val),1149bld.copy(bld.def(v1), Operand::c32((1u << 7) | (1u << 4))));1150Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val);1151scaled = bld.vop1(op, bld.def(v1), scaled);1152scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled);11531154Temp not_scaled = bld.vop1(op, bld.def(v1), val);11551156bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);1157}11581159void1160emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)1161{1162if (ctx->block->fp_mode.denorm32 == 0) {1163bld.vop1(aco_opcode::v_rcp_f32, dst, val);1164return;1165}11661167emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);1168}11691170void1171emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)1172{1173if (ctx->block->fp_mode.denorm32 == 0) {1174bld.vop1(aco_opcode::v_rsq_f32, dst, val);1175return;1176}11771178emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);1179}11801181void1182emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)1183{1184if (ctx->block->fp_mode.denorm32 == 0) {1185bld.vop1(aco_opcode::v_sqrt_f32, dst, val);1186return;1187}11881189emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);1190}11911192void1193emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)1194{1195if (ctx->block->fp_mode.denorm32 == 0) {1196bld.vop1(aco_opcode::v_log_f32, dst, val);1197return;1198}11991200emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);1201}12021203Temp1204emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)1205{1206if (ctx->options->chip_class >= GFX7)1207return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);12081209/* GFX6 doesn't support V_TRUNC_F64, lower it. */1210/* TODO: create more efficient code! */1211if (val.type() == RegType::sgpr)1212val = as_vgpr(ctx, val);12131214/* Split the input value. */1215Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);1216bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);12171218/* Extract the exponent and compute the unbiased value. */1219Temp exponent =1220bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));1221exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));12221223/* Extract the fractional part. */1224Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),1225Operand::c32(0x000fffffu));1226fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);12271228Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);1229bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),1230fract_mask);12311232Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);1233Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);1234fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);1235tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);1236fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);12371238/* Get the sign bit. */1239Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);12401241/* Decide the operation to apply depending on the unbiased exponent. */1242Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent,1243Operand::zero());1244Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,1245bld.copy(bld.def(v1), Operand::zero()), exp_lt0);1246Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);1247Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));1248dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);1249dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);12501251return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);1252}12531254Temp1255emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)1256{1257if (ctx->options->chip_class >= GFX7)1258return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);12591260/* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually1261* lowered at NIR level for precision reasons). */1262Temp src0 = as_vgpr(ctx, val);12631264Temp mask = bld.copy(bld.def(s1), Operand::c32(3u)); /* isnan */1265Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),1266Operand::c32(0x3fefffffu));12671268Temp isnan =1269bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);1270Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);1271Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);12721273Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);1274bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);1275Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);1276bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);12771278Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);1279Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);12801281Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);12821283Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);1284add->vop3().neg[1] = true;12851286return add->definitions[0].getTemp();1287}12881289Temp1290uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)1291{1292if (bld.program->chip_class < GFX8) {1293Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);1294return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),1295add.def(1).getTemp());1296}12971298Builder::Result add(NULL);1299if (bld.program->chip_class >= GFX9) {1300add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);1301} else {1302add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.hint_vcc(bld.def(bld.lm)), src0, src1);1303}1304add.instr->vop3().clamp = 1;1305return dst.getTemp();1306}13071308void1309visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)1310{1311if (!instr->dest.dest.is_ssa) {1312isel_err(&instr->instr, "nir alu dst not in ssa");1313abort();1314}1315Builder bld(ctx->program, ctx->block);1316bld.is_precise = instr->exact;1317Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);1318switch (instr->op) {1319case nir_op_vec2:1320case nir_op_vec3:1321case nir_op_vec4:1322case nir_op_vec5: {1323std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;1324unsigned num = instr->dest.dest.ssa.num_components;1325for (unsigned i = 0; i < num; ++i)1326elems[i] = get_alu_src(ctx, instr->src[i]);13271328if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {1329aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(1330aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};1331RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);1332for (unsigned i = 0; i < num; ++i) {1333if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())1334elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);1335vec->operands[i] = Operand{elems[i]};1336}1337vec->definitions[0] = Definition(dst);1338ctx->block->instructions.emplace_back(std::move(vec));1339ctx->allocated_vec.emplace(dst.id(), elems);1340} else {1341bool use_s_pack = ctx->program->chip_class >= GFX9;1342Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->dest.dest.ssa.bit_size) - 1));13431344std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;1345uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};1346for (unsigned i = 0; i < num; i++) {1347unsigned packed_size = use_s_pack ? 16 : 32;1348unsigned idx = i * instr->dest.dest.ssa.bit_size / packed_size;1349unsigned offset = i * instr->dest.dest.ssa.bit_size % packed_size;1350if (nir_src_is_const(instr->src[i].src)) {1351const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;1352continue;1353}13541355if (offset != packed_size - instr->dest.dest.ssa.bit_size)1356elems[i] =1357bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);13581359if (offset)1360elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],1361Operand::c32(offset));13621363if (packed[idx].id())1364packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],1365packed[idx]);1366else1367packed[idx] = elems[i];1368}13691370if (use_s_pack) {1371for (unsigned i = 0; i < dst.size(); i++) {1372bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();13731374if (packed[i * 2].id() && packed[i * 2 + 1].id())1375packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],1376packed[i * 2 + 1]);1377else if (packed[i * 2 + 1].id())1378packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),1379Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);1380else if (packed[i * 2].id())1381packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],1382Operand::c32(const_vals[i * 2 + 1]));13831384if (same)1385const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);1386else1387const_vals[i] = 0;1388}1389}13901391for (unsigned i = 0; i < dst.size(); i++) {1392if (const_vals[i] && packed[i].id())1393packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),1394Operand::c32(const_vals[i]), packed[i]);1395else if (!packed[i].id())1396packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));1397}13981399if (dst.size() == 1)1400bld.copy(Definition(dst), packed[0]);1401else if (dst.size() == 2)1402bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1]);1403else1404bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1],1405packed[2]);1406}1407break;1408}1409case nir_op_mov: {1410Temp src = get_alu_src(ctx, instr->src[0]);1411if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {1412/* use size() instead of bytes() for 8/16-bit */1413assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");1414bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);1415} else {1416assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");1417bld.copy(Definition(dst), src);1418}1419break;1420}1421case nir_op_inot: {1422Temp src = get_alu_src(ctx, instr->src[0]);1423if (instr->dest.dest.ssa.bit_size == 1) {1424assert(src.regClass() == bld.lm);1425assert(dst.regClass() == bld.lm);1426/* Don't use s_andn2 here, this allows the optimizer to make a better decision */1427Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);1428bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));1429} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {1430emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);1431} else if (dst.regClass() == v2) {1432Temp lo = bld.tmp(v1), hi = bld.tmp(v1);1433bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);1434lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);1435hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);1436bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);1437} else if (dst.type() == RegType::sgpr) {1438aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;1439bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);1440} else {1441isel_err(&instr->instr, "Unimplemented NIR instr bit size");1442}1443break;1444}1445case nir_op_iabs: {1446Temp src = get_alu_src(ctx, instr->src[0]);1447if (dst.regClass() == s1) {1448bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);1449} else if (dst.regClass() == v1) {1450bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,1451bld.vsub32(bld.def(v1), Operand::zero(), src));1452} else {1453isel_err(&instr->instr, "Unimplemented NIR instr bit size");1454}1455break;1456}1457case nir_op_isign: {1458Temp src = get_alu_src(ctx, instr->src[0]);1459if (dst.regClass() == s1) {1460Temp tmp =1461bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));1462bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));1463} else if (dst.regClass() == s2) {1464Temp neg =1465bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));1466Temp neqz;1467if (ctx->program->chip_class >= GFX8)1468neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());1469else1470neqz =1471bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())1472.def(1)1473.getTemp();1474/* SCC gets zero-extended to 64 bit */1475bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));1476} else if (dst.regClass() == v1) {1477bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));1478} else if (dst.regClass() == v2) {1479Temp upper = emit_extract_vector(ctx, src, 1, v1);1480Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);1481Temp gtz =1482bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);1483Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);1484upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);1485bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);1486} else {1487isel_err(&instr->instr, "Unimplemented NIR instr bit size");1488}1489break;1490}1491case nir_op_imax: {1492if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {1493emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);1494} else if (dst.regClass() == v2b) {1495emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);1496} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {1497emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);1498} else if (dst.regClass() == v1) {1499emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);1500} else if (dst.regClass() == s1) {1501emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);1502} else {1503isel_err(&instr->instr, "Unimplemented NIR instr bit size");1504}1505break;1506}1507case nir_op_umax: {1508if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {1509emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);1510} else if (dst.regClass() == v2b) {1511emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);1512} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {1513emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);1514} else if (dst.regClass() == v1) {1515emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);1516} else if (dst.regClass() == s1) {1517emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);1518} else {1519isel_err(&instr->instr, "Unimplemented NIR instr bit size");1520}1521break;1522}1523case nir_op_imin: {1524if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {1525emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);1526} else if (dst.regClass() == v2b) {1527emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);1528} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {1529emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);1530} else if (dst.regClass() == v1) {1531emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);1532} else if (dst.regClass() == s1) {1533emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);1534} else {1535isel_err(&instr->instr, "Unimplemented NIR instr bit size");1536}1537break;1538}1539case nir_op_umin: {1540if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {1541emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);1542} else if (dst.regClass() == v2b) {1543emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);1544} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {1545emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);1546} else if (dst.regClass() == v1) {1547emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);1548} else if (dst.regClass() == s1) {1549emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);1550} else {1551isel_err(&instr->instr, "Unimplemented NIR instr bit size");1552}1553break;1554}1555case nir_op_ior: {1556if (instr->dest.dest.ssa.bit_size == 1) {1557emit_boolean_logic(ctx, instr, Builder::s_or, dst);1558} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {1559emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);1560} else if (dst.regClass() == v2) {1561emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);1562} else if (dst.regClass() == s1) {1563emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);1564} else if (dst.regClass() == s2) {1565emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);1566} else {1567isel_err(&instr->instr, "Unimplemented NIR instr bit size");1568}1569break;1570}1571case nir_op_iand: {1572if (instr->dest.dest.ssa.bit_size == 1) {1573emit_boolean_logic(ctx, instr, Builder::s_and, dst);1574} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {1575emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);1576} else if (dst.regClass() == v2) {1577emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);1578} else if (dst.regClass() == s1) {1579emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);1580} else if (dst.regClass() == s2) {1581emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);1582} else {1583isel_err(&instr->instr, "Unimplemented NIR instr bit size");1584}1585break;1586}1587case nir_op_ixor: {1588if (instr->dest.dest.ssa.bit_size == 1) {1589emit_boolean_logic(ctx, instr, Builder::s_xor, dst);1590} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {1591emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);1592} else if (dst.regClass() == v2) {1593emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);1594} else if (dst.regClass() == s1) {1595emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);1596} else if (dst.regClass() == s2) {1597emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);1598} else {1599isel_err(&instr->instr, "Unimplemented NIR instr bit size");1600}1601break;1602}1603case nir_op_ushr: {1604if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {1605emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);1606} else if (dst.regClass() == v2b) {1607emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);1608} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {1609emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);1610} else if (dst.regClass() == v1) {1611emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);1612} else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {1613bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),1614get_alu_src(ctx, instr->src[0]));1615} else if (dst.regClass() == v2) {1616emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);1617} else if (dst.regClass() == s2) {1618emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);1619} else if (dst.regClass() == s1) {1620emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);1621} else {1622isel_err(&instr->instr, "Unimplemented NIR instr bit size");1623}1624break;1625}1626case nir_op_ishl: {1627if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {1628emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);1629} else if (dst.regClass() == v2b) {1630emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);1631} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {1632emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);1633} else if (dst.regClass() == v1) {1634emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,1635false, 1);1636} else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {1637bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),1638get_alu_src(ctx, instr->src[0]));1639} else if (dst.regClass() == v2) {1640emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);1641} else if (dst.regClass() == s1) {1642emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);1643} else if (dst.regClass() == s2) {1644emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);1645} else {1646isel_err(&instr->instr, "Unimplemented NIR instr bit size");1647}1648break;1649}1650case nir_op_ishr: {1651if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {1652emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);1653} else if (dst.regClass() == v2b) {1654emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);1655} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {1656emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);1657} else if (dst.regClass() == v1) {1658emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);1659} else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {1660bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),1661get_alu_src(ctx, instr->src[0]));1662} else if (dst.regClass() == v2) {1663emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);1664} else if (dst.regClass() == s1) {1665emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);1666} else if (dst.regClass() == s2) {1667emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);1668} else {1669isel_err(&instr->instr, "Unimplemented NIR instr bit size");1670}1671break;1672}1673case nir_op_find_lsb: {1674Temp src = get_alu_src(ctx, instr->src[0]);1675if (src.regClass() == s1) {1676bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);1677} else if (src.regClass() == v1) {1678emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);1679} else if (src.regClass() == s2) {1680bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);1681} else {1682isel_err(&instr->instr, "Unimplemented NIR instr bit size");1683}1684break;1685}1686case nir_op_ufind_msb:1687case nir_op_ifind_msb: {1688Temp src = get_alu_src(ctx, instr->src[0]);1689if (src.regClass() == s1 || src.regClass() == s2) {1690aco_opcode op = src.regClass() == s21691? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b641692: aco_opcode::s_flbit_i32_i64)1693: (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b321694: aco_opcode::s_flbit_i32);1695Temp msb_rev = bld.sop1(op, bld.def(s1), src);16961697Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),1698Operand::c32(src.size() * 32u - 1u), msb_rev);1699Temp msb = sub.def(0).getTemp();1700Temp carry = sub.def(1).getTemp();17011702bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,1703bld.scc(carry));1704} else if (src.regClass() == v1) {1705aco_opcode op =1706instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;1707Temp msb_rev = bld.tmp(v1);1708emit_vop1_instruction(ctx, instr, op, msb_rev);1709Temp msb = bld.tmp(v1);1710Temp carry =1711bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();1712bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);1713} else if (src.regClass() == v2) {1714aco_opcode op =1715instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;17161717Temp lo = bld.tmp(v1), hi = bld.tmp(v1);1718bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);17191720lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)),1721bld.vop1(op, bld.def(v1), lo));1722hi = bld.vop1(op, bld.def(v1), hi);1723Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi);17241725Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi);17261727Temp msb = bld.tmp(v1);1728Temp carry =1729bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();1730bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);1731} else {1732isel_err(&instr->instr, "Unimplemented NIR instr bit size");1733}1734break;1735}1736case nir_op_bitfield_reverse: {1737if (dst.regClass() == s1) {1738bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));1739} else if (dst.regClass() == v1) {1740bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));1741} else {1742isel_err(&instr->instr, "Unimplemented NIR instr bit size");1743}1744break;1745}1746case nir_op_iadd: {1747if (dst.regClass() == s1) {1748emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);1749break;1750} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {1751emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);1752break;1753} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {1754emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);1755break;1756} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {1757emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);1758break;1759}17601761Temp src0 = get_alu_src(ctx, instr->src[0]);1762Temp src1 = get_alu_src(ctx, instr->src[1]);1763if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {1764bld.vadd32(Definition(dst), Operand(src0), Operand(src1));1765break;1766}17671768assert(src0.size() == 2 && src1.size() == 2);1769Temp src00 = bld.tmp(src0.type(), 1);1770Temp src01 = bld.tmp(dst.type(), 1);1771bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);1772Temp src10 = bld.tmp(src1.type(), 1);1773Temp src11 = bld.tmp(dst.type(), 1);1774bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);17751776if (dst.regClass() == s2) {1777Temp carry = bld.tmp(s1);1778Temp dst0 =1779bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);1780Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,1781bld.scc(carry));1782bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);1783} else if (dst.regClass() == v2) {1784Temp dst0 = bld.tmp(v1);1785Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();1786Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);1787bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);1788} else {1789isel_err(&instr->instr, "Unimplemented NIR instr bit size");1790}1791break;1792}1793case nir_op_uadd_sat: {1794Temp src0 = get_alu_src(ctx, instr->src[0]);1795Temp src1 = get_alu_src(ctx, instr->src[1]);1796if (dst.regClass() == s1) {1797Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);1798bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);1799bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,1800bld.scc(carry));1801} else if (dst.regClass() == v2b) {1802Instruction* add_instr;1803if (ctx->program->chip_class >= GFX10) {1804add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;1805} else {1806if (src1.type() == RegType::sgpr)1807std::swap(src0, src1);1808add_instr =1809bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;1810}1811add_instr->vop3().clamp = 1;1812} else if (dst.regClass() == v1) {1813uadd32_sat(bld, Definition(dst), src0, src1);1814} else {1815isel_err(&instr->instr, "Unimplemented NIR instr bit size");1816}1817break;1818}1819case nir_op_uadd_carry: {1820Temp src0 = get_alu_src(ctx, instr->src[0]);1821Temp src1 = get_alu_src(ctx, instr->src[1]);1822if (dst.regClass() == s1) {1823bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);1824break;1825}1826if (dst.regClass() == v1) {1827Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();1828bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),1829carry);1830break;1831}18321833Temp src00 = bld.tmp(src0.type(), 1);1834Temp src01 = bld.tmp(dst.type(), 1);1835bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);1836Temp src10 = bld.tmp(src1.type(), 1);1837Temp src11 = bld.tmp(dst.type(), 1);1838bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);1839if (dst.regClass() == s2) {1840Temp carry = bld.tmp(s1);1841bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);1842carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,1843bld.scc(carry))1844.def(1)1845.getTemp();1846bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());1847} else if (dst.regClass() == v2) {1848Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();1849carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();1850carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),1851Operand::c32(1u), carry);1852bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());1853} else {1854isel_err(&instr->instr, "Unimplemented NIR instr bit size");1855}1856break;1857}1858case nir_op_isub: {1859if (dst.regClass() == s1) {1860emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);1861break;1862} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {1863emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);1864break;1865}18661867Temp src0 = get_alu_src(ctx, instr->src[0]);1868Temp src1 = get_alu_src(ctx, instr->src[1]);1869if (dst.regClass() == v1) {1870bld.vsub32(Definition(dst), src0, src1);1871break;1872} else if (dst.bytes() <= 2) {1873if (ctx->program->chip_class >= GFX10)1874bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);1875else if (src1.type() == RegType::sgpr)1876bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));1877else if (ctx->program->chip_class >= GFX8)1878bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));1879else1880bld.vsub32(Definition(dst), src0, src1);1881break;1882}18831884Temp src00 = bld.tmp(src0.type(), 1);1885Temp src01 = bld.tmp(dst.type(), 1);1886bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);1887Temp src10 = bld.tmp(src1.type(), 1);1888Temp src11 = bld.tmp(dst.type(), 1);1889bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);1890if (dst.regClass() == s2) {1891Temp borrow = bld.tmp(s1);1892Temp dst0 =1893bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);1894Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,1895bld.scc(borrow));1896bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);1897} else if (dst.regClass() == v2) {1898Temp lower = bld.tmp(v1);1899Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();1900Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);1901bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);1902} else {1903isel_err(&instr->instr, "Unimplemented NIR instr bit size");1904}1905break;1906}1907case nir_op_usub_borrow: {1908Temp src0 = get_alu_src(ctx, instr->src[0]);1909Temp src1 = get_alu_src(ctx, instr->src[1]);1910if (dst.regClass() == s1) {1911bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);1912break;1913} else if (dst.regClass() == v1) {1914Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();1915bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),1916borrow);1917break;1918}19191920Temp src00 = bld.tmp(src0.type(), 1);1921Temp src01 = bld.tmp(dst.type(), 1);1922bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);1923Temp src10 = bld.tmp(src1.type(), 1);1924Temp src11 = bld.tmp(dst.type(), 1);1925bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);1926if (dst.regClass() == s2) {1927Temp borrow = bld.tmp(s1);1928bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);1929borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,1930bld.scc(borrow))1931.def(1)1932.getTemp();1933bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());1934} else if (dst.regClass() == v2) {1935Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();1936borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();1937borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),1938Operand::c32(1u), borrow);1939bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());1940} else {1941isel_err(&instr->instr, "Unimplemented NIR instr bit size");1942}1943break;1944}1945case nir_op_imul: {1946if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {1947emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);1948} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {1949emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);1950} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {1951emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);1952} else if (dst.type() == RegType::vgpr) {1953Temp src0 = get_alu_src(ctx, instr->src[0]);1954Temp src1 = get_alu_src(ctx, instr->src[1]);1955uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);1956uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);19571958if (src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff &&1959(ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9)) {1960/* If the 16-bit multiplication can't overflow, emit v_mul_lo_u161961* but only on GFX8-9 because GFX10 doesn't zero the upper 161962* bits.1963*/1964emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true /* commutative */,1965false, false, true /* nuw */);1966} else if (src0_ub <= 0xffff && src1_ub <= 0xffff && ctx->options->chip_class >= GFX9) {1967/* Initialize the accumulator to 0 to allow further combinations1968* in the optimizer.1969*/1970Operand op0(src0);1971Operand op1(src1);1972bld.vop3(aco_opcode::v_mad_u32_u16, Definition(dst), bld.set16bit(op0),1973bld.set16bit(op1), Operand::zero());1974} else if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {1975emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, true);1976} else if (nir_src_is_const(instr->src[0].src)) {1977bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),1978nir_src_as_uint(instr->src[0].src), false);1979} else if (nir_src_is_const(instr->src[1].src)) {1980bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),1981nir_src_as_uint(instr->src[1].src), false);1982} else {1983emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);1984}1985} else if (dst.regClass() == s1) {1986emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);1987} else {1988isel_err(&instr->instr, "Unimplemented NIR instr bit size");1989}1990break;1991}1992case nir_op_umul_high: {1993if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {1994emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);1995} else if (dst.bytes() == 4) {1996uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);1997uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);19981999Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;2000if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {2001emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);2002} else {2003emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);2004}20052006if (dst.regClass() == s1)2007bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);2008} else {2009isel_err(&instr->instr, "Unimplemented NIR instr bit size");2010}2011break;2012}2013case nir_op_imul_high: {2014if (dst.regClass() == v1) {2015emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);2016} else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {2017emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);2018} else if (dst.regClass() == s1) {2019Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),2020as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));2021bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);2022} else {2023isel_err(&instr->instr, "Unimplemented NIR instr bit size");2024}2025break;2026}2027case nir_op_fmul: {2028if (dst.regClass() == v2b) {2029emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);2030} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {2031emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);2032} else if (dst.regClass() == v1) {2033emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);2034} else if (dst.regClass() == v2) {2035emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst);2036} else {2037isel_err(&instr->instr, "Unimplemented NIR instr bit size");2038}2039break;2040}2041case nir_op_fadd: {2042if (dst.regClass() == v2b) {2043emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);2044} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {2045emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);2046} else if (dst.regClass() == v1) {2047emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);2048} else if (dst.regClass() == v2) {2049emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst);2050} else {2051isel_err(&instr->instr, "Unimplemented NIR instr bit size");2052}2053break;2054}2055case nir_op_fsub: {2056if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {2057Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);2058VOP3P_instruction& sub = add->vop3p();2059sub.neg_lo[1] = true;2060sub.neg_hi[1] = true;2061break;2062}20632064Temp src0 = get_alu_src(ctx, instr->src[0]);2065Temp src1 = get_alu_src(ctx, instr->src[1]);2066if (dst.regClass() == v2b) {2067if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)2068emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);2069else2070emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);2071} else if (dst.regClass() == v1) {2072if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)2073emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);2074else2075emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);2076} else if (dst.regClass() == v2) {2077Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0),2078as_vgpr(ctx, src1));2079add->vop3().neg[1] = true;2080} else {2081isel_err(&instr->instr, "Unimplemented NIR instr bit size");2082}2083break;2084}2085case nir_op_fmax: {2086if (dst.regClass() == v2b) {2087// TODO: check fp_mode.must_flush_denorms16_642088emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);2089} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {2090emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);2091} else if (dst.regClass() == v1) {2092emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,2093ctx->block->fp_mode.must_flush_denorms32);2094} else if (dst.regClass() == v2) {2095emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst,2096ctx->block->fp_mode.must_flush_denorms16_64);2097} else {2098isel_err(&instr->instr, "Unimplemented NIR instr bit size");2099}2100break;2101}2102case nir_op_fmin: {2103if (dst.regClass() == v2b) {2104// TODO: check fp_mode.must_flush_denorms16_642105emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);2106} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {2107emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);2108} else if (dst.regClass() == v1) {2109emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,2110ctx->block->fp_mode.must_flush_denorms32);2111} else if (dst.regClass() == v2) {2112emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst,2113ctx->block->fp_mode.must_flush_denorms16_64);2114} else {2115isel_err(&instr->instr, "Unimplemented NIR instr bit size");2116}2117break;2118}2119case nir_op_cube_face_coord_amd: {2120Temp in = get_alu_src(ctx, instr->src[0], 3);2121Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),2122emit_extract_vector(ctx, in, 2, v1)};2123Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);2124ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);2125Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);2126Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);2127sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),2128bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma));2129tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),2130bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma));2131bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);2132break;2133}2134case nir_op_cube_face_index_amd: {2135Temp in = get_alu_src(ctx, instr->src[0], 3);2136Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),2137emit_extract_vector(ctx, in, 2, v1)};2138bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);2139break;2140}2141case nir_op_bcsel: {2142emit_bcsel(ctx, instr, dst);2143break;2144}2145case nir_op_frsq: {2146if (dst.regClass() == v2b) {2147emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);2148} else if (dst.regClass() == v1) {2149Temp src = get_alu_src(ctx, instr->src[0]);2150emit_rsq(ctx, bld, Definition(dst), src);2151} else if (dst.regClass() == v2) {2152/* Lowered at NIR level for precision reasons. */2153emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);2154} else {2155isel_err(&instr->instr, "Unimplemented NIR instr bit size");2156}2157break;2158}2159case nir_op_fneg: {2160if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {2161Temp src = get_alu_src_vop3p(ctx, instr->src[0]);2162bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0xBC00),2163instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);2164emit_split_vector(ctx, dst, 2);2165break;2166}2167Temp src = get_alu_src(ctx, instr->src[0]);2168if (dst.regClass() == v2b) {2169bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));2170} else if (dst.regClass() == v1) {2171bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),2172as_vgpr(ctx, src));2173} else if (dst.regClass() == v2) {2174if (ctx->block->fp_mode.must_flush_denorms16_64)2175src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),2176as_vgpr(ctx, src));2177Temp upper = bld.tmp(v1), lower = bld.tmp(v1);2178bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);2179upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);2180bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);2181} else {2182isel_err(&instr->instr, "Unimplemented NIR instr bit size");2183}2184break;2185}2186case nir_op_fabs: {2187Temp src = get_alu_src(ctx, instr->src[0]);2188if (dst.regClass() == v2b) {2189Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),2190Operand::c16(0x3c00), as_vgpr(ctx, src))2191.instr;2192mul->vop3().abs[1] = true;2193} else if (dst.regClass() == v1) {2194Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),2195Operand::c32(0x3f800000u), as_vgpr(ctx, src))2196.instr;2197mul->vop3().abs[1] = true;2198} else if (dst.regClass() == v2) {2199if (ctx->block->fp_mode.must_flush_denorms16_64)2200src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),2201as_vgpr(ctx, src));2202Temp upper = bld.tmp(v1), lower = bld.tmp(v1);2203bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);2204upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);2205bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);2206} else {2207isel_err(&instr->instr, "Unimplemented NIR instr bit size");2208}2209break;2210}2211case nir_op_fsat: {2212if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {2213Temp src = get_alu_src_vop3p(ctx, instr->src[0]);2214Instruction* vop3p =2215bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),2216instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);2217vop3p->vop3p().clamp = true;2218emit_split_vector(ctx, dst, 2);2219break;2220}2221Temp src = get_alu_src(ctx, instr->src[0]);2222if (dst.regClass() == v2b) {2223bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),2224src);2225} else if (dst.regClass() == v1) {2226bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),2227Operand::c32(0x3f800000u), src);2228/* apparently, it is not necessary to flush denorms if this instruction is used with these2229* operands */2230// TODO: confirm that this holds under any circumstances2231} else if (dst.regClass() == v2) {2232Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero());2233add->vop3().clamp = true;2234} else {2235isel_err(&instr->instr, "Unimplemented NIR instr bit size");2236}2237break;2238}2239case nir_op_flog2: {2240if (dst.regClass() == v2b) {2241emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);2242} else if (dst.regClass() == v1) {2243Temp src = get_alu_src(ctx, instr->src[0]);2244emit_log2(ctx, bld, Definition(dst), src);2245} else {2246isel_err(&instr->instr, "Unimplemented NIR instr bit size");2247}2248break;2249}2250case nir_op_frcp: {2251if (dst.regClass() == v2b) {2252emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);2253} else if (dst.regClass() == v1) {2254Temp src = get_alu_src(ctx, instr->src[0]);2255emit_rcp(ctx, bld, Definition(dst), src);2256} else if (dst.regClass() == v2) {2257/* Lowered at NIR level for precision reasons. */2258emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);2259} else {2260isel_err(&instr->instr, "Unimplemented NIR instr bit size");2261}2262break;2263}2264case nir_op_fexp2: {2265if (dst.regClass() == v2b) {2266emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);2267} else if (dst.regClass() == v1) {2268emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);2269} else {2270isel_err(&instr->instr, "Unimplemented NIR instr bit size");2271}2272break;2273}2274case nir_op_fsqrt: {2275if (dst.regClass() == v2b) {2276emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);2277} else if (dst.regClass() == v1) {2278Temp src = get_alu_src(ctx, instr->src[0]);2279emit_sqrt(ctx, bld, Definition(dst), src);2280} else if (dst.regClass() == v2) {2281/* Lowered at NIR level for precision reasons. */2282emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);2283} else {2284isel_err(&instr->instr, "Unimplemented NIR instr bit size");2285}2286break;2287}2288case nir_op_ffract: {2289if (dst.regClass() == v2b) {2290emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);2291} else if (dst.regClass() == v1) {2292emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);2293} else if (dst.regClass() == v2) {2294emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);2295} else {2296isel_err(&instr->instr, "Unimplemented NIR instr bit size");2297}2298break;2299}2300case nir_op_ffloor: {2301if (dst.regClass() == v2b) {2302emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);2303} else if (dst.regClass() == v1) {2304emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);2305} else if (dst.regClass() == v2) {2306Temp src = get_alu_src(ctx, instr->src[0]);2307emit_floor_f64(ctx, bld, Definition(dst), src);2308} else {2309isel_err(&instr->instr, "Unimplemented NIR instr bit size");2310}2311break;2312}2313case nir_op_fceil: {2314if (dst.regClass() == v2b) {2315emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);2316} else if (dst.regClass() == v1) {2317emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);2318} else if (dst.regClass() == v2) {2319if (ctx->options->chip_class >= GFX7) {2320emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);2321} else {2322/* GFX6 doesn't support V_CEIL_F64, lower it. */2323/* trunc = trunc(src0)2324* if (src0 > 0.0 && src0 != trunc)2325* trunc += 1.02326*/2327Temp src0 = get_alu_src(ctx, instr->src[0]);2328Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);2329Temp tmp0 =2330bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());2331Temp tmp1 =2332bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);2333Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc),2334tmp0, tmp1);2335Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),2336bld.copy(bld.def(v1), Operand::zero()),2337bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);2338add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),2339bld.copy(bld.def(v1), Operand::zero()), add);2340bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);2341}2342} else {2343isel_err(&instr->instr, "Unimplemented NIR instr bit size");2344}2345break;2346}2347case nir_op_ftrunc: {2348if (dst.regClass() == v2b) {2349emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);2350} else if (dst.regClass() == v1) {2351emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);2352} else if (dst.regClass() == v2) {2353Temp src = get_alu_src(ctx, instr->src[0]);2354emit_trunc_f64(ctx, bld, Definition(dst), src);2355} else {2356isel_err(&instr->instr, "Unimplemented NIR instr bit size");2357}2358break;2359}2360case nir_op_fround_even: {2361if (dst.regClass() == v2b) {2362emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);2363} else if (dst.regClass() == v1) {2364emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);2365} else if (dst.regClass() == v2) {2366if (ctx->options->chip_class >= GFX7) {2367emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);2368} else {2369/* GFX6 doesn't support V_RNDNE_F64, lower it. */2370Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);2371Temp src0 = get_alu_src(ctx, instr->src[0]);2372bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);23732374Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),2375bld.copy(bld.def(s1), Operand::c32(-2u)));2376Temp bfi =2377bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,2378bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));2379Temp tmp =2380bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0,2381bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));2382Instruction* sub =2383bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp,2384bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));2385sub->vop3().neg[1] = true;2386tmp = sub->definitions[0].getTemp();23872388Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),2389Operand::c32(0x432fffffu));2390Instruction* vop3 =2391bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);2392vop3->vop3().abs[0] = true;2393Temp cond = vop3->definitions[0].getTemp();23942395Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);2396bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);2397Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,2398as_vgpr(ctx, src0_lo), cond);2399Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,2400as_vgpr(ctx, src0_hi), cond);24012402bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);2403}2404} else {2405isel_err(&instr->instr, "Unimplemented NIR instr bit size");2406}2407break;2408}2409case nir_op_fsin:2410case nir_op_fcos: {2411Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));2412aco_ptr<Instruction> norm;2413if (dst.regClass() == v2b) {2414Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3118u));2415Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);2416aco_opcode opcode =2417instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;2418bld.vop1(opcode, Definition(dst), tmp);2419} else if (dst.regClass() == v1) {2420Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3e22f983u));2421Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);24222423/* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */2424if (ctx->options->chip_class < GFX9)2425tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);24262427aco_opcode opcode =2428instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;2429bld.vop1(opcode, Definition(dst), tmp);2430} else {2431isel_err(&instr->instr, "Unimplemented NIR instr bit size");2432}2433break;2434}2435case nir_op_ldexp: {2436if (dst.regClass() == v2b) {2437emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);2438} else if (dst.regClass() == v1) {2439emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);2440} else if (dst.regClass() == v2) {2441emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);2442} else {2443isel_err(&instr->instr, "Unimplemented NIR instr bit size");2444}2445break;2446}2447case nir_op_frexp_sig: {2448if (dst.regClass() == v2b) {2449emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);2450} else if (dst.regClass() == v1) {2451emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);2452} else if (dst.regClass() == v2) {2453emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);2454} else {2455isel_err(&instr->instr, "Unimplemented NIR instr bit size");2456}2457break;2458}2459case nir_op_frexp_exp: {2460if (instr->src[0].src.ssa->bit_size == 16) {2461Temp src = get_alu_src(ctx, instr->src[0]);2462Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);2463tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());2464convert_int(ctx, bld, tmp, 8, 32, true, dst);2465} else if (instr->src[0].src.ssa->bit_size == 32) {2466emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);2467} else if (instr->src[0].src.ssa->bit_size == 64) {2468emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);2469} else {2470isel_err(&instr->instr, "Unimplemented NIR instr bit size");2471}2472break;2473}2474case nir_op_fsign: {2475Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));2476if (dst.regClass() == v2b) {2477assert(ctx->program->chip_class >= GFX9);2478/* replace negative zero with positive zero */2479src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src);2480src =2481bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, Operand::c16(1u));2482bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);2483} else if (dst.regClass() == v1) {2484src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);2485src =2486bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));2487bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);2488} else if (dst.regClass() == v2) {2489Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)),2490Operand::zero(), src);2491Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));2492Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,2493emit_extract_vector(ctx, src, 1, v1), cond);24942495cond =2496bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);2497tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));2498upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);24992500bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);2501} else {2502isel_err(&instr->instr, "Unimplemented NIR instr bit size");2503}2504break;2505}2506case nir_op_f2f16:2507case nir_op_f2f16_rtne: {2508Temp src = get_alu_src(ctx, instr->src[0]);2509if (instr->src[0].src.ssa->bit_size == 64)2510src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);2511if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)2512/* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to2513* keep value numbering and the scheduler simpler.2514*/2515bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);2516else2517bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);2518break;2519}2520case nir_op_f2f16_rtz: {2521Temp src = get_alu_src(ctx, instr->src[0]);2522if (instr->src[0].src.ssa->bit_size == 64)2523src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);2524if (ctx->block->fp_mode.round16_64 == fp_round_tz)2525bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);2526else if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)2527bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());2528else2529bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));2530break;2531}2532case nir_op_f2f32: {2533if (instr->src[0].src.ssa->bit_size == 16) {2534emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);2535} else if (instr->src[0].src.ssa->bit_size == 64) {2536emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);2537} else {2538isel_err(&instr->instr, "Unimplemented NIR instr bit size");2539}2540break;2541}2542case nir_op_f2f64: {2543Temp src = get_alu_src(ctx, instr->src[0]);2544if (instr->src[0].src.ssa->bit_size == 16)2545src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);2546bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);2547break;2548}2549case nir_op_i2f16: {2550assert(dst.regClass() == v2b);2551Temp src = get_alu_src(ctx, instr->src[0]);2552const unsigned input_size = instr->src[0].src.ssa->bit_size;2553if (input_size <= 16) {2554/* Expand integer to the size expected by the uint→float converter used below */2555unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);2556if (input_size != target_size) {2557src = convert_int(ctx, bld, src, input_size, target_size, true);2558}2559} else if (input_size == 64) {2560/* Truncate down to 32 bits; if any of the upper bits are relevant,2561* the value does not fall into the single-precision float range2562* anyway. SPIR-V does not mandate any specific behavior for such2563* large inputs.2564*/2565src = convert_int(ctx, bld, src, 64, 32, false);2566}25672568if (ctx->program->chip_class >= GFX8 && input_size <= 16) {2569bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);2570} else {2571/* Convert to f32 and then down to f16. This is needed to handle2572* inputs slightly outside the range [INT16_MIN, INT16_MAX],2573* which are representable via f16 but wouldn't be converted2574* correctly by v_cvt_f16_i16.2575*2576* This is also the fallback-path taken on GFX7 and earlier, which2577* do not support direct f16⟷i16 conversions.2578*/2579src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);2580bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);2581}2582break;2583}2584case nir_op_i2f32: {2585assert(dst.size() == 1);2586Temp src = get_alu_src(ctx, instr->src[0]);2587const unsigned input_size = instr->src[0].src.ssa->bit_size;2588if (input_size <= 32) {2589if (input_size <= 16) {2590/* Sign-extend to 32-bits */2591src = convert_int(ctx, bld, src, input_size, 32, true);2592}2593bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);2594} else {2595assert(input_size == 64);2596RegClass rc = RegClass(src.type(), 1);2597Temp lower = bld.tmp(rc), upper = bld.tmp(rc);2598bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);2599lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);2600upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);2601upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));2602upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);2603bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);2604}26052606break;2607}2608case nir_op_i2f64: {2609if (instr->src[0].src.ssa->bit_size <= 32) {2610Temp src = get_alu_src(ctx, instr->src[0]);2611if (instr->src[0].src.ssa->bit_size <= 16)2612src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);2613bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);2614} else if (instr->src[0].src.ssa->bit_size == 64) {2615Temp src = get_alu_src(ctx, instr->src[0]);2616RegClass rc = RegClass(src.type(), 1);2617Temp lower = bld.tmp(rc), upper = bld.tmp(rc);2618bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);2619lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);2620upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);2621upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));2622bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);26232624} else {2625isel_err(&instr->instr, "Unimplemented NIR instr bit size");2626}2627break;2628}2629case nir_op_u2f16: {2630assert(dst.regClass() == v2b);2631Temp src = get_alu_src(ctx, instr->src[0]);2632const unsigned input_size = instr->src[0].src.ssa->bit_size;2633if (input_size <= 16) {2634/* Expand integer to the size expected by the uint→float converter used below */2635unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);2636if (input_size != target_size) {2637src = convert_int(ctx, bld, src, input_size, target_size, false);2638}2639} else if (input_size == 64) {2640/* Truncate down to 32 bits; if any of the upper bits are non-zero,2641* the value does not fall into the single-precision float range2642* anyway. SPIR-V does not mandate any specific behavior for such2643* large inputs.2644*/2645src = convert_int(ctx, bld, src, 64, 32, false);2646}26472648if (ctx->program->chip_class >= GFX8) {2649/* float16 has a range of [0, 65519]. Converting from larger2650* inputs is UB, so we just need to consider the lower 16 bits */2651bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);2652} else {2653/* GFX7 and earlier do not support direct f16⟷u16 conversions */2654src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);2655bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);2656}2657break;2658}2659case nir_op_u2f32: {2660assert(dst.size() == 1);2661Temp src = get_alu_src(ctx, instr->src[0]);2662const unsigned input_size = instr->src[0].src.ssa->bit_size;2663if (input_size == 8) {2664bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);2665} else if (input_size <= 32) {2666if (input_size == 16)2667src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);2668bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);2669} else {2670assert(input_size == 64);2671RegClass rc = RegClass(src.type(), 1);2672Temp lower = bld.tmp(rc), upper = bld.tmp(rc);2673bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);2674lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);2675upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);2676upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));2677upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);2678bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);2679}2680break;2681}2682case nir_op_u2f64: {2683if (instr->src[0].src.ssa->bit_size <= 32) {2684Temp src = get_alu_src(ctx, instr->src[0]);2685if (instr->src[0].src.ssa->bit_size <= 16)2686src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);2687bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);2688} else if (instr->src[0].src.ssa->bit_size == 64) {2689Temp src = get_alu_src(ctx, instr->src[0]);2690RegClass rc = RegClass(src.type(), 1);2691Temp lower = bld.tmp(rc), upper = bld.tmp(rc);2692bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);2693lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);2694upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);2695upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));2696bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);2697} else {2698isel_err(&instr->instr, "Unimplemented NIR instr bit size");2699}2700break;2701}2702case nir_op_f2i8:2703case nir_op_f2i16: {2704if (instr->src[0].src.ssa->bit_size == 16) {2705if (ctx->program->chip_class >= GFX8) {2706emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);2707} else {2708/* GFX7 and earlier do not support direct f16⟷i16 conversions */2709Temp tmp = bld.tmp(v1);2710emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);2711tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);2712tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,2713(dst.type() == RegType::sgpr) ? Temp() : dst);2714if (dst.type() == RegType::sgpr) {2715bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);2716}2717}2718} else if (instr->src[0].src.ssa->bit_size == 32) {2719emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);2720} else {2721emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);2722}2723break;2724}2725case nir_op_f2u8:2726case nir_op_f2u16: {2727if (instr->src[0].src.ssa->bit_size == 16) {2728if (ctx->program->chip_class >= GFX8) {2729emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);2730} else {2731/* GFX7 and earlier do not support direct f16⟷u16 conversions */2732Temp tmp = bld.tmp(v1);2733emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);2734tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);2735tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,2736(dst.type() == RegType::sgpr) ? Temp() : dst);2737if (dst.type() == RegType::sgpr) {2738bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);2739}2740}2741} else if (instr->src[0].src.ssa->bit_size == 32) {2742emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);2743} else {2744emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);2745}2746break;2747}2748case nir_op_f2i32: {2749Temp src = get_alu_src(ctx, instr->src[0]);2750if (instr->src[0].src.ssa->bit_size == 16) {2751Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);2752if (dst.type() == RegType::vgpr) {2753bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);2754} else {2755bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),2756bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));2757}2758} else if (instr->src[0].src.ssa->bit_size == 32) {2759emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);2760} else if (instr->src[0].src.ssa->bit_size == 64) {2761emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);2762} else {2763isel_err(&instr->instr, "Unimplemented NIR instr bit size");2764}2765break;2766}2767case nir_op_f2u32: {2768Temp src = get_alu_src(ctx, instr->src[0]);2769if (instr->src[0].src.ssa->bit_size == 16) {2770Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);2771if (dst.type() == RegType::vgpr) {2772bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);2773} else {2774bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),2775bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));2776}2777} else if (instr->src[0].src.ssa->bit_size == 32) {2778emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);2779} else if (instr->src[0].src.ssa->bit_size == 64) {2780emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);2781} else {2782isel_err(&instr->instr, "Unimplemented NIR instr bit size");2783}2784break;2785}2786case nir_op_f2i64: {2787Temp src = get_alu_src(ctx, instr->src[0]);2788if (instr->src[0].src.ssa->bit_size == 16)2789src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);27902791if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {2792Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);2793exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::zero(), exponent,2794Operand::c32(64u));2795Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);2796Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), src);2797mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);2798mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), mantissa);2799mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);2800Temp new_exponent = bld.tmp(v1);2801Temp borrow =2802bld.vsub32(Definition(new_exponent), Operand::c32(63u), exponent, true).def(1).getTemp();2803if (ctx->program->chip_class >= GFX8)2804mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);2805else2806mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);2807Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand::c32(0xfffffffeu));2808Temp lower = bld.tmp(v1), upper = bld.tmp(v1);2809bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);2810lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower,2811Operand::c32(0xffffffffu), borrow);2812upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);2813lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);2814upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);2815Temp new_lower = bld.tmp(v1);2816borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();2817Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);2818bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);28192820} else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {2821if (src.type() == RegType::vgpr)2822src = bld.as_uniform(src);2823Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,2824Operand::c32(0x80017u));2825exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,2826Operand::c32(126u));2827exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),2828exponent);2829exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc),2830Operand::c32(64u), exponent);2831Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),2832Operand::c32(0x7fffffu), src);2833Temp sign =2834bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(31u));2835mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),2836Operand::c32(0x800000u), mantissa);2837mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa,2838Operand::c32(7u));2839mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);2840exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),2841Operand::c32(63u), exponent);2842mantissa =2843bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);2844Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent,2845Operand::c32(0xffffffffu)); // exp >= 642846Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand::c32(0xfffffffeu));2847mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);2848Temp lower = bld.tmp(s1), upper = bld.tmp(s1);2849bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);2850lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);2851upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);2852Temp borrow = bld.tmp(s1);2853lower =2854bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);2855upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign,2856bld.scc(borrow));2857bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);28582859} else if (instr->src[0].src.ssa->bit_size == 64) {2860Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),2861Operand::c32(0x3df00000u));2862Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);2863Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);2864vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),2865Operand::c32(0xc1f00000u));2866Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);2867Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);2868Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);2869Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);2870if (dst.type() == RegType::sgpr) {2871lower = bld.as_uniform(lower);2872upper = bld.as_uniform(upper);2873}2874bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);28752876} else {2877isel_err(&instr->instr, "Unimplemented NIR instr bit size");2878}2879break;2880}2881case nir_op_f2u64: {2882Temp src = get_alu_src(ctx, instr->src[0]);2883if (instr->src[0].src.ssa->bit_size == 16)2884src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);28852886if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {2887Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);2888Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)),2889Operand::c32(64u), exponent);2890exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::zero(), exponent);2891Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);2892mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);2893Temp exponent_small = bld.vsub32(bld.def(v1), Operand::c32(24u), exponent);2894Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);2895mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);2896Temp new_exponent = bld.tmp(v1);2897Temp cond_small =2898bld.vsub32(Definition(new_exponent), exponent, Operand::c32(24u), true).def(1).getTemp();2899if (ctx->program->chip_class >= GFX8)2900mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);2901else2902mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);2903Temp lower = bld.tmp(v1), upper = bld.tmp(v1);2904bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);2905lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);2906upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand::zero(),2907cond_small);2908lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), lower,2909exponent_in_range);2910upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), upper,2911exponent_in_range);2912bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);29132914} else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {2915if (src.type() == RegType::vgpr)2916src = bld.as_uniform(src);2917Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,2918Operand::c32(0x80017u));2919exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,2920Operand::c32(126u));2921exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),2922exponent);2923Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),2924Operand::c32(0x7fffffu), src);2925mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),2926Operand::c32(0x800000u), mantissa);2927Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),2928Operand::c32(24u), exponent);2929Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa,2930exponent_small);2931mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);2932Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),2933exponent, Operand::c32(24u));2934mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa,2935exponent_large);2936Temp cond =2937bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand::c32(64u), exponent);2938mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa,2939Operand::c32(0xffffffffu), cond);2940Temp lower = bld.tmp(s1), upper = bld.tmp(s1);2941bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);2942Temp cond_small =2943bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand::c32(24u));2944lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);2945upper =2946bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::zero(), upper, cond_small);2947bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);29482949} else if (instr->src[0].src.ssa->bit_size == 64) {2950Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),2951Operand::c32(0x3df00000u));2952Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);2953Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);2954vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),2955Operand::c32(0xc1f00000u));2956Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);2957Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);2958Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);2959Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);2960if (dst.type() == RegType::sgpr) {2961lower = bld.as_uniform(lower);2962upper = bld.as_uniform(upper);2963}2964bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);29652966} else {2967isel_err(&instr->instr, "Unimplemented NIR instr bit size");2968}2969break;2970}2971case nir_op_b2f16: {2972Temp src = get_alu_src(ctx, instr->src[0]);2973assert(src.regClass() == bld.lm);29742975if (dst.regClass() == s1) {2976src = bool_to_scalar_condition(ctx, src);2977bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);2978} else if (dst.regClass() == v2b) {2979Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));2980bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);2981} else {2982unreachable("Wrong destination register class for nir_op_b2f16.");2983}2984break;2985}2986case nir_op_b2f32: {2987Temp src = get_alu_src(ctx, instr->src[0]);2988assert(src.regClass() == bld.lm);29892990if (dst.regClass() == s1) {2991src = bool_to_scalar_condition(ctx, src);2992bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);2993} else if (dst.regClass() == v1) {2994bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),2995Operand::c32(0x3f800000u), src);2996} else {2997unreachable("Wrong destination register class for nir_op_b2f32.");2998}2999break;3000}3001case nir_op_b2f64: {3002Temp src = get_alu_src(ctx, instr->src[0]);3003assert(src.regClass() == bld.lm);30043005if (dst.regClass() == s2) {3006src = bool_to_scalar_condition(ctx, src);3007bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),3008Operand::zero(), bld.scc(src));3009} else if (dst.regClass() == v2) {3010Temp one = bld.copy(bld.def(v2), Operand::c32(0x3FF00000u));3011Temp upper =3012bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);3013bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);3014} else {3015unreachable("Wrong destination register class for nir_op_b2f64.");3016}3017break;3018}3019case nir_op_i2i8:3020case nir_op_i2i16:3021case nir_op_i2i32:3022case nir_op_i2i64: {3023if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {3024/* no need to do the extract in get_alu_src() */3025sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size3026? sgpr_extract_sext3027: sgpr_extract_undef;3028extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);3029} else {3030const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;3031const unsigned output_bitsize = instr->dest.dest.ssa.bit_size;3032convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,3033output_bitsize > input_bitsize, dst);3034}3035break;3036}3037case nir_op_u2u8:3038case nir_op_u2u16:3039case nir_op_u2u32:3040case nir_op_u2u64: {3041if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {3042/* no need to do the extract in get_alu_src() */3043sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size3044? sgpr_extract_zext3045: sgpr_extract_undef;3046extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);3047} else {3048convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,3049instr->dest.dest.ssa.bit_size, false, dst);3050}3051break;3052}3053case nir_op_b2b32:3054case nir_op_b2i8:3055case nir_op_b2i16:3056case nir_op_b2i32:3057case nir_op_b2i64: {3058Temp src = get_alu_src(ctx, instr->src[0]);3059assert(src.regClass() == bld.lm);30603061Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;3062if (tmp.regClass() == s1) {3063bool_to_scalar_condition(ctx, src, tmp);3064} else if (tmp.type() == RegType::vgpr) {3065bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand::zero(), Operand::c32(1u),3066src);3067} else {3068unreachable("Invalid register class for b2i32");3069}30703071if (tmp != dst)3072bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());3073break;3074}3075case nir_op_b2b1:3076case nir_op_i2b1: {3077Temp src = get_alu_src(ctx, instr->src[0]);3078assert(dst.regClass() == bld.lm);30793080if (src.type() == RegType::vgpr) {3081assert(src.regClass() == v1 || src.regClass() == v2);3082assert(dst.regClass() == bld.lm);3083bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,3084Definition(dst), Operand::zero(), src)3085.def(0)3086.setHint(vcc);3087} else {3088assert(src.regClass() == s1 || src.regClass() == s2);3089Temp tmp;3090if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {3091tmp =3092bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)3093.def(1)3094.getTemp();3095} else {3096tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,3097bld.scc(bld.def(s1)), Operand::zero(), src);3098}3099bool_to_vector_condition(ctx, tmp, dst);3100}3101break;3102}3103case nir_op_unpack_64_2x32:3104case nir_op_unpack_32_2x16:3105case nir_op_unpack_64_4x16:3106bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));3107emit_split_vector(ctx, dst, instr->op == nir_op_unpack_64_4x16 ? 4 : 2);3108break;3109case nir_op_pack_64_2x32_split: {3110Temp src0 = get_alu_src(ctx, instr->src[0]);3111Temp src1 = get_alu_src(ctx, instr->src[1]);31123113bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);3114break;3115}3116case nir_op_unpack_64_2x32_split_x:3117bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),3118get_alu_src(ctx, instr->src[0]));3119break;3120case nir_op_unpack_64_2x32_split_y:3121bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),3122get_alu_src(ctx, instr->src[0]));3123break;3124case nir_op_unpack_32_2x16_split_x:3125if (dst.type() == RegType::vgpr) {3126bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),3127get_alu_src(ctx, instr->src[0]));3128} else {3129bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));3130}3131break;3132case nir_op_unpack_32_2x16_split_y:3133if (dst.type() == RegType::vgpr) {3134bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),3135get_alu_src(ctx, instr->src[0]));3136} else {3137bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),3138get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),3139Operand::zero());3140}3141break;3142case nir_op_pack_32_2x16_split: {3143Temp src0 = get_alu_src(ctx, instr->src[0]);3144Temp src1 = get_alu_src(ctx, instr->src[1]);3145if (dst.regClass() == v1) {3146src0 = emit_extract_vector(ctx, src0, 0, v2b);3147src1 = emit_extract_vector(ctx, src1, 0, v2b);3148bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);3149} else {3150src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,3151Operand::c32(0xFFFFu));3152src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,3153Operand::c32(16u));3154bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);3155}3156break;3157}3158case nir_op_pack_half_2x16_split: {3159if (dst.regClass() == v1) {3160nir_const_value* val = nir_src_as_const_value(instr->src[1].src);3161if (val && val->u32 == 0 && ctx->program->chip_class <= GFX9) {3162/* upper bits zero on GFX6-GFX9 */3163bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), get_alu_src(ctx, instr->src[0]));3164} else if (!ctx->block->fp_mode.care_about_round16_64 ||3165ctx->block->fp_mode.round16_64 == fp_round_tz) {3166if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)3167emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);3168else3169emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);3170} else {3171Temp src0 =3172bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[0]));3173Temp src1 =3174bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[1]));3175bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);3176}3177} else {3178isel_err(&instr->instr, "Unimplemented NIR instr bit size");3179}3180break;3181}3182case nir_op_unpack_half_2x16_split_x_flush_to_zero:3183case nir_op_unpack_half_2x16_split_x: {3184Temp src = get_alu_src(ctx, instr->src[0]);3185if (src.regClass() == v1)3186src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);3187if (dst.regClass() == v1) {3188assert(ctx->block->fp_mode.must_flush_denorms16_64 ==3189(instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero));3190bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);3191} else {3192isel_err(&instr->instr, "Unimplemented NIR instr bit size");3193}3194break;3195}3196case nir_op_unpack_half_2x16_split_y_flush_to_zero:3197case nir_op_unpack_half_2x16_split_y: {3198Temp src = get_alu_src(ctx, instr->src[0]);3199if (src.regClass() == s1)3200src =3201bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(16u));3202else3203src =3204bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();3205if (dst.regClass() == v1) {3206assert(ctx->block->fp_mode.must_flush_denorms16_64 ==3207(instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero));3208bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);3209} else {3210isel_err(&instr->instr, "Unimplemented NIR instr bit size");3211}3212break;3213}3214case nir_op_sad_u8x4: {3215assert(dst.regClass() == v1);3216emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false);3217break;3218}3219case nir_op_fquantize2f16: {3220Temp src = get_alu_src(ctx, instr->src[0]);3221Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);3222Temp f32, cmp_res;32233224if (ctx->program->chip_class >= GFX8) {3225Temp mask = bld.copy(3226bld.def(s1), Operand::c32(0x36Fu)); /* value is NOT negative/positive denormal value */3227cmp_res =3228bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);3229f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);3230} else {3231/* 0x38800000 is smallest half float value (2^-14) in 32-bit float,3232* so compare the result and flush to 0 if it's smaller.3233*/3234f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);3235Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));3236Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);3237tmp0->vop3().abs[0] = true;3238Temp tmp1 =3239bld.vopc(aco_opcode::v_cmp_lg_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), f32);3240cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc),3241tmp0->definitions[0].getTemp(), tmp1);3242}32433244if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {3245Temp copysign_0 =3246bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));3247bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);3248} else {3249bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res);3250}3251break;3252}3253case nir_op_bfm: {3254Temp bits = get_alu_src(ctx, instr->src[0]);3255Temp offset = get_alu_src(ctx, instr->src[1]);32563257if (dst.regClass() == s1) {3258bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);3259} else if (dst.regClass() == v1) {3260bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);3261} else {3262isel_err(&instr->instr, "Unimplemented NIR instr bit size");3263}3264break;3265}3266case nir_op_bitfield_select: {32673268/* dst = (insert & bitmask) | (base & ~bitmask) */3269if (dst.regClass() == s1) {3270Temp bitmask = get_alu_src(ctx, instr->src[0]);3271Temp insert = get_alu_src(ctx, instr->src[1]);3272Temp base = get_alu_src(ctx, instr->src[2]);3273aco_ptr<Instruction> sop2;3274nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);3275nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);3276Operand lhs;3277if (const_insert && const_bitmask) {3278lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);3279} else {3280insert =3281bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);3282lhs = Operand(insert);3283}32843285Operand rhs;3286nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);3287if (const_base && const_bitmask) {3288rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);3289} else {3290base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);3291rhs = Operand(base);3292}32933294bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);32953296} else if (dst.regClass() == v1) {3297emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);3298} else {3299isel_err(&instr->instr, "Unimplemented NIR instr bit size");3300}3301break;3302}3303case nir_op_ubfe:3304case nir_op_ibfe: {3305if (dst.bytes() != 4)3306unreachable("Unsupported BFE bit size");33073308if (dst.type() == RegType::sgpr) {3309Temp base = get_alu_src(ctx, instr->src[0]);33103311nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);3312nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);3313if (const_offset && const_bits) {3314uint32_t extract = (const_bits->u32 << 16) | (const_offset->u32 & 0x1f);3315aco_opcode opcode =3316instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;3317bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));3318break;3319}33203321Temp offset = get_alu_src(ctx, instr->src[1]);3322Temp bits = get_alu_src(ctx, instr->src[2]);3323if (instr->op == nir_op_ubfe) {3324Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);3325Temp masked =3326bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);3327bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);3328} else {3329Operand bits_op = const_bits ? Operand::c32(const_bits->u32 << 16)3330: bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1),3331bld.def(s1, scc), bits, Operand::c32(16u));3332Operand offset_op = const_offset3333? Operand::c32(const_offset->u32 & 0x1fu)3334: bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),3335offset, Operand::c32(0x1fu));33363337Temp extract =3338bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);3339bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);3340}33413342} else {3343aco_opcode opcode =3344instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;3345emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);3346}3347break;3348}3349case nir_op_extract_u8:3350case nir_op_extract_i8:3351case nir_op_extract_u16:3352case nir_op_extract_i16: {3353bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;3354unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;3355uint32_t bits = comp == 4 ? 8 : 16;3356unsigned index = nir_src_as_uint(instr->src[1].src);3357if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {3358assert(index == 0);3359bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));3360} else if (dst.regClass() == s1 && instr->dest.dest.ssa.bit_size == 16) {3361Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);3362unsigned swizzle = instr->src[0].swizzle[0];3363if (vec.size() > 1) {3364vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);3365swizzle = swizzle & 1;3366}3367index += swizzle * instr->dest.dest.ssa.bit_size / bits;3368bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),3369Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));3370} else {3371Temp src = get_alu_src(ctx, instr->src[0]);3372Definition def(dst);3373if (dst.bytes() == 8) {3374src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));3375index %= comp;3376def = bld.def(src.type(), 1);3377}3378assert(def.bytes() <= 4);3379if (def.regClass() == s1) {3380bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),3381Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));3382} else {3383src = emit_extract_vector(ctx, src, 0, def.regClass());3384bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),3385Operand::c32(bits), Operand::c32(is_signed));3386}3387if (dst.size() == 2)3388bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),3389Operand::zero());3390}3391break;3392}3393case nir_op_insert_u8:3394case nir_op_insert_u16: {3395unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;3396uint32_t bits = comp == 4 ? 8 : 16;3397unsigned index = nir_src_as_uint(instr->src[1].src);3398if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {3399assert(index == 0);3400bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));3401} else {3402Temp src = get_alu_src(ctx, instr->src[0]);3403Definition def(dst);3404bool swap = false;3405if (dst.bytes() == 8) {3406src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));3407swap = index >= comp;3408index %= comp;3409def = bld.def(src.type(), 1);3410}3411if (def.regClass() == s1) {3412bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),3413Operand::c32(index), Operand::c32(bits));3414} else {3415src = emit_extract_vector(ctx, src, 0, def.regClass());3416bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),3417Operand::c32(bits));3418}3419if (dst.size() == 2 && swap)3420bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),3421def.getTemp());3422else if (dst.size() == 2)3423bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),3424Operand::zero());3425}3426break;3427}3428case nir_op_bit_count: {3429Temp src = get_alu_src(ctx, instr->src[0]);3430if (src.regClass() == s1) {3431bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);3432} else if (src.regClass() == v1) {3433bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());3434} else if (src.regClass() == v2) {3435bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),3436bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),3437emit_extract_vector(ctx, src, 0, v1), Operand::zero()));3438} else if (src.regClass() == s2) {3439bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);3440} else {3441isel_err(&instr->instr, "Unimplemented NIR instr bit size");3442}3443break;3444}3445case nir_op_flt: {3446emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,3447aco_opcode::v_cmp_lt_f64);3448break;3449}3450case nir_op_fge: {3451emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,3452aco_opcode::v_cmp_ge_f64);3453break;3454}3455case nir_op_feq: {3456emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,3457aco_opcode::v_cmp_eq_f64);3458break;3459}3460case nir_op_fneu: {3461emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,3462aco_opcode::v_cmp_neq_f64);3463break;3464}3465case nir_op_ilt: {3466emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,3467aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);3468break;3469}3470case nir_op_ige: {3471emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,3472aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);3473break;3474}3475case nir_op_ieq: {3476if (instr->src[0].src.ssa->bit_size == 1)3477emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);3478else3479emit_comparison(3480ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,3481aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,3482ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);3483break;3484}3485case nir_op_ine: {3486if (instr->src[0].src.ssa->bit_size == 1)3487emit_boolean_logic(ctx, instr, Builder::s_xor, dst);3488else3489emit_comparison(3490ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,3491aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,3492ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);3493break;3494}3495case nir_op_ult: {3496emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,3497aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);3498break;3499}3500case nir_op_uge: {3501emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,3502aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);3503break;3504}3505case nir_op_fddx:3506case nir_op_fddy:3507case nir_op_fddx_fine:3508case nir_op_fddy_fine:3509case nir_op_fddx_coarse:3510case nir_op_fddy_coarse: {3511Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));3512uint16_t dpp_ctrl1, dpp_ctrl2;3513if (instr->op == nir_op_fddx_fine) {3514dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);3515dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);3516} else if (instr->op == nir_op_fddy_fine) {3517dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);3518dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);3519} else {3520dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);3521if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)3522dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);3523else3524dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);3525}35263527Temp tmp;3528if (ctx->program->chip_class >= GFX8) {3529Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);3530tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);3531} else {3532Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);3533Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);3534tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);3535}3536emit_wqm(bld, tmp, dst, true);3537break;3538}3539default: isel_err(&instr->instr, "Unknown NIR ALU instr");3540}3541}35423543void3544visit_load_const(isel_context* ctx, nir_load_const_instr* instr)3545{3546Temp dst = get_ssa_temp(ctx, &instr->def);35473548// TODO: we really want to have the resulting type as this would allow for 64bit literals3549// which get truncated the lsb if double and msb if int3550// for now, we only use s_mov_b64 with 64bit inline constants3551assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");3552assert(dst.type() == RegType::sgpr);35533554Builder bld(ctx->program, ctx->block);35553556if (instr->def.bit_size == 1) {3557assert(dst.regClass() == bld.lm);3558int val = instr->value[0].b ? -1 : 0;3559Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);3560bld.copy(Definition(dst), op);3561} else if (instr->def.bit_size == 8) {3562bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));3563} else if (instr->def.bit_size == 16) {3564/* sign-extend to use s_movk_i32 instead of a literal */3565bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));3566} else if (dst.size() == 1) {3567bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));3568} else {3569assert(dst.size() != 1);3570aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(3571aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};3572if (instr->def.bit_size == 64)3573for (unsigned i = 0; i < dst.size(); i++)3574vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);3575else {3576for (unsigned i = 0; i < dst.size(); i++)3577vec->operands[i] = Operand::c32(instr->value[i].u32);3578}3579vec->definitions[0] = Definition(dst);3580ctx->block->instructions.emplace_back(std::move(vec));3581}3582}35833584uint32_t3585widen_mask(uint32_t mask, unsigned multiplier)3586{3587uint32_t new_mask = 0;3588for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)3589if (mask & (1u << i))3590new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);3591return new_mask;3592}35933594struct LoadEmitInfo {3595Operand offset;3596Temp dst;3597unsigned num_components;3598unsigned component_size;3599Temp resource = Temp(0, s1);3600unsigned component_stride = 0;3601unsigned const_offset = 0;3602unsigned align_mul = 0;3603unsigned align_offset = 0;36043605bool glc = false;3606bool slc = false;3607unsigned swizzle_component_size = 0;3608memory_sync_info sync;3609Temp soffset = Temp(0, s1);3610};36113612struct EmitLoadParameters {3613using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,3614unsigned bytes_needed, unsigned align, unsigned const_offset,3615Temp dst_hint);36163617Callback callback;3618bool byte_align_loads;3619bool supports_8bit_16bit_loads;3620unsigned max_const_offset_plus_one;3621};36223623void3624emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,3625const EmitLoadParameters& params)3626{3627unsigned load_size = info.num_components * info.component_size;3628unsigned component_size = info.component_size;36293630unsigned num_vals = 0;3631Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));36323633unsigned const_offset = info.const_offset;36343635const unsigned align_mul = info.align_mul ? info.align_mul : component_size;3636unsigned align_offset = (info.align_offset + const_offset) % align_mul;36373638unsigned bytes_read = 0;3639while (bytes_read < load_size) {3640unsigned bytes_needed = load_size - bytes_read;36413642/* add buffer for unaligned loads */3643int byte_align = 0;3644if (params.byte_align_loads) {3645byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;3646}36473648if (byte_align) {3649if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||3650!params.supports_8bit_16bit_loads) {3651if (info.component_stride) {3652assert(params.supports_8bit_16bit_loads && "unimplemented");3653bytes_needed = 2;3654byte_align = 0;3655} else {3656bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;3657bytes_needed = align(bytes_needed, 4);3658}3659} else {3660byte_align = 0;3661}3662}36633664if (info.swizzle_component_size)3665bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);3666if (info.component_stride)3667bytes_needed = MIN2(bytes_needed, info.component_size);36683669bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);36703671/* reduce constant offset */3672Operand offset = info.offset;3673unsigned reduced_const_offset = const_offset;3674bool remove_const_offset_completely = need_to_align_offset;3675if (const_offset &&3676(remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {3677unsigned to_add = const_offset;3678if (remove_const_offset_completely) {3679reduced_const_offset = 0;3680} else {3681to_add =3682const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;3683reduced_const_offset %= params.max_const_offset_plus_one;3684}3685Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();3686if (offset.isConstant()) {3687offset = Operand::c32(offset.constantValue() + to_add);3688} else if (offset_tmp.regClass() == s1) {3689offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,3690Operand::c32(to_add));3691} else if (offset_tmp.regClass() == v1) {3692offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));3693} else {3694Temp lo = bld.tmp(offset_tmp.type(), 1);3695Temp hi = bld.tmp(offset_tmp.type(), 1);3696bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);36973698if (offset_tmp.regClass() == s2) {3699Temp carry = bld.tmp(s1);3700lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,3701Operand::c32(to_add));3702hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);3703offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);3704} else {3705Temp new_lo = bld.tmp(v1);3706Temp carry =3707bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();3708hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);3709offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);3710}3711}3712}37133714/* align offset down if needed */3715Operand aligned_offset = offset;3716unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;3717if (need_to_align_offset) {3718align = 4;3719Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();3720if (offset.isConstant()) {3721aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);3722} else if (offset_tmp.regClass() == s1) {3723aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),3724Operand::c32(0xfffffffcu), offset_tmp);3725} else if (offset_tmp.regClass() == s2) {3726aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),3727Operand::c64(0xfffffffffffffffcllu), offset_tmp);3728} else if (offset_tmp.regClass() == v1) {3729aligned_offset =3730bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);3731} else if (offset_tmp.regClass() == v2) {3732Temp hi = bld.tmp(v1), lo = bld.tmp(v1);3733bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);3734lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);3735aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);3736}3737}3738Temp aligned_offset_tmp =3739aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset);37403741Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,3742reduced_const_offset, byte_align ? Temp() : info.dst);37433744/* the callback wrote directly to dst */3745if (val == info.dst) {3746assert(num_vals == 0);3747emit_split_vector(ctx, info.dst, info.num_components);3748return;3749}37503751/* shift result right if needed */3752if (params.byte_align_loads && info.component_size < 4) {3753Operand byte_align_off = Operand::c32(byte_align);3754if (byte_align == -1) {3755if (offset.isConstant())3756byte_align_off = Operand::c32(offset.constantValue() % 4u);3757else if (offset.size() == 2)3758byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,3759RegClass(offset.getTemp().type(), 1)));3760else3761byte_align_off = offset;3762}37633764assert(val.bytes() >= load_size && "unimplemented");3765if (val.type() == RegType::sgpr)3766byte_align_scalar(ctx, val, byte_align_off, info.dst);3767else3768byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);3769return;3770}37713772/* add result to list and advance */3773if (info.component_stride) {3774assert(val.bytes() == info.component_size && "unimplemented");3775const_offset += info.component_stride;3776align_offset = (align_offset + info.component_stride) % align_mul;3777} else {3778const_offset += val.bytes();3779align_offset = (align_offset + val.bytes()) % align_mul;3780}3781bytes_read += val.bytes();3782vals[num_vals++] = val;3783}37843785/* create array of components */3786unsigned components_split = 0;3787std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;3788bool has_vgprs = false;3789for (unsigned i = 0; i < num_vals;) {3790Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));3791unsigned num_tmps = 0;3792unsigned tmp_size = 0;3793RegType reg_type = RegType::sgpr;3794while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {3795if (vals[i].type() == RegType::vgpr)3796reg_type = RegType::vgpr;3797tmp_size += vals[i].bytes();3798tmp[num_tmps++] = vals[i++];3799}3800if (num_tmps > 1) {3801aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(3802aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};3803for (unsigned j = 0; j < num_tmps; j++)3804vec->operands[j] = Operand(tmp[j]);3805tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));3806vec->definitions[0] = Definition(tmp[0]);3807bld.insert(std::move(vec));3808}38093810if (tmp[0].bytes() % component_size) {3811/* trim tmp[0] */3812assert(i == num_vals);3813RegClass new_rc =3814RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);3815tmp[0] =3816bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());3817}38183819RegClass elem_rc = RegClass::get(reg_type, component_size);38203821unsigned start = components_split;38223823if (tmp_size == elem_rc.bytes()) {3824allocated_vec[components_split++] = tmp[0];3825} else {3826assert(tmp_size % elem_rc.bytes() == 0);3827aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(3828aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};3829for (auto& def : split->definitions) {3830Temp component = bld.tmp(elem_rc);3831allocated_vec[components_split++] = component;3832def = Definition(component);3833}3834split->operands[0] = Operand(tmp[0]);3835bld.insert(std::move(split));3836}38373838/* try to p_as_uniform early so we can create more optimizable code and3839* also update allocated_vec */3840for (unsigned j = start; j < components_split; j++) {3841if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr)3842allocated_vec[j] = bld.as_uniform(allocated_vec[j]);3843has_vgprs |= allocated_vec[j].type() == RegType::vgpr;3844}3845}38463847/* concatenate components and p_as_uniform() result if needed */3848if (info.dst.type() == RegType::vgpr || !has_vgprs)3849ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);38503851int padding_bytes =3852MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);38533854aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(3855aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)};3856for (unsigned i = 0; i < info.num_components; i++)3857vec->operands[i] = Operand(allocated_vec[i]);3858if (padding_bytes)3859vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));3860if (info.dst.type() == RegType::sgpr && has_vgprs) {3861Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());3862vec->definitions[0] = Definition(tmp);3863bld.insert(std::move(vec));3864bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);3865} else {3866vec->definitions[0] = Definition(info.dst);3867bld.insert(std::move(vec));3868}3869}38703871Operand3872load_lds_size_m0(Builder& bld)3873{3874/* TODO: m0 does not need to be initialized on GFX9+ */3875return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));3876}38773878Temp3879lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,3880unsigned align, unsigned const_offset, Temp dst_hint)3881{3882offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;38833884Operand m = load_lds_size_m0(bld);38853886bool large_ds_read = bld.program->chip_class >= GFX7;3887bool usable_read2 = bld.program->chip_class >= GFX7;38883889bool read2 = false;3890unsigned size = 0;3891aco_opcode op;3892if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {3893size = 16;3894op = aco_opcode::ds_read_b128;3895} else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {3896size = 16;3897read2 = true;3898op = aco_opcode::ds_read2_b64;3899} else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {3900size = 12;3901op = aco_opcode::ds_read_b96;3902} else if (bytes_needed >= 8 && align % 8 == 0) {3903size = 8;3904op = aco_opcode::ds_read_b64;3905} else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {3906size = 8;3907read2 = true;3908op = aco_opcode::ds_read2_b32;3909} else if (bytes_needed >= 4 && align % 4 == 0) {3910size = 4;3911op = aco_opcode::ds_read_b32;3912} else if (bytes_needed >= 2 && align % 2 == 0) {3913size = 2;3914op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;3915} else {3916size = 1;3917op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;3918}39193920unsigned const_offset_unit = read2 ? size / 2u : 1u;3921unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;39223923if (const_offset > (const_offset_range - const_offset_unit)) {3924unsigned excess = const_offset - (const_offset % const_offset_range);3925offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));3926const_offset -= excess;3927}39283929const_offset /= const_offset_unit;39303931RegClass rc = RegClass::get(RegType::vgpr, size);3932Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);3933Instruction* instr;3934if (read2)3935instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);3936else3937instr = bld.ds(op, Definition(val), offset, m, const_offset);3938instr->ds().sync = info.sync;39393940return val;3941}39423943const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};39443945Temp3946smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,3947unsigned align, unsigned const_offset, Temp dst_hint)3948{3949unsigned size = 0;3950aco_opcode op;3951if (bytes_needed <= 4) {3952size = 1;3953op = info.resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;3954} else if (bytes_needed <= 8) {3955size = 2;3956op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;3957} else if (bytes_needed <= 16) {3958size = 4;3959op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;3960} else if (bytes_needed <= 32) {3961size = 8;3962op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;3963} else {3964size = 16;3965op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;3966}3967aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};3968if (info.resource.id()) {3969load->operands[0] = Operand(info.resource);3970load->operands[1] = Operand(offset);3971} else {3972load->operands[0] = Operand(offset);3973load->operands[1] = Operand::zero();3974}3975RegClass rc(RegType::sgpr, size);3976Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);3977load->definitions[0] = Definition(val);3978load->glc = info.glc;3979load->dlc = info.glc && bld.program->chip_class >= GFX10;3980load->sync = info.sync;3981bld.insert(std::move(load));3982return val;3983}39843985const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};39863987Temp3988mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,3989unsigned align_, unsigned const_offset, Temp dst_hint)3990{3991Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);3992Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);39933994if (info.soffset.id()) {3995if (soffset.isTemp())3996vaddr = bld.copy(bld.def(v1), soffset);3997soffset = Operand(info.soffset);3998}39994000unsigned bytes_size = 0;4001aco_opcode op;4002if (bytes_needed == 1 || align_ % 2) {4003bytes_size = 1;4004op = aco_opcode::buffer_load_ubyte;4005} else if (bytes_needed == 2 || align_ % 4) {4006bytes_size = 2;4007op = aco_opcode::buffer_load_ushort;4008} else if (bytes_needed <= 4) {4009bytes_size = 4;4010op = aco_opcode::buffer_load_dword;4011} else if (bytes_needed <= 8) {4012bytes_size = 8;4013op = aco_opcode::buffer_load_dwordx2;4014} else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {4015bytes_size = 12;4016op = aco_opcode::buffer_load_dwordx3;4017} else {4018bytes_size = 16;4019op = aco_opcode::buffer_load_dwordx4;4020}4021aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};4022mubuf->operands[0] = Operand(info.resource);4023mubuf->operands[1] = vaddr;4024mubuf->operands[2] = soffset;4025mubuf->offen = (offset.type() == RegType::vgpr);4026mubuf->glc = info.glc;4027mubuf->dlc = info.glc && bld.program->chip_class >= GFX10;4028mubuf->slc = info.slc;4029mubuf->sync = info.sync;4030mubuf->offset = const_offset;4031mubuf->swizzled = info.swizzle_component_size != 0;4032RegClass rc = RegClass::get(RegType::vgpr, bytes_size);4033Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);4034mubuf->definitions[0] = Definition(val);4035bld.insert(std::move(mubuf));40364037return val;4038}40394040const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};4041const EmitLoadParameters scratch_load_params{mubuf_load_callback, false, true, 4096};40424043Temp4044get_gfx6_global_rsrc(Builder& bld, Temp addr)4045{4046uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |4047S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);40484049if (addr.type() == RegType::vgpr)4050return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),4051Operand::c32(-1u), Operand::c32(rsrc_conf));4052return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u),4053Operand::c32(rsrc_conf));4054}40554056Temp4057global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,4058unsigned align_, unsigned const_offset, Temp dst_hint)4059{4060unsigned bytes_size = 0;4061bool use_mubuf = bld.program->chip_class == GFX6;4062bool global = bld.program->chip_class >= GFX9;4063aco_opcode op;4064if (bytes_needed == 1) {4065bytes_size = 1;4066op = use_mubuf ? aco_opcode::buffer_load_ubyte4067: global ? aco_opcode::global_load_ubyte4068: aco_opcode::flat_load_ubyte;4069} else if (bytes_needed == 2) {4070bytes_size = 2;4071op = use_mubuf ? aco_opcode::buffer_load_ushort4072: global ? aco_opcode::global_load_ushort4073: aco_opcode::flat_load_ushort;4074} else if (bytes_needed <= 4) {4075bytes_size = 4;4076op = use_mubuf ? aco_opcode::buffer_load_dword4077: global ? aco_opcode::global_load_dword4078: aco_opcode::flat_load_dword;4079} else if (bytes_needed <= 8) {4080bytes_size = 8;4081op = use_mubuf ? aco_opcode::buffer_load_dwordx24082: global ? aco_opcode::global_load_dwordx24083: aco_opcode::flat_load_dwordx2;4084} else if (bytes_needed <= 12 && !use_mubuf) {4085bytes_size = 12;4086op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;4087} else {4088bytes_size = 16;4089op = use_mubuf ? aco_opcode::buffer_load_dwordx44090: global ? aco_opcode::global_load_dwordx44091: aco_opcode::flat_load_dwordx4;4092}4093RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));4094Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);4095if (use_mubuf) {4096aco_ptr<MUBUF_instruction> mubuf{4097create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};4098mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));4099mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);4100mubuf->operands[2] = Operand::zero();4101mubuf->glc = info.glc;4102mubuf->dlc = false;4103mubuf->offset = 0;4104mubuf->addr64 = offset.type() == RegType::vgpr;4105mubuf->disable_wqm = false;4106mubuf->sync = info.sync;4107mubuf->definitions[0] = Definition(val);4108bld.insert(std::move(mubuf));4109} else {4110offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;41114112aco_ptr<FLAT_instruction> flat{4113create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};4114flat->operands[0] = Operand(offset);4115flat->operands[1] = Operand(s1);4116flat->glc = info.glc;4117flat->dlc = info.glc && bld.program->chip_class >= GFX10;4118flat->sync = info.sync;4119flat->offset = 0u;4120flat->definitions[0] = Definition(val);4121bld.insert(std::move(flat));4122}41234124return val;4125}41264127const EmitLoadParameters global_load_params{global_load_callback, true, true, 1};41284129Temp4130load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,4131Temp address, unsigned base_offset, unsigned align)4132{4133assert(util_is_power_of_two_nonzero(align));41344135Builder bld(ctx->program, ctx->block);41364137LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};4138info.align_mul = align;4139info.align_offset = 0;4140info.sync = memory_sync_info(storage_shared);4141info.const_offset = base_offset;4142emit_load(ctx, bld, info, lds_load_params);41434144return dst;4145}41464147void4148split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,4149Temp src)4150{4151if (!count)4152return;41534154Builder bld(ctx->program, ctx->block);41554156/* count == 1 fast path */4157if (count == 1) {4158if (dst_type == RegType::sgpr)4159dst[0] = bld.as_uniform(src);4160else4161dst[0] = as_vgpr(ctx, src);4162return;4163}41644165/* elem_size_bytes is the greatest common divisor which is a power of 2 */4166unsigned elem_size_bytes =41671u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);41684169ASSERTED bool is_subdword = elem_size_bytes < 4;4170assert(!is_subdword || dst_type == RegType::vgpr);41714172for (unsigned i = 0; i < count; i++)4173dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));41744175std::vector<Temp> temps;4176/* use allocated_vec if possible */4177auto it = ctx->allocated_vec.find(src.id());4178if (it != ctx->allocated_vec.end()) {4179if (!it->second[0].id())4180goto split;4181unsigned elem_size = it->second[0].bytes();4182assert(src.bytes() % elem_size == 0);41834184for (unsigned i = 0; i < src.bytes() / elem_size; i++) {4185if (!it->second[i].id())4186goto split;4187}4188if (elem_size_bytes % elem_size)4189goto split;41904191temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);4192elem_size_bytes = elem_size;4193}41944195split:4196/* split src if necessary */4197if (temps.empty()) {4198if (is_subdword && src.type() == RegType::sgpr)4199src = as_vgpr(ctx, src);4200if (dst_type == RegType::sgpr)4201src = bld.as_uniform(src);42024203unsigned num_elems = src.bytes() / elem_size_bytes;4204aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(4205aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};4206split->operands[0] = Operand(src);4207for (unsigned i = 0; i < num_elems; i++) {4208temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));4209split->definitions[i] = Definition(temps.back());4210}4211bld.insert(std::move(split));4212}42134214unsigned idx = 0;4215for (unsigned i = 0; i < count; i++) {4216unsigned op_count = dst[i].bytes() / elem_size_bytes;4217if (op_count == 1) {4218if (dst_type == RegType::sgpr)4219dst[i] = bld.as_uniform(temps[idx++]);4220else4221dst[i] = as_vgpr(ctx, temps[idx++]);4222continue;4223}42244225aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,4226Format::PSEUDO, op_count, 1)};4227for (unsigned j = 0; j < op_count; j++) {4228Temp tmp = temps[idx++];4229if (dst_type == RegType::sgpr)4230tmp = bld.as_uniform(tmp);4231vec->operands[j] = Operand(tmp);4232}4233vec->definitions[0] = Definition(dst[i]);4234bld.insert(std::move(vec));4235}4236return;4237}42384239bool4240scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)4241{4242unsigned start_elem = ffs(todo_mask) - 1;4243bool skip = !(mask & (1 << start_elem));4244if (skip)4245mask = ~mask & todo_mask;42464247mask &= todo_mask;42484249u_bit_scan_consecutive_range(&mask, start, count);42504251return !skip;4252}42534254void4255advance_write_mask(uint32_t* todo_mask, int start, int count)4256{4257*todo_mask &= ~u_bit_consecutive(0, count) << start;4258}42594260void4261store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,4262unsigned base_offset, unsigned align)4263{4264assert(util_is_power_of_two_nonzero(align));4265assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);42664267Builder bld(ctx->program, ctx->block);4268bool large_ds_write = ctx->options->chip_class >= GFX7;4269bool usable_write2 = ctx->options->chip_class >= GFX7;42704271unsigned write_count = 0;4272Temp write_datas[32];4273unsigned offsets[32];4274unsigned bytes[32];4275aco_opcode opcodes[32];42764277wrmask = widen_mask(wrmask, elem_size_bytes);42784279uint32_t todo = u_bit_consecutive(0, data.bytes());4280while (todo) {4281int offset, byte;4282if (!scan_write_mask(wrmask, todo, &offset, &byte)) {4283offsets[write_count] = offset;4284bytes[write_count] = byte;4285opcodes[write_count] = aco_opcode::num_opcodes;4286write_count++;4287advance_write_mask(&todo, offset, byte);4288continue;4289}42904291bool aligned2 = offset % 2 == 0 && align % 2 == 0;4292bool aligned4 = offset % 4 == 0 && align % 4 == 0;4293bool aligned8 = offset % 8 == 0 && align % 8 == 0;4294bool aligned16 = offset % 16 == 0 && align % 16 == 0;42954296// TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial4297aco_opcode op = aco_opcode::num_opcodes;4298if (byte >= 16 && aligned16 && large_ds_write) {4299op = aco_opcode::ds_write_b128;4300byte = 16;4301} else if (byte >= 12 && aligned16 && large_ds_write) {4302op = aco_opcode::ds_write_b96;4303byte = 12;4304} else if (byte >= 8 && aligned8) {4305op = aco_opcode::ds_write_b64;4306byte = 8;4307} else if (byte >= 4 && aligned4) {4308op = aco_opcode::ds_write_b32;4309byte = 4;4310} else if (byte >= 2 && aligned2) {4311op = aco_opcode::ds_write_b16;4312byte = 2;4313} else if (byte >= 1) {4314op = aco_opcode::ds_write_b8;4315byte = 1;4316} else {4317assert(false);4318}43194320offsets[write_count] = offset;4321bytes[write_count] = byte;4322opcodes[write_count] = op;4323write_count++;4324advance_write_mask(&todo, offset, byte);4325}43264327Operand m = load_lds_size_m0(bld);43284329split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);43304331for (unsigned i = 0; i < write_count; i++) {4332aco_opcode op = opcodes[i];4333if (op == aco_opcode::num_opcodes)4334continue;43354336Temp split_data = write_datas[i];43374338unsigned second = write_count;4339if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {4340for (second = i + 1; second < write_count; second++) {4341if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {4342op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;4343opcodes[second] = aco_opcode::num_opcodes;4344break;4345}4346}4347}43484349bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;4350unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();43514352unsigned inline_offset = base_offset + offsets[i];4353unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;4354Temp address_offset = address;4355if (inline_offset > max_offset) {4356address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);4357inline_offset = offsets[i];4358}43594360/* offsets[i] shouldn't be large enough for this to happen */4361assert(inline_offset <= max_offset);43624363Instruction* instr;4364if (write2) {4365Temp second_data = write_datas[second];4366inline_offset /= split_data.bytes();4367instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,4368inline_offset + write2_off);4369} else {4370instr = bld.ds(op, address_offset, split_data, m, inline_offset);4371}4372instr->ds().sync = memory_sync_info(storage_shared);4373}4374}43754376aco_opcode4377get_buffer_store_op(unsigned bytes)4378{4379switch (bytes) {4380case 1: return aco_opcode::buffer_store_byte;4381case 2: return aco_opcode::buffer_store_short;4382case 4: return aco_opcode::buffer_store_dword;4383case 8: return aco_opcode::buffer_store_dwordx2;4384case 12: return aco_opcode::buffer_store_dwordx3;4385case 16: return aco_opcode::buffer_store_dwordx4;4386}4387unreachable("Unexpected store size");4388return aco_opcode::num_opcodes;4389}43904391void4392split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,4393Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,4394Temp* write_datas, unsigned* offsets)4395{4396unsigned write_count_with_skips = 0;4397bool skips[16];4398unsigned bytes[16];43994400/* determine how to split the data */4401unsigned todo = u_bit_consecutive(0, data.bytes());4402while (todo) {4403int offset, byte;4404skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);4405offsets[write_count_with_skips] = offset;4406if (skips[write_count_with_skips]) {4407bytes[write_count_with_skips] = byte;4408advance_write_mask(&todo, offset, byte);4409write_count_with_skips++;4410continue;4411}44124413/* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be4414* larger than swizzle_element_size */4415byte = MIN2(byte, swizzle_element_size);4416if (byte % 4)4417byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);44184419/* SMEM and GFX6 VMEM can't emit 12-byte stores */4420if ((ctx->program->chip_class == GFX6 || smem) && byte == 12)4421byte = 8;44224423/* dword or larger stores have to be dword-aligned */4424unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;4425unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;4426bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;4427if (!dword_aligned)4428byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);44294430bytes[write_count_with_skips] = byte;4431advance_write_mask(&todo, offset, byte);4432write_count_with_skips++;4433}44344435/* actually split data */4436split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);44374438/* remove skips */4439for (unsigned i = 0; i < write_count_with_skips; i++) {4440if (skips[i])4441continue;4442write_datas[*write_count] = write_datas[i];4443offsets[*write_count] = offsets[i];4444(*write_count)++;4445}4446}44474448Temp4449create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,4450unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())4451{4452Builder bld(ctx->program, ctx->block);4453unsigned dword_size = elem_size_bytes / 4;44544455if (!dst.id())4456dst = bld.tmp(RegClass(reg_type, cnt * dword_size));44574458std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;4459aco_ptr<Pseudo_instruction> instr{4460create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};4461instr->definitions[0] = Definition(dst);44624463for (unsigned i = 0; i < cnt; ++i) {4464if (arr[i].id()) {4465assert(arr[i].size() == dword_size);4466allocated_vec[i] = arr[i];4467instr->operands[i] = Operand(arr[i]);4468} else {4469Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),4470Operand::zero(dword_size == 2 ? 8 : 4));4471allocated_vec[i] = zero;4472instr->operands[i] = Operand(zero);4473}4474}44754476bld.insert(std::move(instr));44774478if (split_cnt)4479emit_split_vector(ctx, dst, split_cnt);4480else4481ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */44824483return dst;4484}44854486inline unsigned4487resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)4488{4489if (const_offset >= 4096) {4490unsigned excess_const_offset = const_offset / 4096u * 4096u;4491const_offset %= 4096u;44924493if (!voffset.id())4494voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));4495else if (unlikely(voffset.regClass() == s1))4496voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),4497Operand::c32(excess_const_offset), Operand(voffset));4498else if (likely(voffset.regClass() == v1))4499voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));4500else4501unreachable("Unsupported register class of voffset");4502}45034504return const_offset;4505}45064507void4508emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,4509unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(),4510bool slc = false, bool swizzled = false)4511{4512assert(vdata.id());4513assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);4514assert(vdata.size() >= 1 && vdata.size() <= 4);45154516Builder bld(ctx->program, ctx->block);4517aco_opcode op = get_buffer_store_op(vdata.bytes());4518const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);45194520Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);4521Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();4522Builder::Result r =4523bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,4524/* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,4525/* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,4526/* dlc*/ false, /* slc */ slc);45274528r.instr->mubuf().sync = sync;4529}45304531void4532store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,4533unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,4534bool allow_combining = true, memory_sync_info sync = memory_sync_info(),4535bool slc = false)4536{4537Builder bld(ctx->program, ctx->block);4538assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);4539assert(write_mask);4540write_mask = widen_mask(write_mask, elem_size_bytes);45414542unsigned write_count = 0;4543Temp write_datas[32];4544unsigned offsets[32];4545split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, allow_combining ? 16 : 4,4546&write_count, write_datas, offsets);45474548for (unsigned i = 0; i < write_count; i++) {4549unsigned const_offset = offsets[i] + base_const_offset;4550emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync,4551slc, !allow_combining);4552}4553}45544555void4556load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,4557unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,4558unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true,4559bool slc = false)4560{4561assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);4562assert((num_components * elem_size_bytes) == dst.bytes());4563assert(!!stride != allow_combining);45644565Builder bld(ctx->program, ctx->block);45664567LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};4568info.component_stride = allow_combining ? 0 : stride;4569info.glc = true;4570info.slc = slc;4571info.swizzle_component_size = allow_combining ? 0 : 4;4572info.align_mul = MIN2(elem_size_bytes, 4);4573info.align_offset = 0;4574info.soffset = soffset;4575info.const_offset = base_const_offset;4576emit_load(ctx, bld, info, mubuf_load_params);4577}45784579Temp4580wave_id_in_threadgroup(isel_context* ctx)4581{4582Builder bld(ctx->program, ctx->block);4583return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),4584get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(24u | (4u << 16)));4585}45864587Temp4588thread_id_in_threadgroup(isel_context* ctx)4589{4590/* tid_in_tg = wave_id * wave_size + tid_in_wave */45914592Builder bld(ctx->program, ctx->block);4593Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));45944595if (ctx->program->workgroup_size <= ctx->program->wave_size)4596return tid_in_wave;45974598Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);4599Temp num_pre_threads =4600bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,4601Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));4602return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));4603}46044605Temp4606get_tess_rel_patch_id(isel_context* ctx)4607{4608Builder bld(ctx->program, ctx->block);46094610switch (ctx->shader->info.stage) {4611case MESA_SHADER_TESS_CTRL:4612return bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),4613Operand::zero(), Operand::c32(8u), Operand::zero());4614case MESA_SHADER_TESS_EVAL: return get_arg(ctx, ctx->args->ac.tes_rel_patch_id);4615default: unreachable("Unsupported stage in get_tess_rel_patch_id");4616}4617}46184619bool4620store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)4621{4622unsigned write_mask = nir_intrinsic_write_mask(instr);4623unsigned component = nir_intrinsic_component(instr);4624unsigned idx = nir_intrinsic_base(instr) * 4u + component;4625nir_src offset = *nir_get_io_offset_src(instr);46264627if (!nir_src_is_const(offset) || nir_src_as_uint(offset))4628return false;46294630Temp src = get_ssa_temp(ctx, instr->src[0].ssa);46314632if (instr->src[0].ssa->bit_size == 64)4633write_mask = widen_mask(write_mask, 2);46344635RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;46364637for (unsigned i = 0; i < 8; ++i) {4638if (write_mask & (1 << i)) {4639ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);4640ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);4641}4642idx++;4643}46444645return true;4646}46474648bool4649load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)4650{4651/* Only TCS per-vertex inputs are supported by this function.4652* Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations4653* is the same.4654*/4655if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)4656return false;46574658nir_src* off_src = nir_get_io_offset_src(instr);4659nir_src* vertex_index_src = nir_get_io_vertex_index_src(instr);4660nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;4661bool can_use_temps =4662nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&4663nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;46644665if (!can_use_temps)4666return false;46674668unsigned idx = nir_intrinsic_base(instr) * 4u + nir_intrinsic_component(instr) +46694 * nir_src_as_uint(*off_src);4670Temp* src = &ctx->inputs.temps[idx];4671create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);46724673return true;4674}46754676static void export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos);46774678void4679visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)4680{4681if (ctx->stage == vertex_vs || ctx->stage == tess_eval_vs || ctx->stage == fragment_fs ||4682ctx->stage == vertex_ngg || ctx->stage == tess_eval_ngg ||4683(ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||4684ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {4685bool stored_to_temps = store_output_to_temps(ctx, instr);4686if (!stored_to_temps) {4687isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");4688abort();4689}4690} else {4691unreachable("Shader stage not implemented");4692}46934694/* For NGG VS and TES shaders the primitive ID is exported manually after the other exports so we4695* have to emit an exp here manually */4696if (ctx->stage.hw == HWStage::NGG &&4697(ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) &&4698nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PRIMITIVE_ID)4699export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, NULL);4700}47014702void4703emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,4704Temp prim_mask)4705{4706Temp coord1 = emit_extract_vector(ctx, src, 0, v1);4707Temp coord2 = emit_extract_vector(ctx, src, 1, v1);47084709Builder bld(ctx->program, ctx->block);47104711if (dst.regClass() == v2b) {4712if (ctx->program->dev.has_16bank_lds) {4713assert(ctx->options->chip_class <= GFX8);4714Builder::Result interp_p1 =4715bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,4716bld.m0(prim_mask), idx, component);4717interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1,4718bld.m0(prim_mask), interp_p1, idx, component);4719bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),4720interp_p1, idx, component);4721} else {4722aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;47234724if (ctx->options->chip_class == GFX8)4725interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;47264727Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,4728bld.m0(prim_mask), idx, component);4729bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,4730component);4731}4732} else {4733Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,4734bld.m0(prim_mask), idx, component);47354736if (ctx->program->dev.has_16bank_lds)4737interp_p1.instr->operands[0].setLateKill(true);47384739bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,4740idx, component);4741}4742}47434744void4745emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)4746{4747Builder bld(ctx->program, ctx->block);47484749aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(4750aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));4751for (unsigned i = 0; i < num_components; i++)4752vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));4753if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {4754assert(num_components == 4);4755vec->operands[3] =4756bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));4757}47584759if (ctx->options->adjust_frag_coord_z &&4760G_0286CC_POS_Z_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {4761/* Adjust gl_FragCoord.z for VRS due to a hw bug on some GFX10.3 chips. */4762Operand frag_z = vec->operands[2];4763Temp adjusted_frag_z = bld.tmp(v1);4764Temp tmp;47654766/* dFdx fine */4767Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), frag_z, dpp_quad_perm(0, 0, 2, 2));4768tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), frag_z, tl, dpp_quad_perm(1, 1, 3, 3));4769emit_wqm(bld, tmp, adjusted_frag_z, true);47704771/* adjusted_frag_z * 0.0625 + frag_z */4772adjusted_frag_z = bld.vop3(aco_opcode::v_fma_f32, bld.def(v1), adjusted_frag_z,4773Operand::c32(0x3d800000u /* 0.0625 */), frag_z);47744775/* VRS Rate X = Ancillary[2:3] */4776Temp x_rate =4777bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),4778Operand::c32(2u), Operand::c32(2u));47794780/* xRate = xRate == 0x1 ? adjusted_frag_z : frag_z. */4781Temp cond =4782bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));4783vec->operands[2] =4784bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), frag_z, adjusted_frag_z, cond);4785}47864787for (Operand& op : vec->operands)4788op = op.isUndefined() ? Operand::zero() : op;47894790vec->definitions[0] = Definition(dst);4791ctx->block->instructions.emplace_back(std::move(vec));4792emit_split_vector(ctx, dst, num_components);4793return;4794}47954796void4797emit_load_frag_shading_rate(isel_context* ctx, Temp dst)4798{4799Builder bld(ctx->program, ctx->block);4800Temp cond;48014802/* VRS Rate X = Ancillary[2:3]4803* VRS Rate Y = Ancillary[4:5]4804*/4805Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),4806Operand::c32(2u), Operand::c32(2u));4807Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),4808Operand::c32(4u), Operand::c32(2u));48094810/* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */4811cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));4812x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),4813bld.copy(bld.def(v1), Operand::c32(4u)), cond);48144815/* yRate = yRate == 0x1 ? Vertical2Pixels : None. */4816cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));4817y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),4818bld.copy(bld.def(v1), Operand::c32(1u)), cond);48194820bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));4821}48224823void4824visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)4825{4826Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);4827Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);4828unsigned idx = nir_intrinsic_base(instr);4829unsigned component = nir_intrinsic_component(instr);4830Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);48314832assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));48334834if (instr->dest.ssa.num_components == 1) {4835emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);4836} else {4837aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(4838aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));4839for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) {4840Temp tmp = ctx->program->allocateTmp(v1);4841emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);4842vec->operands[i] = Operand(tmp);4843}4844vec->definitions[0] = Definition(dst);4845ctx->block->instructions.emplace_back(std::move(vec));4846}4847}48484849bool4850check_vertex_fetch_size(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,4851unsigned binding_align, unsigned channels)4852{4853unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;4854if (vtx_info->chan_byte_size != 4 && channels == 3)4855return false;48564857/* Split typed vertex buffer loads on GFX6 and GFX10+ to avoid any4858* alignment issues that triggers memory violations and eventually a GPU4859* hang. This can happen if the stride (static or dynamic) is unaligned and4860* also if the VBO offset is aligned to a scalar (eg. stride is 8 and VBO4861* offset is 2 for R16G16B16A16_SNORM).4862*/4863return (ctx->options->chip_class >= GFX7 && ctx->options->chip_class <= GFX9) ||4864(offset % vertex_byte_size == 0 && MAX2(binding_align, 1) % vertex_byte_size == 0);4865}48664867uint8_t4868get_fetch_data_format(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,4869unsigned* channels, unsigned max_channels, unsigned binding_align)4870{4871if (!vtx_info->chan_byte_size) {4872*channels = vtx_info->num_channels;4873return vtx_info->chan_format;4874}48754876unsigned num_channels = *channels;4877if (!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, *channels)) {4878unsigned new_channels = num_channels + 1;4879/* first, assume more loads is worse and try using a larger data format */4880while (new_channels <= max_channels &&4881!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) {4882new_channels++;4883}48844885if (new_channels > max_channels) {4886/* then try decreasing load size (at the cost of more loads) */4887new_channels = *channels;4888while (new_channels > 1 &&4889!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels))4890new_channels--;4891}48924893if (new_channels < *channels)4894*channels = new_channels;4895num_channels = new_channels;4896}48974898switch (vtx_info->chan_format) {4899case V_008F0C_BUF_DATA_FORMAT_8:4900return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,4901V_008F0C_BUF_DATA_FORMAT_INVALID,4902V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];4903case V_008F0C_BUF_DATA_FORMAT_16:4904return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,4905V_008F0C_BUF_DATA_FORMAT_INVALID,4906V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];4907case V_008F0C_BUF_DATA_FORMAT_32:4908return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,4909V_008F0C_BUF_DATA_FORMAT_32_32_32,4910V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];4911}4912unreachable("shouldn't reach here");4913return V_008F0C_BUF_DATA_FORMAT_INVALID;4914}49154916/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.4917* so we may need to fix it up. */4918Temp4919adjust_vertex_fetch_alpha(isel_context* ctx, unsigned adjustment, Temp alpha)4920{4921Builder bld(ctx->program, ctx->block);49224923if (adjustment == AC_FETCH_FORMAT_SSCALED)4924alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);49254926/* For the integer-like cases, do a natural sign extension.4927*4928* For the SNORM case, the values are 0.0, 0.333, 0.666, 1.04929* and happen to contain 0, 1, 2, 3 as the two LSBs of the4930* exponent.4931*/4932unsigned offset = adjustment == AC_FETCH_FORMAT_SNORM ? 23u : 0u;4933alpha =4934bld.vop3(aco_opcode::v_bfe_i32, bld.def(v1), alpha, Operand::c32(offset), Operand::c32(2u));49354936/* Convert back to the right type. */4937if (adjustment == AC_FETCH_FORMAT_SNORM) {4938alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);4939alpha = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::c32(0xbf800000u), alpha);4940} else if (adjustment == AC_FETCH_FORMAT_SSCALED) {4941alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);4942}49434944return alpha;4945}49464947void4948visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)4949{4950Builder bld(ctx->program, ctx->block);4951Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);4952nir_src offset = *nir_get_io_offset_src(instr);49534954if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {49554956if (!nir_src_is_const(offset) || nir_src_as_uint(offset))4957isel_err(offset.ssa->parent_instr,4958"Unimplemented non-zero nir_intrinsic_load_input offset");49594960Temp vertex_buffers =4961convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers));49624963unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;4964unsigned component = nir_intrinsic_component(instr);4965unsigned bitsize = instr->dest.ssa.bit_size;4966unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];4967uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];4968uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];4969unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];4970unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding];4971enum ac_fetch_format alpha_adjust = ctx->options->key.vs.alpha_adjust[location];49724973unsigned dfmt = attrib_format & 0xf;4974unsigned nfmt = (attrib_format >> 4) & 0x7;4975const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);49764977unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;4978unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);4979bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);4980if (post_shuffle)4981num_channels = MAX2(num_channels, 3);49824983unsigned desc_index =4984ctx->program->info->vs.use_per_attribute_vb_descs ? location : attrib_binding;4985desc_index = util_bitcount(ctx->program->info->vs.vb_desc_usage_mask &4986u_bit_consecutive(0, desc_index));4987Operand off = bld.copy(bld.def(s1), Operand::c32(desc_index * 16u));4988Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);49894990Temp index;4991if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {4992uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];4993Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);4994if (divisor) {4995Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);4996if (divisor != 1) {4997Temp divided = bld.tmp(v1);4998emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);4999index = bld.vadd32(bld.def(v1), start_instance, divided);5000} else {5001index = bld.vadd32(bld.def(v1), start_instance, instance_id);5002}5003} else {5004index = bld.copy(bld.def(v1), start_instance);5005}5006} else {5007index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.base_vertex),5008get_arg(ctx, ctx->args->ac.vertex_id));5009}50105011Temp* const channels = (Temp*)alloca(num_channels * sizeof(Temp));5012unsigned channel_start = 0;5013bool direct_fetch = false;50145015/* skip unused channels at the start */5016if (vtx_info->chan_byte_size && !post_shuffle) {5017channel_start = ffs(mask) - 1;5018for (unsigned i = 0; i < MIN2(channel_start, num_channels); i++)5019channels[i] = Temp(0, s1);5020} else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {5021num_channels = 3 - (ffs(mask) - 1);5022}50235024/* load channels */5025while (channel_start < num_channels) {5026unsigned fetch_component = num_channels - channel_start;5027unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;5028bool expanded = false;50295030/* use MUBUF when possible to avoid possible alignment issues */5031/* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */5032bool use_mubuf =5033(nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||5034nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&5035vtx_info->chan_byte_size == 4;5036unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;5037if (!use_mubuf) {5038fetch_dfmt =5039get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component,5040vtx_info->num_channels - channel_start, binding_align);5041} else {5042if (fetch_component == 3 && ctx->options->chip_class == GFX6) {5043/* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */5044fetch_component = 4;5045expanded = true;5046}5047}50485049unsigned fetch_bytes = fetch_component * bitsize / 8;50505051Temp fetch_index = index;5052if (attrib_stride != 0 && fetch_offset > attrib_stride) {5053fetch_index =5054bld.vadd32(bld.def(v1), Operand::c32(fetch_offset / attrib_stride), fetch_index);5055fetch_offset = fetch_offset % attrib_stride;5056}50575058Operand soffset = Operand::zero();5059if (fetch_offset >= 4096) {5060soffset = bld.copy(bld.def(s1), Operand::c32(fetch_offset / 4096 * 4096));5061fetch_offset %= 4096;5062}50635064aco_opcode opcode;5065switch (fetch_bytes) {5066case 2:5067assert(!use_mubuf && bitsize == 16);5068opcode = aco_opcode::tbuffer_load_format_d16_x;5069break;5070case 4:5071if (bitsize == 16) {5072assert(!use_mubuf);5073opcode = aco_opcode::tbuffer_load_format_d16_xy;5074} else {5075opcode =5076use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;5077}5078break;5079case 6:5080assert(!use_mubuf && bitsize == 16);5081opcode = aco_opcode::tbuffer_load_format_d16_xyz;5082break;5083case 8:5084if (bitsize == 16) {5085assert(!use_mubuf);5086opcode = aco_opcode::tbuffer_load_format_d16_xyzw;5087} else {5088opcode =5089use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;5090}5091break;5092case 12:5093assert(ctx->options->chip_class >= GFX7 ||5094(!use_mubuf && ctx->options->chip_class == GFX6));5095opcode =5096use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;5097break;5098case 16:5099opcode =5100use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;5101break;5102default: unreachable("Unimplemented load_input vector size");5103}51045105Temp fetch_dst;5106if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded &&5107(alpha_adjust == AC_FETCH_FORMAT_NONE || num_channels <= 3)) {5108direct_fetch = true;5109fetch_dst = dst;5110} else {5111fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));5112}51135114if (use_mubuf) {5115Instruction* mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index,5116soffset, fetch_offset, false, false, true)5117.instr;5118mubuf->mubuf().vtx_binding = attrib_binding + 1;5119} else {5120Instruction* mtbuf = bld.mtbuf(opcode, Definition(fetch_dst), list, fetch_index,5121soffset, fetch_dfmt, nfmt, fetch_offset, false, true)5122.instr;5123mtbuf->mtbuf().vtx_binding = attrib_binding + 1;5124}51255126emit_split_vector(ctx, fetch_dst, fetch_dst.size());51275128if (fetch_component == 1) {5129channels[channel_start] = fetch_dst;5130} else {5131for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)5132channels[channel_start + i] =5133emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1);5134}51355136channel_start += fetch_component;5137}51385139if (!direct_fetch) {5140bool is_float =5141nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;51425143static const unsigned swizzle_normal[4] = {0, 1, 2, 3};5144static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};5145const unsigned* swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;5146unsigned num_components = instr->dest.ssa.num_components;51475148aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(5149aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};5150std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;5151unsigned num_temp = 0;5152for (unsigned i = 0; i < num_components; i++) {5153unsigned idx = i + component;5154if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {5155Temp channel = channels[swizzle[idx]];5156if (idx == 3 && alpha_adjust != AC_FETCH_FORMAT_NONE)5157channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);5158vec->operands[i] = Operand(channel);51595160num_temp++;5161elems[i] = channel;5162} else if (is_float && idx == 3) {5163vec->operands[i] = Operand::c32(0x3f800000u);5164} else if (!is_float && idx == 3) {5165vec->operands[i] = Operand::c32(1u);5166} else {5167vec->operands[i] = Operand::zero();5168}5169}5170vec->definitions[0] = Definition(dst);5171ctx->block->instructions.emplace_back(std::move(vec));5172emit_split_vector(ctx, dst, num_components);51735174if (num_temp == num_components)5175ctx->allocated_vec.emplace(dst.id(), elems);5176}5177} else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {5178if (!nir_src_is_const(offset) || nir_src_as_uint(offset))5179isel_err(offset.ssa->parent_instr,5180"Unimplemented non-zero nir_intrinsic_load_input offset");51815182Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);51835184unsigned idx = nir_intrinsic_base(instr);5185unsigned component = nir_intrinsic_component(instr);5186unsigned vertex_id = 2; /* P0 */51875188if (instr->intrinsic == nir_intrinsic_load_input_vertex) {5189nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);5190switch (src0->u32) {5191case 0:5192vertex_id = 2; /* P0 */5193break;5194case 1:5195vertex_id = 0; /* P10 */5196break;5197case 2:5198vertex_id = 1; /* P20 */5199break;5200default: unreachable("invalid vertex index");5201}5202}52035204if (dst.size() == 1) {5205bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id),5206bld.m0(prim_mask), idx, component);5207} else {5208aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(5209aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};5210for (unsigned i = 0; i < dst.size(); i++)5211vec->operands[i] =5212bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(vertex_id),5213bld.m0(prim_mask), idx, component + i);5214vec->definitions[0] = Definition(dst);5215bld.insert(std::move(vec));5216}5217} else {5218unreachable("Shader stage not implemented");5219}5220}52215222void5223visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)5224{5225assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);52265227Builder bld(ctx->program, ctx->block);5228Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);52295230if (load_input_from_temps(ctx, instr, dst))5231return;52325233unreachable("LDS-based TCS input should have been lowered in NIR.");5234}52355236void5237visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)5238{5239switch (ctx->shader->info.stage) {5240case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;5241default: unreachable("Unimplemented shader stage");5242}5243}52445245void5246visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)5247{5248assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);52495250Builder bld(ctx->program, ctx->block);5251Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);52525253Operand tes_u(get_arg(ctx, ctx->args->ac.tes_u));5254Operand tes_v(get_arg(ctx, ctx->args->ac.tes_v));5255Operand tes_w = Operand::zero();52565257if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {5258Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);5259tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);5260tes_w = Operand(tmp);5261}52625263Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);5264emit_split_vector(ctx, tess_coord, 3);5265}52665267Temp5268load_desc_ptr(isel_context* ctx, unsigned desc_set)5269{5270if (ctx->program->info->need_indirect_descriptor_sets) {5271Builder bld(ctx->program, ctx->block);5272Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));5273Operand off = bld.copy(bld.def(s1), Operand::c32(desc_set << 2));5274return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off); //, false, false, false);5275}52765277return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);5278}52795280void5281visit_load_resource(isel_context* ctx, nir_intrinsic_instr* instr)5282{5283Builder bld(ctx->program, ctx->block);5284Temp index = get_ssa_temp(ctx, instr->src[0].ssa);5285if (!nir_dest_is_divergent(instr->dest))5286index = bld.as_uniform(index);5287unsigned desc_set = nir_intrinsic_desc_set(instr);5288unsigned binding = nir_intrinsic_binding(instr);52895290Temp desc_ptr;5291radv_pipeline_layout* pipeline_layout = ctx->options->layout;5292radv_descriptor_set_layout* layout = pipeline_layout->set[desc_set].layout;5293unsigned offset = layout->binding[binding].offset;5294unsigned stride;5295if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||5296layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {5297unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +5298layout->binding[binding].dynamic_offset_offset;5299desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);5300offset = pipeline_layout->push_constant_size + 16 * idx;5301stride = 16;5302} else {5303desc_ptr = load_desc_ptr(ctx, desc_set);5304stride = layout->binding[binding].size;5305}53065307if (nir_src_is_const(instr->src[0])) {5308index =5309bld.copy(bld.def(s1), Operand::c32((offset + nir_src_as_uint(instr->src[0]) * stride)));5310} else if (index.type() == RegType::vgpr) {5311if (stride != 1) {5312bool index24bit = layout->binding[binding].array_size <= 0x1000000;5313index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);5314}5315if (offset)5316index = bld.vadd32(bld.def(v1), Operand::c32(offset), index);5317} else {5318if (stride != 1)5319index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index);5320if (offset)5321index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),5322Operand::c32(offset), index);5323}53245325Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);5326std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;5327elems[0] = desc_ptr;5328elems[1] = index;5329ctx->allocated_vec.emplace(dst.id(), elems);5330bld.pseudo(aco_opcode::p_create_vector, Definition(dst), desc_ptr, index, Operand::zero());5331}53325333void5334load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,5335Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false,5336bool allow_smem = true, memory_sync_info sync = memory_sync_info())5337{5338Builder bld(ctx->program, ctx->block);53395340bool use_smem =5341dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;5342if (use_smem)5343offset = bld.as_uniform(offset);5344else {5345/* GFX6-7 are affected by a hw bug that prevents address clamping to5346* work correctly when the SGPR offset is used.5347*/5348if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)5349offset = as_vgpr(ctx, offset);5350}53515352LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};5353info.glc = glc;5354info.sync = sync;5355info.align_mul = align_mul;5356info.align_offset = align_offset;5357if (use_smem)5358emit_load(ctx, bld, info, smem_load_params);5359else5360emit_load(ctx, bld, info, mubuf_load_params);5361}53625363Temp5364load_buffer_rsrc(isel_context* ctx, Temp rsrc)5365{5366Builder bld(ctx->program, ctx->block);5367Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));5368Temp binding = bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));5369set_ptr = convert_pointer_to_64_bit(ctx, set_ptr);5370return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), set_ptr, binding);5371}53725373bool5374is_inline_ubo(isel_context* ctx, nir_src rsrc)5375{5376nir_binding binding = nir_chase_binding(rsrc);5377if (!binding.success)5378return false;53795380radv_descriptor_set_layout* layout = ctx->options->layout->set[binding.desc_set].layout;5381return layout->binding[binding.binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT;5382}53835384void5385visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)5386{5387Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);5388Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);53895390Builder bld(ctx->program, ctx->block);53915392if (is_inline_ubo(ctx, instr->src[0])) {5393Temp set_ptr = bld.as_uniform(emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1)));5394Temp binding_off =5395bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));5396rsrc = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), set_ptr, binding_off);53975398uint32_t desc_type =5399S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |5400S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);5401if (ctx->options->chip_class >= GFX10) {5402desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |5403S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);5404} else {5405desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |5406S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);5407}5408rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), rsrc,5409Operand::c32(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),5410Operand::c32(0xFFFFFFFFu), Operand::c32(desc_type));5411} else {5412rsrc = load_buffer_rsrc(ctx, rsrc);5413}5414unsigned size = instr->dest.ssa.bit_size / 8;5415load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),5416nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));5417}54185419void5420visit_load_sbt_amd(isel_context* ctx, nir_intrinsic_instr* instr)5421{5422Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);5423Temp index = get_ssa_temp(ctx, instr->src[0].ssa);5424unsigned binding = nir_intrinsic_binding(instr);5425unsigned base = nir_intrinsic_base(instr);54265427index = as_vgpr(ctx, index);54285429Builder bld(ctx->program, ctx->block);5430Temp desc_base = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.sbt_descriptors));5431Operand desc_off = bld.copy(bld.def(s1), Operand::c32(binding * 16u));5432Temp rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), desc_base, desc_off);54335434/* If we want more we need to implement */5435assert(instr->dest.ssa.bit_size == 32);5436assert(instr->num_components == 1);54375438bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst), rsrc, index, Operand::zero(), base,5439false, false, true);5440}54415442void5443visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)5444{5445Builder bld(ctx->program, ctx->block);5446Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);5447unsigned offset = nir_intrinsic_base(instr);5448unsigned count = instr->dest.ssa.num_components;5449nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);54505451if (index_cv && instr->dest.ssa.bit_size == 32) {5452unsigned start = (offset + index_cv->u32) / 4u;5453start -= ctx->args->ac.base_inline_push_consts;5454if (start + count <= ctx->args->ac.num_inline_push_consts) {5455std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;5456aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(5457aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};5458for (unsigned i = 0; i < count; ++i) {5459elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);5460vec->operands[i] = Operand{elems[i]};5461}5462vec->definitions[0] = Definition(dst);5463ctx->block->instructions.emplace_back(std::move(vec));5464ctx->allocated_vec.emplace(dst.id(), elems);5465return;5466}5467}54685469Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));5470if (offset != 0) // TODO check if index != 0 as well5471index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),5472Operand::c32(offset), index);5473Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));5474Temp vec = dst;5475bool trim = false;5476bool aligned = true;54775478if (instr->dest.ssa.bit_size == 8) {5479aligned = index_cv && (offset + index_cv->u32) % 4 == 0;5480bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);5481if (!aligned)5482vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);5483} else if (instr->dest.ssa.bit_size == 16) {5484aligned = index_cv && (offset + index_cv->u32) % 4 == 0;5485if (!aligned)5486vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);5487}54885489aco_opcode op;54905491switch (vec.size()) {5492case 1: op = aco_opcode::s_load_dword; break;5493case 2: op = aco_opcode::s_load_dwordx2; break;5494case 3:5495vec = bld.tmp(s4);5496trim = true;5497FALLTHROUGH;5498case 4: op = aco_opcode::s_load_dwordx4; break;5499case 6:5500vec = bld.tmp(s8);5501trim = true;5502FALLTHROUGH;5503case 8: op = aco_opcode::s_load_dwordx8; break;5504default: unreachable("unimplemented or forbidden load_push_constant.");5505}55065507bld.smem(op, Definition(vec), ptr, index).instr->smem().prevent_overflow = true;55085509if (!aligned) {5510Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index);5511byte_align_scalar(ctx, vec, byte_offset, dst);5512return;5513}55145515if (trim) {5516emit_split_vector(ctx, vec, 4);5517RegClass rc = dst.size() == 3 ? s1 : s2;5518bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc),5519emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc));5520}5521emit_split_vector(ctx, dst, instr->dest.ssa.num_components);5522}55235524void5525visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)5526{5527Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);55285529Builder bld(ctx->program, ctx->block);55305531uint32_t desc_type =5532S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |5533S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);5534if (ctx->options->chip_class >= GFX10) {5535desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |5536S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);5537} else {5538desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |5539S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);5540}55415542unsigned base = nir_intrinsic_base(instr);5543unsigned range = nir_intrinsic_range(instr);55445545Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);5546if (base && offset.type() == RegType::sgpr)5547offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,5548Operand::c32(base));5549else if (base && offset.type() == RegType::vgpr)5550offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);55515552Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),5553bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),5554Operand::c32(ctx->constant_data_offset)),5555Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),5556Operand::c32(desc_type));5557unsigned size = instr->dest.ssa.bit_size / 8;5558// TODO: get alignment information for subdword constants5559load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);5560}55615562void5563visit_discard_if(isel_context* ctx, nir_intrinsic_instr* instr)5564{5565if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)5566ctx->cf_info.exec_potentially_empty_discard = true;55675568ctx->program->needs_exact = true;55695570// TODO: optimize uniform conditions5571Builder bld(ctx->program, ctx->block);5572Temp src = get_ssa_temp(ctx, instr->src[0].ssa);5573assert(src.regClass() == bld.lm);5574src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));5575bld.pseudo(aco_opcode::p_discard_if, src);5576ctx->block->kind |= block_kind_uses_discard_if;5577return;5578}55795580void5581visit_discard(isel_context* ctx, nir_intrinsic_instr* instr)5582{5583Builder bld(ctx->program, ctx->block);55845585if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)5586ctx->cf_info.exec_potentially_empty_discard = true;55875588bool divergent =5589ctx->cf_info.parent_if.is_divergent || ctx->cf_info.parent_loop.has_divergent_continue;55905591if (ctx->block->loop_nest_depth && (nir_instr_is_last(&instr->instr) && !divergent)) {5592/* we handle discards the same way as jump instructions */5593append_logical_end(ctx->block);55945595/* in loops, discard behaves like break */5596Block* linear_target = ctx->cf_info.parent_loop.exit;5597ctx->block->kind |= block_kind_discard;55985599/* uniform discard - loop ends here */5600assert(nir_instr_is_last(&instr->instr));5601ctx->block->kind |= block_kind_uniform;5602ctx->cf_info.has_branch = true;5603bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));5604add_linear_edge(ctx->block->index, linear_target);5605return;5606}56075608/* it can currently happen that NIR doesn't remove the unreachable code */5609if (!nir_instr_is_last(&instr->instr)) {5610ctx->program->needs_exact = true;5611/* save exec somewhere temporarily so that it doesn't get5612* overwritten before the discard from outer exec masks */5613Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc),5614Operand::c32(0xFFFFFFFF), Operand(exec, bld.lm));5615bld.pseudo(aco_opcode::p_discard_if, cond);5616ctx->block->kind |= block_kind_uses_discard_if;5617return;5618}56195620/* This condition is incorrect for uniformly branched discards in a loop5621* predicated by a divergent condition, but the above code catches that case5622* and the discard would end up turning into a discard_if.5623* For example:5624* if (divergent) {5625* while (...) {5626* if (uniform) {5627* discard;5628* }5629* }5630* }5631*/5632if (!ctx->cf_info.parent_if.is_divergent) {5633/* program just ends here */5634ctx->block->kind |= block_kind_uses_discard_if;5635bld.pseudo(aco_opcode::p_discard_if, Operand::c32(0xFFFFFFFFu));5636// TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis5637} else {5638ctx->block->kind |= block_kind_discard;5639/* branch and linear edge is added by visit_if() */5640}5641}56425643enum aco_descriptor_type {5644ACO_DESC_IMAGE,5645ACO_DESC_FMASK,5646ACO_DESC_SAMPLER,5647ACO_DESC_BUFFER,5648ACO_DESC_PLANE_0,5649ACO_DESC_PLANE_1,5650ACO_DESC_PLANE_2,5651};56525653static bool5654should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool is_array)5655{5656if (sampler_dim == GLSL_SAMPLER_DIM_BUF)5657return false;5658ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);5659return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||5660dim == ac_image_2darraymsaa;5661}56625663Temp5664get_sampler_desc(isel_context* ctx, nir_deref_instr* deref_instr,5665enum aco_descriptor_type desc_type, const nir_tex_instr* tex_instr, bool write)5666{5667/* FIXME: we should lower the deref with some new nir_intrinsic_load_desc5668std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type <<566932 | deref_instr->dest.ssa.index); if (it != ctx->tex_desc.end()) return it->second;5670*/5671Temp index = Temp();5672bool index_set = false;5673unsigned constant_index = 0;5674unsigned descriptor_set;5675unsigned base_index;5676Builder bld(ctx->program, ctx->block);56775678if (!deref_instr) {5679assert(tex_instr);5680descriptor_set = 0;5681base_index = tex_instr->sampler_index;5682} else {5683while (deref_instr->deref_type != nir_deref_type_var) {5684unsigned array_size = glsl_get_aoa_size(deref_instr->type);5685if (!array_size)5686array_size = 1;56875688assert(deref_instr->deref_type == nir_deref_type_array);5689nir_const_value* const_value = nir_src_as_const_value(deref_instr->arr.index);5690if (const_value) {5691constant_index += array_size * const_value->u32;5692} else {5693Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);5694if (indirect.type() == RegType::vgpr)5695indirect = bld.as_uniform(indirect);56965697if (array_size != 1)5698indirect =5699bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(array_size), indirect);57005701if (!index_set) {5702index = indirect;5703index_set = true;5704} else {5705index =5706bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);5707}5708}57095710deref_instr = nir_src_as_deref(deref_instr->parent);5711}5712descriptor_set = deref_instr->var->data.descriptor_set;5713base_index = deref_instr->var->data.binding;5714}57155716Temp list = load_desc_ptr(ctx, descriptor_set);5717list = convert_pointer_to_64_bit(ctx, list);57185719struct radv_descriptor_set_layout* layout = ctx->options->layout->set[descriptor_set].layout;5720struct radv_descriptor_set_binding_layout* binding = layout->binding + base_index;5721unsigned offset = binding->offset;5722unsigned stride = binding->size;5723aco_opcode opcode;5724RegClass type;57255726assert(base_index < layout->binding_count);57275728switch (desc_type) {5729case ACO_DESC_IMAGE:5730type = s8;5731opcode = aco_opcode::s_load_dwordx8;5732break;5733case ACO_DESC_FMASK:5734type = s8;5735opcode = aco_opcode::s_load_dwordx8;5736offset += 32;5737break;5738case ACO_DESC_SAMPLER:5739type = s4;5740opcode = aco_opcode::s_load_dwordx4;5741if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)5742offset += radv_combined_image_descriptor_sampler_offset(binding);5743break;5744case ACO_DESC_BUFFER:5745type = s4;5746opcode = aco_opcode::s_load_dwordx4;5747break;5748case ACO_DESC_PLANE_0:5749case ACO_DESC_PLANE_1:5750type = s8;5751opcode = aco_opcode::s_load_dwordx8;5752offset += 32 * (desc_type - ACO_DESC_PLANE_0);5753break;5754case ACO_DESC_PLANE_2:5755type = s4;5756opcode = aco_opcode::s_load_dwordx4;5757offset += 64;5758break;5759default: unreachable("invalid desc_type\n");5760}57615762offset += constant_index * stride;57635764if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&5765(!index_set || binding->immutable_samplers_equal)) {5766if (binding->immutable_samplers_equal)5767constant_index = 0;57685769const uint32_t* samplers = radv_immutable_samplers(layout, binding);5770uint32_t dword0_mask = tex_instr->op == nir_texop_tg4 ? C_008F30_TRUNC_COORD : 0xffffffffu;5771return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),5772Operand::c32(samplers[constant_index * 4 + 0] & dword0_mask),5773Operand::c32(samplers[constant_index * 4 + 1]),5774Operand::c32(samplers[constant_index * 4 + 2]),5775Operand::c32(samplers[constant_index * 4 + 3]));5776}57775778Operand off;5779if (!index_set) {5780off = bld.copy(bld.def(s1), Operand::c32(offset));5781} else {5782off = Operand(5783(Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand::c32(offset),5784bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index)));5785}57865787Temp res = bld.smem(opcode, bld.def(type), list, off);57885789if (desc_type == ACO_DESC_PLANE_2) {5790Temp components[8];5791for (unsigned i = 0; i < 8; i++)5792components[i] = bld.tmp(s1);5793bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),5794Definition(components[2]), Definition(components[3]), res);57955796Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, write);5797bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),5798Definition(components[4]), Definition(components[5]), Definition(components[6]),5799Definition(components[7]), desc2);58005801res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],5802components[2], components[3], components[4], components[5], components[6],5803components[7]);5804} else if (desc_type == ACO_DESC_IMAGE && ctx->options->has_image_load_dcc_bug && !tex_instr &&5805!write) {5806Temp components[8];5807for (unsigned i = 0; i < 8; i++)5808components[i] = bld.tmp(s1);58095810bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),5811Definition(components[2]), Definition(components[3]), Definition(components[4]),5812Definition(components[5]), Definition(components[6]), Definition(components[7]),5813res);58145815/* WRITE_COMPRESS_ENABLE must be 0 for all image loads to workaround a5816* hardware bug.5817*/5818components[6] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[6],5819bld.copy(bld.def(s1), Operand::c32(C_00A018_WRITE_COMPRESS_ENABLE)));58205821res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],5822components[2], components[3], components[4], components[5], components[6],5823components[7]);5824} else if (desc_type == ACO_DESC_SAMPLER && tex_instr->op == nir_texop_tg4) {5825Temp components[4];5826for (unsigned i = 0; i < 4; i++)5827components[i] = bld.tmp(s1);58285829bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),5830Definition(components[2]), Definition(components[3]), res);58315832/* We want to always use the linear filtering truncation behaviour for5833* nir_texop_tg4, even if the sampler uses nearest/point filtering.5834*/5835components[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[0],5836Operand::c32(C_008F30_TRUNC_COORD));58375838res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), components[0], components[1],5839components[2], components[3]);5840}58415842return res;5843}58445845static int5846image_type_to_components_count(enum glsl_sampler_dim dim, bool array)5847{5848switch (dim) {5849case GLSL_SAMPLER_DIM_BUF: return 1;5850case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;5851case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;5852case GLSL_SAMPLER_DIM_MS: return array ? 4 : 3;5853case GLSL_SAMPLER_DIM_3D:5854case GLSL_SAMPLER_DIM_CUBE: return 3;5855case GLSL_SAMPLER_DIM_RECT:5856case GLSL_SAMPLER_DIM_SUBPASS: return 2;5857case GLSL_SAMPLER_DIM_SUBPASS_MS: return 3;5858default: break;5859}5860return 0;5861}58625863static MIMG_instruction*5864emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp,5865std::vector<Temp> coords, unsigned wqm_mask = 0, Operand vdata = Operand(v1))5866{5867/* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues. */5868unsigned max_nsa_size = bld.program->chip_class >= GFX10_3 ? 13 : 5;5869bool use_nsa = bld.program->chip_class >= GFX10 && coords.size() <= max_nsa_size;58705871if (!use_nsa) {5872Temp coord = coords[0];5873if (coords.size() > 1) {5874coord = bld.tmp(RegType::vgpr, coords.size());58755876aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(5877aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};5878for (unsigned i = 0; i < coords.size(); i++)5879vec->operands[i] = Operand(coords[i]);5880vec->definitions[0] = Definition(coord);5881bld.insert(std::move(vec));5882} else if (coord.type() == RegType::sgpr) {5883coord = bld.copy(bld.def(v1), coord);5884}58855886if (wqm_mask) {5887/* We don't need the bias, sample index, compare value or offset to be5888* computed in WQM but if the p_create_vector copies the coordinates, then it5889* needs to be in WQM. */5890coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);5891}58925893coords[0] = coord;5894coords.resize(1);5895} else {5896for (unsigned i = 0; i < coords.size(); i++) {5897if (wqm_mask & (1u << i))5898coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true);5899}59005901for (Temp& coord : coords) {5902if (coord.type() == RegType::sgpr)5903coord = bld.copy(bld.def(v1), coord);5904}5905}59065907aco_ptr<MIMG_instruction> mimg{5908create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), dst.isTemp())};5909if (dst.isTemp())5910mimg->definitions[0] = dst;5911mimg->operands[0] = Operand(rsrc);5912mimg->operands[1] = samp;5913mimg->operands[2] = vdata;5914for (unsigned i = 0; i < coords.size(); i++)5915mimg->operands[3 + i] = Operand(coords[i]);59165917MIMG_instruction* res = mimg.get();5918bld.insert(std::move(mimg));5919return res;5920}59215922void5923visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)5924{5925Builder bld(ctx->program, ctx->block);5926Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);5927Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);5928Temp node = get_ssa_temp(ctx, instr->src[1].ssa);5929Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);5930Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);5931Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);5932Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);59335934std::vector<Temp> args;5935args.push_back(emit_extract_vector(ctx, node, 0, v1));5936args.push_back(emit_extract_vector(ctx, node, 1, v1));5937args.push_back(as_vgpr(ctx, tmax));5938args.push_back(emit_extract_vector(ctx, origin, 0, v1));5939args.push_back(emit_extract_vector(ctx, origin, 1, v1));5940args.push_back(emit_extract_vector(ctx, origin, 2, v1));5941args.push_back(emit_extract_vector(ctx, dir, 0, v1));5942args.push_back(emit_extract_vector(ctx, dir, 1, v1));5943args.push_back(emit_extract_vector(ctx, dir, 2, v1));5944args.push_back(emit_extract_vector(ctx, inv_dir, 0, v1));5945args.push_back(emit_extract_vector(ctx, inv_dir, 1, v1));5946args.push_back(emit_extract_vector(ctx, inv_dir, 2, v1));59475948MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, Definition(dst),5949resource, Operand(s4), args);5950mimg->dim = ac_image_1d;5951mimg->dmask = 0xf;5952mimg->unrm = true;5953mimg->r128 = true;5954}59555956/* Adjust the sample index according to FMASK.5957*5958* For uncompressed MSAA surfaces, FMASK should return 0x76543210,5959* which is the identity mapping. Each nibble says which physical sample5960* should be fetched to get that sample.5961*5962* For example, 0x11111100 means there are only 2 samples stored and5963* the second sample covers 3/4 of the pixel. When reading samples 05964* and 1, return physical sample 0 (determined by the first two 0s5965* in FMASK), otherwise return physical sample 1.5966*5967* The sample index should be adjusted as follows:5968* sample_index = (fmask >> (sample_index * 4)) & 0xF;5969*/5970static Temp5971adjust_sample_index_using_fmask(isel_context* ctx, bool da, std::vector<Temp>& coords,5972Operand sample_index, Temp fmask_desc_ptr)5973{5974Builder bld(ctx->program, ctx->block);5975Temp fmask = bld.tmp(v1);5976unsigned dim = ctx->options->chip_class >= GFX105977? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)5978: 0;59795980MIMG_instruction* load = emit_mimg(bld, aco_opcode::image_load, Definition(fmask),5981fmask_desc_ptr, Operand(s4), coords);5982load->glc = false;5983load->dlc = false;5984load->dmask = 0x1;5985load->unrm = true;5986load->da = da;5987load->dim = dim;59885989Operand sample_index4;5990if (sample_index.isConstant()) {5991if (sample_index.constantValue() < 16) {5992sample_index4 = Operand::c32(sample_index.constantValue() << 2);5993} else {5994sample_index4 = Operand::zero();5995}5996} else if (sample_index.regClass() == s1) {5997sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index,5998Operand::c32(2u));5999} else {6000assert(sample_index.regClass() == v1);6001sample_index4 =6002bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), sample_index);6003}60046005Temp final_sample;6006if (sample_index4.isConstant() && sample_index4.constantValue() == 0)6007final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(15u), fmask);6008else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)6009final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand::c32(28u), fmask);6010else6011final_sample =6012bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand::c32(4u));60136014/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK6015* resource descriptor is 0 (invalid),6016*/6017Temp compare = bld.tmp(bld.lm);6018bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare), Operand::zero(),6019emit_extract_vector(ctx, fmask_desc_ptr, 1, s1))6020.def(0)6021.setHint(vcc);60226023Temp sample_index_v = bld.copy(bld.def(v1), sample_index);60246025/* Replace the MSAA sample index. */6026return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);6027}60286029static std::vector<Temp>6030get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr, const struct glsl_type* type)6031{60326033Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);6034enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);6035bool is_array = glsl_sampler_type_is_array(type);6036ASSERTED bool add_frag_pos =6037(dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);6038assert(!add_frag_pos && "Input attachments should be lowered.");6039bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);6040bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;6041int count = image_type_to_components_count(dim, is_array);6042std::vector<Temp> coords(count);6043Builder bld(ctx->program, ctx->block);60446045if (is_ms) {6046count--;6047Temp src2 = get_ssa_temp(ctx, instr->src[2].ssa);6048/* get sample index */6049if (instr->intrinsic == nir_intrinsic_image_deref_load ||6050instr->intrinsic == nir_intrinsic_image_deref_sparse_load) {6051nir_const_value* sample_cv = nir_src_as_const_value(instr->src[2]);6052Operand sample_index = sample_cv ? Operand::c32(sample_cv->u32)6053: Operand(emit_extract_vector(ctx, src2, 0, v1));6054std::vector<Temp> fmask_load_address;6055for (unsigned i = 0; i < (is_array ? 3 : 2); i++)6056fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1));60576058Temp fmask_desc_ptr =6059get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),6060ACO_DESC_FMASK, nullptr, false);6061coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address,6062sample_index, fmask_desc_ptr);6063} else {6064coords[count] = emit_extract_vector(ctx, src2, 0, v1);6065}6066}60676068if (gfx9_1d) {6069coords[0] = emit_extract_vector(ctx, src0, 0, v1);6070coords.resize(coords.size() + 1);6071coords[1] = bld.copy(bld.def(v1), Operand::zero());6072if (is_array)6073coords[2] = emit_extract_vector(ctx, src0, 1, v1);6074} else {6075for (int i = 0; i < count; i++)6076coords[i] = emit_extract_vector(ctx, src0, i, v1);6077}60786079if (instr->intrinsic == nir_intrinsic_image_deref_load ||6080instr->intrinsic == nir_intrinsic_image_deref_sparse_load ||6081instr->intrinsic == nir_intrinsic_image_deref_store) {6082int lod_index = instr->intrinsic == nir_intrinsic_image_deref_store ? 4 : 3;6083bool level_zero =6084nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;60856086if (!level_zero)6087coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));6088}60896090return coords;6091}60926093memory_sync_info6094get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)6095{6096/* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */6097if (semantics & semantic_atomicrmw)6098return memory_sync_info(storage, semantics);60996100unsigned access = nir_intrinsic_access(instr);61016102if (access & ACCESS_VOLATILE)6103semantics |= semantic_volatile;6104if (access & ACCESS_CAN_REORDER)6105semantics |= semantic_can_reorder | semantic_private;61066107return memory_sync_info(storage, semantics);6108}61096110Operand6111emit_tfe_init(Builder& bld, Temp dst)6112{6113Temp tmp = bld.tmp(dst.regClass());61146115aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(6116aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};6117for (unsigned i = 0; i < dst.size(); i++)6118vec->operands[i] = Operand::zero();6119vec->definitions[0] = Definition(tmp);6120/* Since this is fixed to an instruction's definition register, any CSE will6121* just create copies. Copying costs about the same as zero-initialization,6122* but these copies can break up clauses.6123*/6124vec->definitions[0].setNoCSE(true);6125bld.insert(std::move(vec));61266127return Operand(tmp);6128}61296130void6131visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)6132{6133Builder bld(ctx->program, ctx->block);6134const nir_variable* var =6135nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));6136const struct glsl_type* type = glsl_without_array(var->type);6137const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);6138bool is_array = glsl_sampler_type_is_array(type);6139bool is_sparse = instr->intrinsic == nir_intrinsic_image_deref_sparse_load;6140Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);61416142memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);6143unsigned access = var->data.access | nir_intrinsic_access(instr);61446145unsigned result_size = instr->dest.ssa.num_components - is_sparse;6146unsigned expand_mask =6147nir_ssa_def_components_read(&instr->dest.ssa) & u_bit_consecutive(0, result_size);6148expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */6149if (dim == GLSL_SAMPLER_DIM_BUF)6150expand_mask = (1u << util_last_bit(expand_mask)) - 1u;6151unsigned dmask = expand_mask;6152if (instr->dest.ssa.bit_size == 64) {6153expand_mask &= 0x9;6154/* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */6155dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);6156}6157if (is_sparse)6158expand_mask |= 1 << result_size;6159unsigned num_components = util_bitcount(dmask) + is_sparse;61606161Temp tmp;6162if (num_components == dst.size() && dst.type() == RegType::vgpr)6163tmp = dst;6164else6165tmp = ctx->program->allocateTmp(RegClass(RegType::vgpr, num_components));61666167Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),6168dim == GLSL_SAMPLER_DIM_BUF ? ACO_DESC_BUFFER : ACO_DESC_IMAGE,6169nullptr, false);61706171if (dim == GLSL_SAMPLER_DIM_BUF) {6172Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);61736174aco_opcode opcode;6175switch (util_bitcount(dmask)) {6176case 1: opcode = aco_opcode::buffer_load_format_x; break;6177case 2: opcode = aco_opcode::buffer_load_format_xy; break;6178case 3: opcode = aco_opcode::buffer_load_format_xyz; break;6179case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;6180default: unreachable(">4 channel buffer image load");6181}6182aco_ptr<MUBUF_instruction> load{6183create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)};6184load->operands[0] = Operand(resource);6185load->operands[1] = Operand(vindex);6186load->operands[2] = Operand::c32(0);6187load->definitions[0] = Definition(tmp);6188load->idxen = true;6189load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);6190load->dlc = load->glc && ctx->options->chip_class >= GFX10;6191load->sync = sync;6192load->tfe = is_sparse;6193if (load->tfe)6194load->operands[3] = emit_tfe_init(bld, tmp);6195ctx->block->instructions.emplace_back(std::move(load));6196} else {6197std::vector<Temp> coords = get_image_coords(ctx, instr, type);61986199bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;6200aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;62016202Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);6203MIMG_instruction* load =6204emit_mimg(bld, opcode, Definition(tmp), resource, Operand(s4), coords, 0, vdata);6205load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;6206load->dlc = load->glc && ctx->options->chip_class >= GFX10;6207load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);6208load->dmask = dmask;6209load->unrm = true;6210load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));6211load->sync = sync;6212load->tfe = is_sparse;6213}62146215if (is_sparse && instr->dest.ssa.bit_size == 64) {6216/* The result components are 64-bit but the sparse residency code is6217* 32-bit. So add a zero to the end so expand_vector() works correctly.6218*/6219tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,6220Operand::zero());6221}62226223expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, expand_mask);6224}62256226void6227visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)6228{6229const nir_variable* var =6230nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));6231const struct glsl_type* type = glsl_without_array(var->type);6232const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);6233bool is_array = glsl_sampler_type_is_array(type);6234Temp data = get_ssa_temp(ctx, instr->src[3].ssa);62356236/* only R64_UINT and R64_SINT supported */6237if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)6238data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));6239data = as_vgpr(ctx, data);62406241memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);6242unsigned access = var->data.access | nir_intrinsic_access(instr);6243bool glc = ctx->options->chip_class == GFX6 ||6244access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE)6245? 16246: 0;62476248if (dim == GLSL_SAMPLER_DIM_BUF) {6249Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),6250ACO_DESC_BUFFER, nullptr, true);6251Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);6252aco_opcode opcode;6253switch (data.size()) {6254case 1: opcode = aco_opcode::buffer_store_format_x; break;6255case 2: opcode = aco_opcode::buffer_store_format_xy; break;6256case 3: opcode = aco_opcode::buffer_store_format_xyz; break;6257case 4: opcode = aco_opcode::buffer_store_format_xyzw; break;6258default: unreachable(">4 channel buffer image store");6259}6260aco_ptr<MUBUF_instruction> store{6261create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};6262store->operands[0] = Operand(rsrc);6263store->operands[1] = Operand(vindex);6264store->operands[2] = Operand::c32(0);6265store->operands[3] = Operand(data);6266store->idxen = true;6267store->glc = glc;6268store->dlc = false;6269store->disable_wqm = true;6270store->sync = sync;6271ctx->program->needs_exact = true;6272ctx->block->instructions.emplace_back(std::move(store));6273return;6274}62756276assert(data.type() == RegType::vgpr);6277std::vector<Temp> coords = get_image_coords(ctx, instr, type);6278Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),6279ACO_DESC_IMAGE, nullptr, true);62806281bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;6282aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;62836284Builder bld(ctx->program, ctx->block);6285MIMG_instruction* store =6286emit_mimg(bld, opcode, Definition(), resource, Operand(s4), coords, 0, Operand(data));6287store->glc = glc;6288store->dlc = false;6289store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);6290store->dmask = (1 << data.size()) - 1;6291store->unrm = true;6292store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));6293store->disable_wqm = true;6294store->sync = sync;6295ctx->program->needs_exact = true;6296return;6297}62986299void6300visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)6301{6302bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);6303const nir_variable* var =6304nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));6305const struct glsl_type* type = glsl_without_array(var->type);6306const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);6307bool is_array = glsl_sampler_type_is_array(type);6308Builder bld(ctx->program, ctx->block);63096310Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));6311bool is_64bit = data.bytes() == 8;6312assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");63136314if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)6315data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),6316get_ssa_temp(ctx, instr->src[4].ssa), data);63176318aco_opcode buf_op, buf_op64, image_op;6319switch (instr->intrinsic) {6320case nir_intrinsic_image_deref_atomic_add:6321buf_op = aco_opcode::buffer_atomic_add;6322buf_op64 = aco_opcode::buffer_atomic_add_x2;6323image_op = aco_opcode::image_atomic_add;6324break;6325case nir_intrinsic_image_deref_atomic_umin:6326buf_op = aco_opcode::buffer_atomic_umin;6327buf_op64 = aco_opcode::buffer_atomic_umin_x2;6328image_op = aco_opcode::image_atomic_umin;6329break;6330case nir_intrinsic_image_deref_atomic_imin:6331buf_op = aco_opcode::buffer_atomic_smin;6332buf_op64 = aco_opcode::buffer_atomic_smin_x2;6333image_op = aco_opcode::image_atomic_smin;6334break;6335case nir_intrinsic_image_deref_atomic_umax:6336buf_op = aco_opcode::buffer_atomic_umax;6337buf_op64 = aco_opcode::buffer_atomic_umax_x2;6338image_op = aco_opcode::image_atomic_umax;6339break;6340case nir_intrinsic_image_deref_atomic_imax:6341buf_op = aco_opcode::buffer_atomic_smax;6342buf_op64 = aco_opcode::buffer_atomic_smax_x2;6343image_op = aco_opcode::image_atomic_smax;6344break;6345case nir_intrinsic_image_deref_atomic_and:6346buf_op = aco_opcode::buffer_atomic_and;6347buf_op64 = aco_opcode::buffer_atomic_and_x2;6348image_op = aco_opcode::image_atomic_and;6349break;6350case nir_intrinsic_image_deref_atomic_or:6351buf_op = aco_opcode::buffer_atomic_or;6352buf_op64 = aco_opcode::buffer_atomic_or_x2;6353image_op = aco_opcode::image_atomic_or;6354break;6355case nir_intrinsic_image_deref_atomic_xor:6356buf_op = aco_opcode::buffer_atomic_xor;6357buf_op64 = aco_opcode::buffer_atomic_xor_x2;6358image_op = aco_opcode::image_atomic_xor;6359break;6360case nir_intrinsic_image_deref_atomic_exchange:6361buf_op = aco_opcode::buffer_atomic_swap;6362buf_op64 = aco_opcode::buffer_atomic_swap_x2;6363image_op = aco_opcode::image_atomic_swap;6364break;6365case nir_intrinsic_image_deref_atomic_comp_swap:6366buf_op = aco_opcode::buffer_atomic_cmpswap;6367buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;6368image_op = aco_opcode::image_atomic_cmpswap;6369break;6370default:6371unreachable("visit_image_atomic should only be called with "6372"nir_intrinsic_image_deref_atomic_* instructions.");6373}63746375Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);6376memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);63776378if (dim == GLSL_SAMPLER_DIM_BUF) {6379Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);6380Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),6381ACO_DESC_BUFFER, nullptr, true);6382// assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet6383// implemented.");6384aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(6385is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};6386mubuf->operands[0] = Operand(resource);6387mubuf->operands[1] = Operand(vindex);6388mubuf->operands[2] = Operand::c32(0);6389mubuf->operands[3] = Operand(data);6390if (return_previous)6391mubuf->definitions[0] = Definition(dst);6392mubuf->offset = 0;6393mubuf->idxen = true;6394mubuf->glc = return_previous;6395mubuf->dlc = false; /* Not needed for atomics */6396mubuf->disable_wqm = true;6397mubuf->sync = sync;6398ctx->program->needs_exact = true;6399ctx->block->instructions.emplace_back(std::move(mubuf));6400return;6401}64026403std::vector<Temp> coords = get_image_coords(ctx, instr, type);6404Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),6405ACO_DESC_IMAGE, nullptr, true);6406Definition def = return_previous ? Definition(dst) : Definition();6407MIMG_instruction* mimg =6408emit_mimg(bld, image_op, def, resource, Operand(s4), coords, 0, Operand(data));6409mimg->glc = return_previous;6410mimg->dlc = false; /* Not needed for atomics */6411mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);6412mimg->dmask = (1 << data.size()) - 1;6413mimg->unrm = true;6414mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));6415mimg->disable_wqm = true;6416mimg->sync = sync;6417ctx->program->needs_exact = true;6418return;6419}64206421void6422get_buffer_size(isel_context* ctx, Temp desc, Temp dst)6423{6424if (ctx->options->chip_class == GFX8) {6425/* we only have to divide by 1, 2, 4, 8, 12 or 16 */6426Builder bld(ctx->program, ctx->block);64276428Temp size = emit_extract_vector(ctx, desc, 2, s1);64296430Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1),6431bld.copy(bld.def(v1), Operand::c32(0xaaaaaaabu)), size);6432size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),6433bld.as_uniform(size_div3), Operand::c32(1u));64346435Temp stride = emit_extract_vector(ctx, desc, 1, s1);6436stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride,6437Operand::c32((5u << 16) | 16u));64386439Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand::c32(12u));6440size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));64416442Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;6443bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), size,6444bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));6445if (dst.type() == RegType::vgpr)6446bld.copy(Definition(dst), shr_dst);64476448/* TODO: we can probably calculate this faster with v_skip when stride != 12 */6449} else {6450emit_extract_vector(ctx, desc, 2, dst);6451}6452}64536454void6455visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr)6456{6457const nir_variable* var =6458nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));6459const struct glsl_type* type = glsl_without_array(var->type);6460const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);6461bool is_array = glsl_sampler_type_is_array(type);6462Builder bld(ctx->program, ctx->block);64636464if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {6465Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),6466ACO_DESC_BUFFER, NULL, false);6467return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa));6468}64696470/* LOD */6471assert(nir_src_as_uint(instr->src[1]) == 0);6472std::vector<Temp> lod{bld.copy(bld.def(v1), Operand::zero())};64736474/* Resource */6475Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),6476ACO_DESC_IMAGE, NULL, false);64776478Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);64796480MIMG_instruction* mimg =6481emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(dst), resource, Operand(s4), lod);6482uint8_t& dmask = mimg->dmask;6483mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);6484mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;6485mimg->da = glsl_sampler_type_is_array(type);64866487if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE && glsl_sampler_type_is_array(type)) {64886489assert(instr->dest.ssa.num_components == 3);6490Temp tmp = ctx->program->allocateTmp(v3);6491mimg->definitions[0] = Definition(tmp);6492emit_split_vector(ctx, tmp, 3);64936494/* divide 3rd value by 6 by multiplying with magic number */6495Temp c = bld.copy(bld.def(s1), Operand::c32(0x2AAAAAAB));6496Temp by_6 =6497bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);64986499bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, tmp, 0, v1),6500emit_extract_vector(ctx, tmp, 1, v1), by_6);65016502} else if (ctx->options->chip_class == GFX9 &&6503glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&6504glsl_sampler_type_is_array(type)) {6505assert(instr->dest.ssa.num_components == 2);6506dmask = 0x5;6507}65086509emit_split_vector(ctx, dst, instr->dest.ssa.num_components);6510}65116512void6513get_image_samples(isel_context* ctx, Definition dst, Temp resource)6514{6515Builder bld(ctx->program, ctx->block);65166517Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);6518Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,6519Operand::c32(16u | 4u << 16));6520Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand::c32(1u),6521samples_log2);6522Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,6523Operand::c32(28u | 4u << 16 /* offset=28, width=4 */));65246525Operand default_sample = Operand::c32(1u);6526if (ctx->options->robust_buffer_access) {6527/* Extract the second dword of the descriptor, if it's6528* all zero, then it's a null descriptor.6529*/6530Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);6531Temp is_non_null_descriptor =6532bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand::zero());6533default_sample = Operand(is_non_null_descriptor);6534}65356536Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand::c32(14u));6537bld.sop2(aco_opcode::s_cselect_b32, dst, samples, default_sample, bld.scc(is_msaa));6538}65396540void6541visit_image_samples(isel_context* ctx, nir_intrinsic_instr* instr)6542{6543Builder bld(ctx->program, ctx->block);6544Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);6545Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),6546ACO_DESC_IMAGE, NULL, false);6547get_image_samples(ctx, Definition(dst), resource);6548}65496550void6551visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)6552{6553Builder bld(ctx->program, ctx->block);6554unsigned num_components = instr->num_components;65556556Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);6557Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));65586559unsigned access = nir_intrinsic_access(instr);6560bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);6561unsigned size = instr->dest.ssa.bit_size / 8;65626563bool allow_smem = access & ACCESS_CAN_REORDER;65646565load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),6566nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,6567get_memory_sync_info(instr, storage_buffer, 0));6568}65696570void6571visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)6572{6573Builder bld(ctx->program, ctx->block);6574Temp data = get_ssa_temp(ctx, instr->src[0].ssa);6575unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;6576unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);6577Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);65786579Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[1].ssa));65806581memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);6582bool glc =6583nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);65846585unsigned write_count = 0;6586Temp write_datas[32];6587unsigned offsets[32];6588split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,6589write_datas, offsets);65906591/* GFX6-7 are affected by a hw bug that prevents address clamping to work6592* correctly when the SGPR offset is used.6593*/6594if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)6595offset = as_vgpr(ctx, offset);65966597for (unsigned i = 0; i < write_count; i++) {6598aco_opcode op = get_buffer_store_op(write_datas[i].bytes());65996600aco_ptr<MUBUF_instruction> store{6601create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};6602store->operands[0] = Operand(rsrc);6603store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);6604store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);6605store->operands[3] = Operand(write_datas[i]);6606store->offset = offsets[i];6607store->offen = (offset.type() == RegType::vgpr);6608store->glc = glc;6609store->dlc = false;6610store->disable_wqm = true;6611store->sync = sync;6612ctx->program->needs_exact = true;6613ctx->block->instructions.emplace_back(std::move(store));6614}6615}66166617void6618visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)6619{6620Builder bld(ctx->program, ctx->block);6621bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);6622Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));66236624if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)6625data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),6626get_ssa_temp(ctx, instr->src[3].ssa), data);66276628Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);6629Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));66306631Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);66326633aco_opcode op32, op64;6634switch (instr->intrinsic) {6635case nir_intrinsic_ssbo_atomic_add:6636op32 = aco_opcode::buffer_atomic_add;6637op64 = aco_opcode::buffer_atomic_add_x2;6638break;6639case nir_intrinsic_ssbo_atomic_imin:6640op32 = aco_opcode::buffer_atomic_smin;6641op64 = aco_opcode::buffer_atomic_smin_x2;6642break;6643case nir_intrinsic_ssbo_atomic_umin:6644op32 = aco_opcode::buffer_atomic_umin;6645op64 = aco_opcode::buffer_atomic_umin_x2;6646break;6647case nir_intrinsic_ssbo_atomic_imax:6648op32 = aco_opcode::buffer_atomic_smax;6649op64 = aco_opcode::buffer_atomic_smax_x2;6650break;6651case nir_intrinsic_ssbo_atomic_umax:6652op32 = aco_opcode::buffer_atomic_umax;6653op64 = aco_opcode::buffer_atomic_umax_x2;6654break;6655case nir_intrinsic_ssbo_atomic_and:6656op32 = aco_opcode::buffer_atomic_and;6657op64 = aco_opcode::buffer_atomic_and_x2;6658break;6659case nir_intrinsic_ssbo_atomic_or:6660op32 = aco_opcode::buffer_atomic_or;6661op64 = aco_opcode::buffer_atomic_or_x2;6662break;6663case nir_intrinsic_ssbo_atomic_xor:6664op32 = aco_opcode::buffer_atomic_xor;6665op64 = aco_opcode::buffer_atomic_xor_x2;6666break;6667case nir_intrinsic_ssbo_atomic_exchange:6668op32 = aco_opcode::buffer_atomic_swap;6669op64 = aco_opcode::buffer_atomic_swap_x2;6670break;6671case nir_intrinsic_ssbo_atomic_comp_swap:6672op32 = aco_opcode::buffer_atomic_cmpswap;6673op64 = aco_opcode::buffer_atomic_cmpswap_x2;6674break;6675default:6676unreachable(6677"visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");6678}6679aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;6680aco_ptr<MUBUF_instruction> mubuf{6681create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};6682mubuf->operands[0] = Operand(rsrc);6683mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);6684mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);6685mubuf->operands[3] = Operand(data);6686if (return_previous)6687mubuf->definitions[0] = Definition(dst);6688mubuf->offset = 0;6689mubuf->offen = (offset.type() == RegType::vgpr);6690mubuf->glc = return_previous;6691mubuf->dlc = false; /* Not needed for atomics */6692mubuf->disable_wqm = true;6693mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);6694ctx->program->needs_exact = true;6695ctx->block->instructions.emplace_back(std::move(mubuf));6696}66976698void6699visit_get_ssbo_size(isel_context* ctx, nir_intrinsic_instr* instr)6700{67016702Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);6703Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);6704bool non_uniform = dst.type() == RegType::vgpr;67056706Builder bld(ctx->program, ctx->block);6707if (non_uniform) {6708Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));6709Temp binding = emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1));6710Temp index = bld.vadd32(bld.def(v1), set_ptr, binding);6711index = convert_pointer_to_64_bit(ctx, index, non_uniform);67126713LoadEmitInfo info = {Operand(index), dst, 1, 4};6714info.align_mul = 4;6715info.const_offset = 8;6716emit_load(ctx, bld, info, global_load_params);6717} else {6718emit_extract_vector(ctx, load_buffer_rsrc(ctx, rsrc), 2, dst);6719}6720}67216722void6723visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)6724{6725Builder bld(ctx->program, ctx->block);6726unsigned num_components = instr->num_components;6727unsigned component_size = instr->dest.ssa.bit_size / 8;67286729LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)),6730get_ssa_temp(ctx, &instr->dest.ssa), num_components, component_size};6731info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);6732info.align_mul = nir_intrinsic_align_mul(instr);6733info.align_offset = nir_intrinsic_align_offset(instr);6734info.sync = get_memory_sync_info(instr, storage_buffer, 0);6735/* VMEM stores don't update the SMEM cache and it's difficult to prove that6736* it's safe to use SMEM */6737bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;6738if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) ||6739!can_use_smem) {6740emit_load(ctx, bld, info, global_load_params);6741} else {6742info.offset = Operand(bld.as_uniform(info.offset));6743emit_load(ctx, bld, info, smem_load_params);6744}6745}67466747void6748visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)6749{6750Builder bld(ctx->program, ctx->block);6751unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;6752unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);67536754Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));6755Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);6756memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);6757bool glc =6758nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);67596760if (ctx->options->chip_class >= GFX7)6761addr = as_vgpr(ctx, addr);67626763unsigned write_count = 0;6764Temp write_datas[32];6765unsigned offsets[32];6766split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,6767write_datas, offsets);67686769for (unsigned i = 0; i < write_count; i++) {6770if (ctx->options->chip_class >= GFX7) {6771unsigned offset = offsets[i];6772Temp store_addr = addr;6773if (offset > 0 && ctx->options->chip_class < GFX9) {6774Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);6775Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);6776Temp carry = bld.tmp(bld.lm);6777bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);67786779bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0),6780bld.hint_vcc(Definition(carry)), Operand::c32(offset), addr0);6781bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),6782Operand::zero(), addr1, carry)6783.def(1)6784.setHint(vcc);67856786store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);67876788offset = 0;6789}67906791bool global = ctx->options->chip_class >= GFX9;6792aco_opcode op;6793switch (write_datas[i].bytes()) {6794case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;6795case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;6796case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;6797case 8:6798op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;6799break;6800case 12:6801op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;6802break;6803case 16:6804op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;6805break;6806default: unreachable("store_global not implemented for this size.");6807}68086809aco_ptr<FLAT_instruction> flat{6810create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};6811flat->operands[0] = Operand(store_addr);6812flat->operands[1] = Operand(s1);6813flat->operands[2] = Operand(write_datas[i]);6814flat->glc = glc;6815flat->dlc = false;6816flat->offset = offset;6817flat->disable_wqm = true;6818flat->sync = sync;6819ctx->program->needs_exact = true;6820ctx->block->instructions.emplace_back(std::move(flat));6821} else {6822assert(ctx->options->chip_class == GFX6);68236824aco_opcode op = get_buffer_store_op(write_datas[i].bytes());68256826Temp rsrc = get_gfx6_global_rsrc(bld, addr);68276828aco_ptr<MUBUF_instruction> mubuf{6829create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};6830mubuf->operands[0] = Operand(rsrc);6831mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);6832mubuf->operands[2] = Operand::zero();6833mubuf->operands[3] = Operand(write_datas[i]);6834mubuf->glc = glc;6835mubuf->dlc = false;6836mubuf->offset = offsets[i];6837mubuf->addr64 = addr.type() == RegType::vgpr;6838mubuf->disable_wqm = true;6839mubuf->sync = sync;6840ctx->program->needs_exact = true;6841ctx->block->instructions.emplace_back(std::move(mubuf));6842}6843}6844}68456846void6847visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)6848{6849Builder bld(ctx->program, ctx->block);6850bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);6851Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);6852Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));68536854if (ctx->options->chip_class >= GFX7)6855addr = as_vgpr(ctx, addr);68566857if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)6858data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),6859get_ssa_temp(ctx, instr->src[2].ssa), data);68606861Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);68626863aco_opcode op32, op64;68646865if (ctx->options->chip_class >= GFX7) {6866bool global = ctx->options->chip_class >= GFX9;6867switch (instr->intrinsic) {6868case nir_intrinsic_global_atomic_add:6869op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;6870op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;6871break;6872case nir_intrinsic_global_atomic_imin:6873op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;6874op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;6875break;6876case nir_intrinsic_global_atomic_umin:6877op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;6878op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;6879break;6880case nir_intrinsic_global_atomic_imax:6881op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;6882op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;6883break;6884case nir_intrinsic_global_atomic_umax:6885op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;6886op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;6887break;6888case nir_intrinsic_global_atomic_and:6889op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;6890op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;6891break;6892case nir_intrinsic_global_atomic_or:6893op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;6894op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;6895break;6896case nir_intrinsic_global_atomic_xor:6897op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;6898op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;6899break;6900case nir_intrinsic_global_atomic_exchange:6901op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;6902op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;6903break;6904case nir_intrinsic_global_atomic_comp_swap:6905op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;6906op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;6907break;6908default:6909unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "6910"instructions.");6911}69126913aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;6914aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(6915op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};6916flat->operands[0] = Operand(addr);6917flat->operands[1] = Operand(s1);6918flat->operands[2] = Operand(data);6919if (return_previous)6920flat->definitions[0] = Definition(dst);6921flat->glc = return_previous;6922flat->dlc = false; /* Not needed for atomics */6923flat->offset = 0;6924flat->disable_wqm = true;6925flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);6926ctx->program->needs_exact = true;6927ctx->block->instructions.emplace_back(std::move(flat));6928} else {6929assert(ctx->options->chip_class == GFX6);69306931switch (instr->intrinsic) {6932case nir_intrinsic_global_atomic_add:6933op32 = aco_opcode::buffer_atomic_add;6934op64 = aco_opcode::buffer_atomic_add_x2;6935break;6936case nir_intrinsic_global_atomic_imin:6937op32 = aco_opcode::buffer_atomic_smin;6938op64 = aco_opcode::buffer_atomic_smin_x2;6939break;6940case nir_intrinsic_global_atomic_umin:6941op32 = aco_opcode::buffer_atomic_umin;6942op64 = aco_opcode::buffer_atomic_umin_x2;6943break;6944case nir_intrinsic_global_atomic_imax:6945op32 = aco_opcode::buffer_atomic_smax;6946op64 = aco_opcode::buffer_atomic_smax_x2;6947break;6948case nir_intrinsic_global_atomic_umax:6949op32 = aco_opcode::buffer_atomic_umax;6950op64 = aco_opcode::buffer_atomic_umax_x2;6951break;6952case nir_intrinsic_global_atomic_and:6953op32 = aco_opcode::buffer_atomic_and;6954op64 = aco_opcode::buffer_atomic_and_x2;6955break;6956case nir_intrinsic_global_atomic_or:6957op32 = aco_opcode::buffer_atomic_or;6958op64 = aco_opcode::buffer_atomic_or_x2;6959break;6960case nir_intrinsic_global_atomic_xor:6961op32 = aco_opcode::buffer_atomic_xor;6962op64 = aco_opcode::buffer_atomic_xor_x2;6963break;6964case nir_intrinsic_global_atomic_exchange:6965op32 = aco_opcode::buffer_atomic_swap;6966op64 = aco_opcode::buffer_atomic_swap_x2;6967break;6968case nir_intrinsic_global_atomic_comp_swap:6969op32 = aco_opcode::buffer_atomic_cmpswap;6970op64 = aco_opcode::buffer_atomic_cmpswap_x2;6971break;6972default:6973unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "6974"instructions.");6975}69766977Temp rsrc = get_gfx6_global_rsrc(bld, addr);69786979aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;69806981aco_ptr<MUBUF_instruction> mubuf{6982create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};6983mubuf->operands[0] = Operand(rsrc);6984mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);6985mubuf->operands[2] = Operand::zero();6986mubuf->operands[3] = Operand(data);6987if (return_previous)6988mubuf->definitions[0] = Definition(dst);6989mubuf->glc = return_previous;6990mubuf->dlc = false;6991mubuf->offset = 0;6992mubuf->addr64 = addr.type() == RegType::vgpr;6993mubuf->disable_wqm = true;6994mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);6995ctx->program->needs_exact = true;6996ctx->block->instructions.emplace_back(std::move(mubuf));6997}6998}69997000void7001visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)7002{7003Builder bld(ctx->program, ctx->block);70047005Temp dst = get_ssa_temp(ctx, &intrin->dest.ssa);7006Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));7007Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));7008Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));70097010bool swizzled = nir_intrinsic_is_swizzled(intrin);7011bool reorder = nir_intrinsic_can_reorder(intrin);7012bool slc = nir_intrinsic_slc_amd(intrin);70137014unsigned const_offset = nir_intrinsic_base(intrin);7015unsigned elem_size_bytes = intrin->dest.ssa.bit_size / 8u;7016unsigned num_components = intrin->dest.ssa.num_components;7017unsigned swizzle_element_size = swizzled ? (ctx->program->chip_class <= GFX8 ? 4 : 16) : 0;70187019load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,7020num_components, swizzle_element_size, !swizzled, reorder, slc);7021}70227023void7024visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)7025{7026Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);7027Temp descriptor = get_ssa_temp(ctx, intrin->src[1].ssa);7028Temp v_offset = get_ssa_temp(ctx, intrin->src[2].ssa);7029Temp s_offset = get_ssa_temp(ctx, intrin->src[3].ssa);70307031bool swizzled = nir_intrinsic_is_swizzled(intrin);7032bool slc = nir_intrinsic_slc_amd(intrin);70337034unsigned const_offset = nir_intrinsic_base(intrin);7035unsigned write_mask = nir_intrinsic_write_mask(intrin);7036unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;70377038nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);7039memory_sync_info sync(mem_mode == nir_var_shader_out ? storage_vmem_output : storage_none);70407041store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,7042write_mask, !swizzled, sync, slc);7043}70447045sync_scope7046translate_nir_scope(nir_scope scope)7047{7048switch (scope) {7049case NIR_SCOPE_NONE:7050case NIR_SCOPE_INVOCATION: return scope_invocation;7051case NIR_SCOPE_SUBGROUP: return scope_subgroup;7052case NIR_SCOPE_WORKGROUP: return scope_workgroup;7053case NIR_SCOPE_QUEUE_FAMILY: return scope_queuefamily;7054case NIR_SCOPE_DEVICE: return scope_device;7055case NIR_SCOPE_SHADER_CALL: unreachable("unsupported scope");7056}7057unreachable("invalid scope");7058}70597060void7061emit_scoped_barrier(isel_context* ctx, nir_intrinsic_instr* instr)7062{7063Builder bld(ctx->program, ctx->block);70647065unsigned semantics = 0;7066unsigned storage = 0;7067sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));7068sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));70697070/* We use shared storage for the following:7071* - compute shaders expose it in their API7072* - when tessellation is used, TCS and VS I/O is lowered to shared memory7073* - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory7074* - additionally, when NGG is used on GFX10+, shared memory is used for certain features7075*/7076bool shared_storage_used = ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::LS ||7077ctx->stage.hw == HWStage::HS ||7078(ctx->stage.hw == HWStage::GS && ctx->program->chip_class >= GFX9) ||7079ctx->stage.hw == HWStage::NGG;70807081/* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.7082* They are allowed in CS, TCS, and in any NGG shader.7083*/7084ASSERTED bool workgroup_scope_allowed =7085ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::HS || ctx->stage.hw == HWStage::NGG;70867087unsigned nir_storage = nir_intrinsic_memory_modes(instr);7088if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global))7089storage |= storage_buffer | storage_image; // TODO: split this when NIR gets nir_var_mem_image7090if (shared_storage_used && (nir_storage & nir_var_mem_shared))7091storage |= storage_shared;70927093unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);7094if (nir_semantics & NIR_MEMORY_ACQUIRE)7095semantics |= semantic_acquire | semantic_release;7096if (nir_semantics & NIR_MEMORY_RELEASE)7097semantics |= semantic_acquire | semantic_release;70987099assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));7100assert(exec_scope != scope_workgroup || workgroup_scope_allowed);71017102bld.barrier(aco_opcode::p_barrier,7103memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),7104exec_scope);7105}71067107void7108visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)7109{7110// TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()7111Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);7112Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));7113Builder bld(ctx->program, ctx->block);71147115unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;7116unsigned num_components = instr->dest.ssa.num_components;7117unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;7118load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);7119}71207121void7122visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)7123{7124unsigned writemask = nir_intrinsic_write_mask(instr);7125Temp data = get_ssa_temp(ctx, instr->src[0].ssa);7126Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));7127unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;71287129unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;7130store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);7131}71327133void7134visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)7135{7136unsigned offset = nir_intrinsic_base(instr);7137Builder bld(ctx->program, ctx->block);7138Operand m = load_lds_size_m0(bld);7139Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));7140Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));71417142unsigned num_operands = 3;7143aco_opcode op32, op64, op32_rtn, op64_rtn;7144switch (instr->intrinsic) {7145case nir_intrinsic_shared_atomic_add:7146op32 = aco_opcode::ds_add_u32;7147op64 = aco_opcode::ds_add_u64;7148op32_rtn = aco_opcode::ds_add_rtn_u32;7149op64_rtn = aco_opcode::ds_add_rtn_u64;7150break;7151case nir_intrinsic_shared_atomic_imin:7152op32 = aco_opcode::ds_min_i32;7153op64 = aco_opcode::ds_min_i64;7154op32_rtn = aco_opcode::ds_min_rtn_i32;7155op64_rtn = aco_opcode::ds_min_rtn_i64;7156break;7157case nir_intrinsic_shared_atomic_umin:7158op32 = aco_opcode::ds_min_u32;7159op64 = aco_opcode::ds_min_u64;7160op32_rtn = aco_opcode::ds_min_rtn_u32;7161op64_rtn = aco_opcode::ds_min_rtn_u64;7162break;7163case nir_intrinsic_shared_atomic_imax:7164op32 = aco_opcode::ds_max_i32;7165op64 = aco_opcode::ds_max_i64;7166op32_rtn = aco_opcode::ds_max_rtn_i32;7167op64_rtn = aco_opcode::ds_max_rtn_i64;7168break;7169case nir_intrinsic_shared_atomic_umax:7170op32 = aco_opcode::ds_max_u32;7171op64 = aco_opcode::ds_max_u64;7172op32_rtn = aco_opcode::ds_max_rtn_u32;7173op64_rtn = aco_opcode::ds_max_rtn_u64;7174break;7175case nir_intrinsic_shared_atomic_and:7176op32 = aco_opcode::ds_and_b32;7177op64 = aco_opcode::ds_and_b64;7178op32_rtn = aco_opcode::ds_and_rtn_b32;7179op64_rtn = aco_opcode::ds_and_rtn_b64;7180break;7181case nir_intrinsic_shared_atomic_or:7182op32 = aco_opcode::ds_or_b32;7183op64 = aco_opcode::ds_or_b64;7184op32_rtn = aco_opcode::ds_or_rtn_b32;7185op64_rtn = aco_opcode::ds_or_rtn_b64;7186break;7187case nir_intrinsic_shared_atomic_xor:7188op32 = aco_opcode::ds_xor_b32;7189op64 = aco_opcode::ds_xor_b64;7190op32_rtn = aco_opcode::ds_xor_rtn_b32;7191op64_rtn = aco_opcode::ds_xor_rtn_b64;7192break;7193case nir_intrinsic_shared_atomic_exchange:7194op32 = aco_opcode::ds_write_b32;7195op64 = aco_opcode::ds_write_b64;7196op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;7197op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;7198break;7199case nir_intrinsic_shared_atomic_comp_swap:7200op32 = aco_opcode::ds_cmpst_b32;7201op64 = aco_opcode::ds_cmpst_b64;7202op32_rtn = aco_opcode::ds_cmpst_rtn_b32;7203op64_rtn = aco_opcode::ds_cmpst_rtn_b64;7204num_operands = 4;7205break;7206case nir_intrinsic_shared_atomic_fadd:7207op32 = aco_opcode::ds_add_f32;7208op32_rtn = aco_opcode::ds_add_rtn_f32;7209op64 = aco_opcode::num_opcodes;7210op64_rtn = aco_opcode::num_opcodes;7211break;7212default: unreachable("Unhandled shared atomic intrinsic");7213}72147215bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);72167217aco_opcode op;7218if (data.size() == 1) {7219assert(instr->dest.ssa.bit_size == 32);7220op = return_previous ? op32_rtn : op32;7221} else {7222assert(instr->dest.ssa.bit_size == 64);7223op = return_previous ? op64_rtn : op64;7224}72257226if (offset > 65535) {7227address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);7228offset = 0;7229}72307231aco_ptr<DS_instruction> ds;7232ds.reset(7233create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));7234ds->operands[0] = Operand(address);7235ds->operands[1] = Operand(data);7236if (num_operands == 4) {7237Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));7238ds->operands[2] = Operand(data2);7239}7240ds->operands[num_operands - 1] = m;7241ds->offset0 = offset;7242if (return_previous)7243ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));7244ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);7245ctx->block->instructions.emplace_back(std::move(ds));7246}72477248Temp7249get_scratch_resource(isel_context* ctx)7250{7251Builder bld(ctx->program, ctx->block);7252Temp scratch_addr = ctx->program->private_segment_buffer;7253if (ctx->stage != compute_cs)7254scratch_addr =7255bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());72567257uint32_t rsrc_conf =7258S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);72597260if (ctx->program->chip_class >= GFX10) {7261rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |7262S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);7263} else if (ctx->program->chip_class <=7264GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */7265rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |7266S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);7267}72687269/* older generations need element size = 4 bytes. element size removed in GFX9 */7270if (ctx->program->chip_class <= GFX8)7271rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);72727273return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u),7274Operand::c32(rsrc_conf));7275}72767277void7278visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)7279{7280Builder bld(ctx->program, ctx->block);7281Temp rsrc = get_scratch_resource(ctx);7282Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));7283Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);72847285LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,7286instr->dest.ssa.bit_size / 8u, rsrc};7287info.align_mul = nir_intrinsic_align_mul(instr);7288info.align_offset = nir_intrinsic_align_offset(instr);7289info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;7290info.sync = memory_sync_info(storage_scratch, semantic_private);7291info.soffset = ctx->program->scratch_offset;7292emit_load(ctx, bld, info, scratch_load_params);7293}72947295void7296visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)7297{7298Builder bld(ctx->program, ctx->block);7299Temp rsrc = get_scratch_resource(ctx);7300Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));7301Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));73027303unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;7304unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);73057306unsigned write_count = 0;7307Temp write_datas[32];7308unsigned offsets[32];7309unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;7310split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,7311&write_count, write_datas, offsets);73127313for (unsigned i = 0; i < write_count; i++) {7314aco_opcode op = get_buffer_store_op(write_datas[i].bytes());7315Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i],7316offsets[i], true, true);7317mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);7318}7319}73207321void7322visit_load_sample_mask_in(isel_context* ctx, nir_intrinsic_instr* instr)7323{7324uint8_t log2_ps_iter_samples;7325if (ctx->program->info->ps.uses_sample_shading) {7326log2_ps_iter_samples = util_logbase2(ctx->options->key.fs.num_samples);7327} else {7328log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;7329}73307331Builder bld(ctx->program, ctx->block);73327333Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);73347335if (log2_ps_iter_samples) {7336/* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). */7337Temp sample_id =7338bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),7339Operand::c32(8u), Operand::c32(4u));7340Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id,7341bld.copy(bld.def(v1), Operand::c32(1u)));7342bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask,7343get_arg(ctx, ctx->args->ac.sample_coverage));7344} else {7345bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.sample_coverage));7346}7347}73487349void7350visit_emit_vertex_with_counter(isel_context* ctx, nir_intrinsic_instr* instr)7351{7352Builder bld(ctx->program, ctx->block);73537354unsigned stream = nir_intrinsic_stream_id(instr);7355Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));7356next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);7357nir_const_value* next_vertex_cv = nir_src_as_const_value(instr->src[0]);73587359/* get GSVS ring */7360Temp gsvs_ring =7361bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer,7362Operand::c32(RING_GSVS_GS * 16u));73637364unsigned num_components = ctx->program->info->gs.num_stream_output_components[stream];73657366unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;7367unsigned stream_offset = 0;7368for (unsigned i = 0; i < stream; i++) {7369unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] *7370ctx->shader->info.gs.vertices_out;7371stream_offset += prev_stride * ctx->program->wave_size;7372}73737374/* Limit on the stride field for <= GFX7. */7375assert(stride < (1 << 14));73767377Temp gsvs_dwords[4];7378for (unsigned i = 0; i < 4; i++)7379gsvs_dwords[i] = bld.tmp(s1);7380bld.pseudo(aco_opcode::p_split_vector, Definition(gsvs_dwords[0]), Definition(gsvs_dwords[1]),7381Definition(gsvs_dwords[2]), Definition(gsvs_dwords[3]), gsvs_ring);73827383if (stream_offset) {7384Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand::c32(stream_offset));73857386Temp carry = bld.tmp(s1);7387gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)),7388gsvs_dwords[0], stream_offset_tmp);7389gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc),7390gsvs_dwords[1], Operand::zero(), bld.scc(carry));7391}73927393gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1],7394Operand::c32(S_008F04_STRIDE(stride)));7395gsvs_dwords[2] = bld.copy(bld.def(s1), Operand::c32(ctx->program->wave_size));73967397gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), gsvs_dwords[0], gsvs_dwords[1],7398gsvs_dwords[2], gsvs_dwords[3]);73997400unsigned offset = 0;7401for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {7402if (ctx->program->info->gs.output_streams[i] != stream)7403continue;74047405for (unsigned j = 0; j < 4; j++) {7406if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))7407continue;74087409if (ctx->outputs.mask[i] & (1 << j)) {7410Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);7411unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;7412if (const_offset >= 4096u) {7413if (vaddr_offset.isUndefined())7414vaddr_offset = bld.copy(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u));7415else7416vaddr_offset = bld.vadd32(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u),7417vaddr_offset);7418const_offset %= 4096u;7419}74207421aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(7422aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};7423mtbuf->operands[0] = Operand(gsvs_ring);7424mtbuf->operands[1] = vaddr_offset;7425mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->ac.gs2vs_offset));7426mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);7427mtbuf->offen = !vaddr_offset.isUndefined();7428mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;7429mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;7430mtbuf->offset = const_offset;7431mtbuf->glc = true;7432mtbuf->slc = true;7433mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);7434bld.insert(std::move(mtbuf));7435}74367437offset += ctx->shader->info.gs.vertices_out;7438}74397440/* outputs for the next vertex are undefined and keeping them around can7441* create invalid IR with control flow */7442ctx->outputs.mask[i] = 0;7443}74447445bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));7446}74477448Temp7449emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src)7450{7451Builder bld(ctx->program, ctx->block);74527453if (cluster_size == 1) {7454return src;7455}7456if (op == nir_op_iand && cluster_size == 4) {7457/* subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) */7458Temp tmp =7459bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);7460return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),7461bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));7462} else if (op == nir_op_ior && cluster_size == 4) {7463/* subgroupClusteredOr(val, 4) -> wqm(val & exec) */7464return bld.sop1(7465Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),7466bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));7467} else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {7468/* subgroupAnd(val) -> (exec & ~val) == 0 */7469Temp tmp =7470bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)7471.def(1)7472.getTemp();7473Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));7474return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);7475} else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {7476/* subgroupOr(val) -> (val & exec) != 0 */7477Temp tmp =7478bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))7479.def(1)7480.getTemp();7481return bool_to_vector_condition(ctx, tmp);7482} else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {7483/* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */7484Temp tmp =7485bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));7486tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);7487tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u))7488.def(1)7489.getTemp();7490return bool_to_vector_condition(ctx, tmp);7491} else {7492/* subgroupClustered{And,Or,Xor}(val, n):7493* lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)7494* cluster_offset = ~(n - 1) & lane_id cluster_mask = ((1 << n) - 1)7495* subgroupClusteredAnd():7496* return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask7497* subgroupClusteredOr():7498* return ((val & exec) >> cluster_offset) & cluster_mask != 07499* subgroupClusteredXor():7500* return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 07501*/7502Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1));7503Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1),7504Operand::c32(~uint32_t(cluster_size - 1)), lane_id);75057506Temp tmp;7507if (op == nir_op_iand)7508tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src,7509Operand(exec, bld.lm));7510else7511tmp =7512bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));75137514uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;75157516if (ctx->program->chip_class <= GFX7)7517tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);7518else if (ctx->program->wave_size == 64)7519tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);7520else7521tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);7522tmp = emit_extract_vector(ctx, tmp, 0, v1);7523if (cluster_mask != 0xffffffff)7524tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);75257526if (op == nir_op_iand) {7527return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::c32(cluster_mask),7528tmp);7529} else if (op == nir_op_ior) {7530return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);7531} else if (op == nir_op_ixor) {7532tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),7533bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));7534return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);7535}7536assert(false);7537return Temp();7538}7539}75407541Temp7542emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src)7543{7544Builder bld(ctx->program, ctx->block);7545assert(src.regClass() == bld.lm);75467547/* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 07548* subgroupExclusiveOr(val) -> mbcnt(val & exec) != 07549* subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 07550*/7551Temp tmp;7552if (op == nir_op_iand)7553tmp =7554bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);7555else7556tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));75577558Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));75597560if (op == nir_op_iand)7561return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);7562else if (op == nir_op_ior)7563return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);7564else if (op == nir_op_ixor)7565return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(),7566bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));75677568assert(false);7569return Temp();7570}75717572Temp7573emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src)7574{7575Builder bld(ctx->program, ctx->block);75767577/* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val7578* subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val7579* subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val7580*/7581Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);7582if (op == nir_op_iand)7583return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);7584else if (op == nir_op_ior)7585return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);7586else if (op == nir_op_ixor)7587return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);75887589assert(false);7590return Temp();7591}75927593ReduceOp7594get_reduce_op(nir_op op, unsigned bit_size)7595{7596switch (op) {7597#define CASEI(name) \7598case nir_op_##name: \7599return (bit_size == 32) ? name##32 \7600: (bit_size == 16) ? name##16 \7601: (bit_size == 8) ? name##8 \7602: name##64;7603#define CASEF(name) \7604case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;7605CASEI(iadd)7606CASEI(imul)7607CASEI(imin)7608CASEI(umin)7609CASEI(imax)7610CASEI(umax)7611CASEI(iand)7612CASEI(ior)7613CASEI(ixor)7614CASEF(fadd)7615CASEF(fmul)7616CASEF(fmin)7617CASEF(fmax)7618default: unreachable("unknown reduction op");7619#undef CASEI7620#undef CASEF7621}7622}76237624void7625emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)7626{7627Builder bld(ctx->program, ctx->block);7628Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));7629assert(dst.regClass().type() != RegType::vgpr);7630if (src.regClass().type() == RegType::vgpr)7631bld.pseudo(aco_opcode::p_as_uniform, dst, src);7632else7633bld.copy(dst, src);7634}76357636void7637emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)7638{7639Builder bld(ctx->program, ctx->block);7640Temp src_tmp = get_ssa_temp(ctx, src.ssa);76417642if (op == nir_op_fadd) {7643src_tmp = as_vgpr(ctx, src_tmp);7644Temp tmp = dst.regClass() == s1 ? bld.tmp(src_tmp.regClass()) : dst.getTemp();76457646if (src.ssa->bit_size == 16) {7647count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);7648bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);7649} else {7650assert(src.ssa->bit_size == 32);7651count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);7652bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);7653}76547655if (tmp != dst.getTemp())7656bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);76577658return;7659}76607661if (dst.regClass() == s1)7662src_tmp = bld.as_uniform(src_tmp);76637664if (op == nir_op_ixor && count.type() == RegType::sgpr)7665count =7666bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));7667else if (op == nir_op_ixor)7668count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);76697670assert(dst.getTemp().type() == count.type());76717672if (nir_src_is_const(src)) {7673if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)7674bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());7675else if (nir_src_as_uint(src) == 1)7676bld.copy(dst, count);7677else if (nir_src_as_uint(src) == 0 && dst.bytes() <= 2)7678bld.vop1(aco_opcode::v_mov_b32, dst, Operand::zero()); /* RA will use SDWA if possible */7679else if (nir_src_as_uint(src) == 0)7680bld.copy(dst, Operand::zero());7681else if (count.type() == RegType::vgpr)7682bld.v_mul_imm(dst, count, nir_src_as_uint(src));7683else7684bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);7685} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {7686bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);7687} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {7688bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);7689} else if (dst.getTemp().type() == RegType::vgpr) {7690bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);7691} else {7692bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);7693}7694}76957696bool7697emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)7698{7699nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);7700if (op == nir_op_imul || op == nir_op_fmul)7701return false;77027703if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {7704Builder bld(ctx->program, ctx->block);7705Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));7706unsigned bit_size = instr->src[0].ssa->bit_size;7707if (bit_size > 32)7708return false;77097710Temp thread_count =7711bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));77127713emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);7714} else {7715emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));7716}77177718return true;7719}77207721bool7722emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)7723{7724Builder bld(ctx->program, ctx->block);7725Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));7726nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);7727bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;77287729if (op == nir_op_imul || op == nir_op_fmul)7730return false;77317732if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {7733if (instr->src[0].ssa->bit_size > 32)7734return false;77357736Temp packed_tid;7737if (inc)7738packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));7739else7740packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));77417742emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);7743return true;7744}77457746assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||7747op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);77487749if (inc) {7750emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));7751return true;7752}77537754/* Copy the source and write the reduction operation identity to the first lane. */7755Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));7756Temp src = get_ssa_temp(ctx, instr->src[0].ssa);7757ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);7758if (dst.bytes() == 8) {7759Temp lo = bld.tmp(v1), hi = bld.tmp(v1);7760bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);7761uint32_t identity_lo = get_reduction_identity(reduce_op, 0);7762uint32_t identity_hi = get_reduction_identity(reduce_op, 1);77637764lo =7765bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_lo)), lane, lo);7766hi =7767bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_hi)), lane, hi);7768bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);7769} else {7770uint32_t identity = get_reduction_identity(reduce_op, 0);7771bld.writelane(dst, bld.copy(bld.hint_m0(s1), Operand::c32(identity)), lane,7772as_vgpr(ctx, src));7773}77747775return true;7776}77777778Temp7779emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,7780Definition dst, Temp src)7781{7782assert(src.bytes() <= 8);7783assert(src.type() == RegType::vgpr);77847785Builder bld(ctx->program, ctx->block);77867787unsigned num_defs = 0;7788Definition defs[5];7789defs[num_defs++] = dst;7790defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */77917792/* scalar identity temporary */7793bool need_sitmp = (ctx->program->chip_class <= GFX7 || ctx->program->chip_class >= GFX10) &&7794aco_op != aco_opcode::p_reduce;7795if (aco_op == aco_opcode::p_exclusive_scan) {7796need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||7797op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||7798op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||7799op == fmul64);7800}7801if (need_sitmp)7802defs[num_defs++] = bld.def(RegType::sgpr, dst.size());78037804/* scc clobber */7805defs[num_defs++] = bld.def(s1, scc);78067807/* vcc clobber */7808bool clobber_vcc = false;7809if ((op == iadd32 || op == imul64) && ctx->program->chip_class < GFX9)7810clobber_vcc = true;7811if ((op == iadd8 || op == iadd16) && ctx->program->chip_class < GFX8)7812clobber_vcc = true;7813if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)7814clobber_vcc = true;78157816if (clobber_vcc)7817defs[num_defs++] = bld.def(bld.lm, vcc);78187819Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(7820aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);7821reduce->operands[0] = Operand(src);7822/* setup_reduce_temp will update these undef operands if needed */7823reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());7824reduce->operands[2] = Operand(v1.as_linear());7825std::copy(defs, defs + num_defs, reduce->definitions.begin());78267827reduce->reduce_op = op;7828reduce->cluster_size = cluster_size;7829bld.insert(std::move(reduce));78307831return dst.getTemp();7832}78337834void7835emit_interp_center(isel_context* ctx, Temp dst, Temp pos1, Temp pos2)7836{7837Builder bld(ctx->program, ctx->block);7838Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center);7839Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1);7840Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1);78417842Temp ddx_1, ddx_2, ddy_1, ddy_2;7843uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);7844uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);7845uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);78467847/* Build DD X/Y */7848if (ctx->program->chip_class >= GFX8) {7849Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);7850ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);7851ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);7852Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);7853ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);7854ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);7855} else {7856Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);7857ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);7858ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);7859ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);7860ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1);7861Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);7862ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);7863ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2);7864ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);7865ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);7866}78677868/* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */7869aco_opcode mad =7870ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;7871Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);7872Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);7873tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);7874tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);7875Temp wqm1 = bld.tmp(v1);7876emit_wqm(bld, tmp1, wqm1, true);7877Temp wqm2 = bld.tmp(v1);7878emit_wqm(bld, tmp2, wqm2, true);7879bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);7880return;7881}78827883Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);7884void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt);7885static void create_vs_exports(isel_context* ctx);78867887void7888visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)7889{7890Builder bld(ctx->program, ctx->block);7891switch (instr->intrinsic) {7892case nir_intrinsic_load_barycentric_sample:7893case nir_intrinsic_load_barycentric_pixel:7894case nir_intrinsic_load_barycentric_centroid: {7895glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);7896Temp bary = Temp(0, s2);7897switch (mode) {7898case INTERP_MODE_SMOOTH:7899case INTERP_MODE_NONE:7900if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)7901bary = get_arg(ctx, ctx->args->ac.persp_center);7902else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)7903bary = ctx->persp_centroid;7904else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)7905bary = get_arg(ctx, ctx->args->ac.persp_sample);7906break;7907case INTERP_MODE_NOPERSPECTIVE:7908if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)7909bary = get_arg(ctx, ctx->args->ac.linear_center);7910else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)7911bary = ctx->linear_centroid;7912else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)7913bary = get_arg(ctx, ctx->args->ac.linear_sample);7914break;7915default: break;7916}7917Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);7918Temp p1 = emit_extract_vector(ctx, bary, 0, v1);7919Temp p2 = emit_extract_vector(ctx, bary, 1, v1);7920bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2));7921emit_split_vector(ctx, dst, 2);7922break;7923}7924case nir_intrinsic_load_barycentric_model: {7925Temp model = get_arg(ctx, ctx->args->ac.pull_model);79267927Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);7928Temp p1 = emit_extract_vector(ctx, model, 0, v1);7929Temp p2 = emit_extract_vector(ctx, model, 1, v1);7930Temp p3 = emit_extract_vector(ctx, model, 2, v1);7931bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2),7932Operand(p3));7933emit_split_vector(ctx, dst, 3);7934break;7935}7936case nir_intrinsic_load_barycentric_at_sample: {7937uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;7938switch (ctx->options->key.fs.num_samples) {7939case 2: sample_pos_offset += 1 << 3; break;7940case 4: sample_pos_offset += 3 << 3; break;7941case 8: sample_pos_offset += 7 << 3; break;7942default: break;7943}7944Temp sample_pos;7945Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);7946nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);7947Temp private_segment_buffer = ctx->program->private_segment_buffer;7948// TODO: bounds checking?7949if (addr.type() == RegType::sgpr) {7950Operand offset;7951if (const_addr) {7952sample_pos_offset += const_addr->u32 << 3;7953offset = Operand::c32(sample_pos_offset);7954} else if (ctx->options->chip_class >= GFX9) {7955offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr,7956Operand::c32(sample_pos_offset));7957} else {7958offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr,7959Operand::c32(3u));7960offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,7961Operand::c32(sample_pos_offset));7962}79637964Operand off = bld.copy(bld.def(s1), Operand(offset));7965sample_pos =7966bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);79677968} else if (ctx->options->chip_class >= GFX9) {7969addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);7970sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr,7971private_segment_buffer, sample_pos_offset);7972} else if (ctx->options->chip_class >= GFX7) {7973/* addr += private_segment_buffer + sample_pos_offset */7974Temp tmp0 = bld.tmp(s1);7975Temp tmp1 = bld.tmp(s1);7976bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1),7977private_segment_buffer);7978Definition scc_tmp = bld.def(s1, scc);7979tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0,7980Operand::c32(sample_pos_offset));7981tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1,7982Operand::zero(), bld.scc(scc_tmp.getTemp()));7983addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);7984Temp pck0 = bld.tmp(v1);7985Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();7986tmp1 = as_vgpr(ctx, tmp1);7987Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1),7988bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand::zero(), carry);7989addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);79907991/* sample_pos = flat_load_dwordx2 addr */7992sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));7993} else {7994assert(ctx->options->chip_class == GFX6);79957996uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |7997S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);7998Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,7999Operand::zero(), Operand::c32(rsrc_conf));80008001addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);8002addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand::zero());80038004sample_pos = bld.tmp(v2);80058006aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(8007aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};8008load->definitions[0] = Definition(sample_pos);8009load->operands[0] = Operand(rsrc);8010load->operands[1] = Operand(addr);8011load->operands[2] = Operand::zero();8012load->offset = sample_pos_offset;8013load->offen = 0;8014load->addr64 = true;8015load->glc = false;8016load->dlc = false;8017load->disable_wqm = false;8018ctx->block->instructions.emplace_back(std::move(load));8019}80208021/* sample_pos -= 0.5 */8022Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));8023Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));8024bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);8025pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand::c32(0x3f000000u));8026pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand::c32(0x3f000000u));80278028emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);8029break;8030}8031case nir_intrinsic_load_barycentric_at_offset: {8032Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);8033RegClass rc = RegClass(offset.type(), 1);8034Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);8035bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);8036emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);8037break;8038}8039case nir_intrinsic_load_front_face: {8040bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8041Operand::zero(), get_arg(ctx, ctx->args->ac.front_face))8042.def(0)8043.setHint(vcc);8044break;8045}8046case nir_intrinsic_load_view_index: {8047if (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::GS) ||8048ctx->stage.has(SWStage::TCS) || ctx->stage.has(SWStage::TES)) {8049Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8050bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));8051break;8052}8053FALLTHROUGH;8054}8055case nir_intrinsic_load_layer_id: {8056unsigned idx = nir_intrinsic_base(instr);8057bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8058Operand::c32(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);8059break;8060}8061case nir_intrinsic_load_frag_coord: {8062emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);8063break;8064}8065case nir_intrinsic_load_frag_shading_rate:8066emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->dest.ssa));8067break;8068case nir_intrinsic_load_sample_pos: {8069Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);8070Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);8071bld.pseudo(8072aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8073posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),8074posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());8075break;8076}8077case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;8078case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;8079case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;8080case nir_intrinsic_load_input:8081case nir_intrinsic_load_input_vertex: visit_load_input(ctx, instr); break;8082case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;8083case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;8084case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break;8085case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;8086case nir_intrinsic_vulkan_resource_index: visit_load_resource(ctx, instr); break;8087case nir_intrinsic_terminate:8088case nir_intrinsic_discard: visit_discard(ctx, instr); break;8089case nir_intrinsic_terminate_if:8090case nir_intrinsic_discard_if: visit_discard_if(ctx, instr); break;8091case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;8092case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;8093case nir_intrinsic_shared_atomic_add:8094case nir_intrinsic_shared_atomic_imin:8095case nir_intrinsic_shared_atomic_umin:8096case nir_intrinsic_shared_atomic_imax:8097case nir_intrinsic_shared_atomic_umax:8098case nir_intrinsic_shared_atomic_and:8099case nir_intrinsic_shared_atomic_or:8100case nir_intrinsic_shared_atomic_xor:8101case nir_intrinsic_shared_atomic_exchange:8102case nir_intrinsic_shared_atomic_comp_swap:8103case nir_intrinsic_shared_atomic_fadd: visit_shared_atomic(ctx, instr); break;8104case nir_intrinsic_image_deref_load:8105case nir_intrinsic_image_deref_sparse_load: visit_image_load(ctx, instr); break;8106case nir_intrinsic_image_deref_store: visit_image_store(ctx, instr); break;8107case nir_intrinsic_image_deref_atomic_add:8108case nir_intrinsic_image_deref_atomic_umin:8109case nir_intrinsic_image_deref_atomic_imin:8110case nir_intrinsic_image_deref_atomic_umax:8111case nir_intrinsic_image_deref_atomic_imax:8112case nir_intrinsic_image_deref_atomic_and:8113case nir_intrinsic_image_deref_atomic_or:8114case nir_intrinsic_image_deref_atomic_xor:8115case nir_intrinsic_image_deref_atomic_exchange:8116case nir_intrinsic_image_deref_atomic_comp_swap: visit_image_atomic(ctx, instr); break;8117case nir_intrinsic_image_deref_size: visit_image_size(ctx, instr); break;8118case nir_intrinsic_image_deref_samples: visit_image_samples(ctx, instr); break;8119case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;8120case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;8121case nir_intrinsic_load_global: visit_load_global(ctx, instr); break;8122case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;8123case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;8124case nir_intrinsic_store_global: visit_store_global(ctx, instr); break;8125case nir_intrinsic_global_atomic_add:8126case nir_intrinsic_global_atomic_imin:8127case nir_intrinsic_global_atomic_umin:8128case nir_intrinsic_global_atomic_imax:8129case nir_intrinsic_global_atomic_umax:8130case nir_intrinsic_global_atomic_and:8131case nir_intrinsic_global_atomic_or:8132case nir_intrinsic_global_atomic_xor:8133case nir_intrinsic_global_atomic_exchange:8134case nir_intrinsic_global_atomic_comp_swap: visit_global_atomic(ctx, instr); break;8135case nir_intrinsic_ssbo_atomic_add:8136case nir_intrinsic_ssbo_atomic_imin:8137case nir_intrinsic_ssbo_atomic_umin:8138case nir_intrinsic_ssbo_atomic_imax:8139case nir_intrinsic_ssbo_atomic_umax:8140case nir_intrinsic_ssbo_atomic_and:8141case nir_intrinsic_ssbo_atomic_or:8142case nir_intrinsic_ssbo_atomic_xor:8143case nir_intrinsic_ssbo_atomic_exchange:8144case nir_intrinsic_ssbo_atomic_comp_swap: visit_atomic_ssbo(ctx, instr); break;8145case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;8146case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;8147case nir_intrinsic_get_ssbo_size: visit_get_ssbo_size(ctx, instr); break;8148case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break;8149case nir_intrinsic_load_num_workgroups: {8150Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8151bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));8152emit_split_vector(ctx, dst, 3);8153break;8154}8155case nir_intrinsic_load_local_invocation_id: {8156Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8157bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));8158emit_split_vector(ctx, dst, 3);8159break;8160}8161case nir_intrinsic_load_workgroup_id: {8162Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8163struct ac_arg* args = ctx->args->ac.workgroup_ids;8164bld.pseudo(aco_opcode::p_create_vector, Definition(dst),8165args[0].used ? Operand(get_arg(ctx, args[0])) : Operand::zero(),8166args[1].used ? Operand(get_arg(ctx, args[1])) : Operand::zero(),8167args[2].used ? Operand(get_arg(ctx, args[2])) : Operand::zero());8168emit_split_vector(ctx, dst, 3);8169break;8170}8171case nir_intrinsic_load_local_invocation_index: {8172if (ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS) {8173bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8174get_arg(ctx, ctx->args->ac.vs_rel_patch_id));8175break;8176} else if (ctx->stage.hw == HWStage::GS || ctx->stage.hw == HWStage::NGG) {8177bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), thread_id_in_threadgroup(ctx));8178break;8179}81808181Temp id = emit_mbcnt(ctx, bld.tmp(v1));81828183/* The tg_size bits [6:11] contain the subgroup id,8184* we need this multiplied by the wave size, and then OR the thread id to it.8185*/8186if (ctx->program->wave_size == 64) {8187/* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just8188* feed that to v_or */8189Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),8190Operand::c32(0xfc0u), get_arg(ctx, ctx->args->ac.tg_size));8191bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num,8192id);8193} else {8194/* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */8195Temp tg_num =8196bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),8197get_arg(ctx, ctx->args->ac.tg_size), Operand::c32(0x6u | (0x6u << 16)));8198bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8199tg_num, Operand::c32(0x5u), id);8200}8201break;8202}8203case nir_intrinsic_load_subgroup_id: {8204if (ctx->stage == compute_cs) {8205bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8206bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size),8207Operand::c32(0x6u | (0x6u << 16)));8208} else if (ctx->stage.hw == HWStage::NGG) {8209/* Get the id of the current wave within the threadgroup (workgroup) */8210bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8211bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),8212Operand::c32(24u | (4u << 16)));8213} else {8214bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::zero());8215}8216break;8217}8218case nir_intrinsic_load_subgroup_invocation: {8219emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->dest.ssa));8220break;8221}8222case nir_intrinsic_load_num_subgroups: {8223if (ctx->stage == compute_cs)8224bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8225bld.def(s1, scc), Operand::c32(0x3fu), get_arg(ctx, ctx->args->ac.tg_size));8226else if (ctx->stage.hw == HWStage::NGG)8227bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8228bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),8229Operand::c32(28u | (4u << 16)));8230else8231bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x1u));8232break;8233}8234case nir_intrinsic_ballot: {8235Temp src = get_ssa_temp(ctx, instr->src[0].ssa);8236Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);82378238if (instr->src[0].ssa->bit_size == 1) {8239assert(src.regClass() == bld.lm);8240} else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {8241src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);8242} else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {8243src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);8244} else {8245isel_err(&instr->instr, "Unimplemented NIR instr bit size");8246}82478248/* Make sure that all inactive lanes return zero.8249* Value-numbering might remove the comparison above */8250src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);8251if (dst.size() != bld.lm.size()) {8252/* Wave32 with ballot size set to 64 */8253src =8254bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand::zero());8255}82568257emit_wqm(bld, src, dst);8258break;8259}8260case nir_intrinsic_shuffle:8261case nir_intrinsic_read_invocation: {8262Temp src = get_ssa_temp(ctx, instr->src[0].ssa);8263if (!nir_src_is_divergent(instr->src[0])) {8264emit_uniform_subgroup(ctx, instr, src);8265} else {8266Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);8267if (instr->intrinsic == nir_intrinsic_read_invocation ||8268!nir_src_is_divergent(instr->src[1]))8269tid = bld.as_uniform(tid);8270Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);82718272if (instr->dest.ssa.bit_size != 1)8273src = as_vgpr(ctx, src);82748275if (src.regClass() == v1b || src.regClass() == v2b) {8276Temp tmp = bld.tmp(v1);8277tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp);8278if (dst.type() == RegType::vgpr)8279bld.pseudo(aco_opcode::p_split_vector, Definition(dst),8280bld.def(src.regClass() == v1b ? v3b : v2b), tmp);8281else8282bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);8283} else if (src.regClass() == v1) {8284emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), dst);8285} else if (src.regClass() == v2) {8286Temp lo = bld.tmp(v1), hi = bld.tmp(v1);8287bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);8288lo = emit_wqm(bld, emit_bpermute(ctx, bld, tid, lo));8289hi = emit_wqm(bld, emit_bpermute(ctx, bld, tid, hi));8290bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);8291emit_split_vector(ctx, dst, 2);8292} else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {8293assert(src.regClass() == bld.lm);8294Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);8295bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);8296} else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {8297assert(src.regClass() == bld.lm);8298Temp tmp;8299if (ctx->program->chip_class <= GFX7)8300tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);8301else if (ctx->program->wave_size == 64)8302tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);8303else8304tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);8305tmp = emit_extract_vector(ctx, tmp, 0, v1);8306tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp);8307emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp),8308dst);8309} else {8310isel_err(&instr->instr, "Unimplemented NIR instr bit size");8311}8312}8313break;8314}8315case nir_intrinsic_load_sample_id: {8316bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8317get_arg(ctx, ctx->args->ac.ancillary), Operand::c32(8u), Operand::c32(4u));8318break;8319}8320case nir_intrinsic_load_sample_mask_in: {8321visit_load_sample_mask_in(ctx, instr);8322break;8323}8324case nir_intrinsic_read_first_invocation: {8325Temp src = get_ssa_temp(ctx, instr->src[0].ssa);8326Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8327if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {8328emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst);8329} else if (src.regClass() == v2) {8330Temp lo = bld.tmp(v1), hi = bld.tmp(v1);8331bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);8332lo = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));8333hi = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));8334bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);8335emit_split_vector(ctx, dst, 2);8336} else if (instr->dest.ssa.bit_size == 1) {8337assert(src.regClass() == bld.lm);8338Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,8339bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));8340bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);8341} else {8342bld.copy(Definition(dst), src);8343}8344break;8345}8346case nir_intrinsic_vote_all: {8347Temp src = get_ssa_temp(ctx, instr->src[0].ssa);8348Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8349assert(src.regClass() == bld.lm);8350assert(dst.regClass() == bld.lm);83518352Temp tmp =8353bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)8354.def(1)8355.getTemp();8356Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));8357bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);8358break;8359}8360case nir_intrinsic_vote_any: {8361Temp src = get_ssa_temp(ctx, instr->src[0].ssa);8362Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8363assert(src.regClass() == bld.lm);8364assert(dst.regClass() == bld.lm);83658366Temp tmp = bool_to_scalar_condition(ctx, src);8367bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);8368break;8369}8370case nir_intrinsic_reduce:8371case nir_intrinsic_inclusive_scan:8372case nir_intrinsic_exclusive_scan: {8373Temp src = get_ssa_temp(ctx, instr->src[0].ssa);8374Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8375nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);8376unsigned cluster_size =8377instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;8378cluster_size = util_next_power_of_two(8379MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));83808381if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size &&8382instr->dest.ssa.bit_size != 1) {8383/* We use divergence analysis to assign the regclass, so check if it's8384* working as expected */8385ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;8386if (instr->intrinsic == nir_intrinsic_inclusive_scan)8387expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor;8388assert(nir_dest_is_divergent(instr->dest) == expected_divergent);83898390if (instr->intrinsic == nir_intrinsic_reduce) {8391if (emit_uniform_reduce(ctx, instr))8392break;8393} else if (emit_uniform_scan(ctx, instr)) {8394break;8395}8396}83978398if (instr->dest.ssa.bit_size == 1) {8399if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)8400op = nir_op_iand;8401else if (op == nir_op_iadd)8402op = nir_op_ixor;8403else if (op == nir_op_umax || op == nir_op_imax)8404op = nir_op_ior;8405assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);84068407switch (instr->intrinsic) {8408case nir_intrinsic_reduce:8409emit_wqm(bld, emit_boolean_reduce(ctx, op, cluster_size, src), dst);8410break;8411case nir_intrinsic_exclusive_scan:8412emit_wqm(bld, emit_boolean_exclusive_scan(ctx, op, src), dst);8413break;8414case nir_intrinsic_inclusive_scan:8415emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst);8416break;8417default: assert(false);8418}8419} else if (cluster_size == 1) {8420bld.copy(Definition(dst), src);8421} else {8422unsigned bit_size = instr->src[0].ssa->bit_size;84238424src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));84258426ReduceOp reduce_op = get_reduce_op(op, bit_size);84278428aco_opcode aco_op;8429switch (instr->intrinsic) {8430case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;8431case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;8432case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;8433default: unreachable("unknown reduce intrinsic");8434}84358436Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size,8437bld.def(dst.regClass()), src);8438emit_wqm(bld, tmp_dst, dst);8439}8440break;8441}8442case nir_intrinsic_quad_broadcast: {8443Temp src = get_ssa_temp(ctx, instr->src[0].ssa);8444if (!nir_dest_is_divergent(instr->dest)) {8445emit_uniform_subgroup(ctx, instr, src);8446} else {8447Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8448unsigned lane = nir_src_as_const_value(instr->src[1])->u32;8449uint32_t dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);84508451if (instr->dest.ssa.bit_size != 1)8452src = as_vgpr(ctx, src);84538454if (instr->dest.ssa.bit_size == 1) {8455assert(src.regClass() == bld.lm);8456assert(dst.regClass() == bld.lm);8457uint32_t half_mask = 0x11111111u << lane;8458Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),8459Operand::c32(half_mask), Operand::c32(half_mask));8460Temp tmp = bld.tmp(bld.lm);8461bld.sop1(Builder::s_wqm, Definition(tmp),8462bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,8463bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src,8464Operand(exec, bld.lm))));8465emit_wqm(bld, tmp, dst);8466} else if (instr->dest.ssa.bit_size == 8) {8467Temp tmp = bld.tmp(v1);8468if (ctx->program->chip_class >= GFX8)8469emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);8470else8471emit_wqm(bld,8472bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl),8473tmp);8474bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);8475} else if (instr->dest.ssa.bit_size == 16) {8476Temp tmp = bld.tmp(v1);8477if (ctx->program->chip_class >= GFX8)8478emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);8479else8480emit_wqm(bld,8481bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl),8482tmp);8483bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);8484} else if (instr->dest.ssa.bit_size == 32) {8485if (ctx->program->chip_class >= GFX8)8486emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst);8487else8488emit_wqm(bld,8489bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl),8490dst);8491} else if (instr->dest.ssa.bit_size == 64) {8492Temp lo = bld.tmp(v1), hi = bld.tmp(v1);8493bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);8494if (ctx->program->chip_class >= GFX8) {8495lo = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));8496hi = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));8497} else {8498lo = emit_wqm(8499bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl));8500hi = emit_wqm(8501bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl));8502}8503bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);8504emit_split_vector(ctx, dst, 2);8505} else {8506isel_err(&instr->instr, "Unimplemented NIR instr bit size");8507}8508}8509break;8510}8511case nir_intrinsic_quad_swap_horizontal:8512case nir_intrinsic_quad_swap_vertical:8513case nir_intrinsic_quad_swap_diagonal:8514case nir_intrinsic_quad_swizzle_amd: {8515Temp src = get_ssa_temp(ctx, instr->src[0].ssa);8516if (!nir_dest_is_divergent(instr->dest)) {8517emit_uniform_subgroup(ctx, instr, src);8518break;8519}8520uint16_t dpp_ctrl = 0;8521switch (instr->intrinsic) {8522case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;8523case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;8524case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;8525case nir_intrinsic_quad_swizzle_amd: dpp_ctrl = nir_intrinsic_swizzle_mask(instr); break;8526default: break;8527}8528if (ctx->program->chip_class < GFX8)8529dpp_ctrl |= (1 << 15);85308531Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);85328533if (instr->dest.ssa.bit_size != 1)8534src = as_vgpr(ctx, src);85358536if (instr->dest.ssa.bit_size == 1) {8537assert(src.regClass() == bld.lm);8538src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),8539Operand::c32(-1), src);8540if (ctx->program->chip_class >= GFX8)8541src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);8542else8543src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);8544Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);8545emit_wqm(bld, tmp, dst);8546} else if (instr->dest.ssa.bit_size == 8) {8547Temp tmp = bld.tmp(v1);8548if (ctx->program->chip_class >= GFX8)8549emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);8550else8551emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);8552bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);8553} else if (instr->dest.ssa.bit_size == 16) {8554Temp tmp = bld.tmp(v1);8555if (ctx->program->chip_class >= GFX8)8556emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);8557else8558emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);8559bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);8560} else if (instr->dest.ssa.bit_size == 32) {8561Temp tmp;8562if (ctx->program->chip_class >= GFX8)8563tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);8564else8565tmp = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);8566emit_wqm(bld, tmp, dst);8567} else if (instr->dest.ssa.bit_size == 64) {8568Temp lo = bld.tmp(v1), hi = bld.tmp(v1);8569bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);8570if (ctx->program->chip_class >= GFX8) {8571lo = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));8572hi = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));8573} else {8574lo = emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, dpp_ctrl));8575hi = emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, dpp_ctrl));8576}8577bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);8578emit_split_vector(ctx, dst, 2);8579} else {8580isel_err(&instr->instr, "Unimplemented NIR instr bit size");8581}8582break;8583}8584case nir_intrinsic_masked_swizzle_amd: {8585Temp src = get_ssa_temp(ctx, instr->src[0].ssa);8586if (!nir_dest_is_divergent(instr->dest)) {8587emit_uniform_subgroup(ctx, instr, src);8588break;8589}8590Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8591uint32_t mask = nir_intrinsic_swizzle_mask(instr);85928593if (instr->dest.ssa.bit_size != 1)8594src = as_vgpr(ctx, src);85958596if (instr->dest.ssa.bit_size == 1) {8597assert(src.regClass() == bld.lm);8598src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),8599Operand::c32(-1), src);8600src = emit_masked_swizzle(ctx, bld, src, mask);8601Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);8602emit_wqm(bld, tmp, dst);8603} else if (dst.regClass() == v1b) {8604Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));8605emit_extract_vector(ctx, tmp, 0, dst);8606} else if (dst.regClass() == v2b) {8607Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));8608emit_extract_vector(ctx, tmp, 0, dst);8609} else if (dst.regClass() == v1) {8610emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask), dst);8611} else if (dst.regClass() == v2) {8612Temp lo = bld.tmp(v1), hi = bld.tmp(v1);8613bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);8614lo = emit_wqm(bld, emit_masked_swizzle(ctx, bld, lo, mask));8615hi = emit_wqm(bld, emit_masked_swizzle(ctx, bld, hi, mask));8616bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);8617emit_split_vector(ctx, dst, 2);8618} else {8619isel_err(&instr->instr, "Unimplemented NIR instr bit size");8620}8621break;8622}8623case nir_intrinsic_write_invocation_amd: {8624Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));8625Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));8626Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));8627Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8628if (dst.regClass() == v1) {8629/* src2 is ignored for writelane. RA assigns the same reg for dst */8630emit_wqm(bld, bld.writelane(bld.def(v1), val, lane, src), dst);8631} else if (dst.regClass() == v2) {8632Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);8633Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);8634bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);8635bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);8636Temp lo = emit_wqm(bld, bld.writelane(bld.def(v1), val_lo, lane, src_hi));8637Temp hi = emit_wqm(bld, bld.writelane(bld.def(v1), val_hi, lane, src_hi));8638bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);8639emit_split_vector(ctx, dst, 2);8640} else {8641isel_err(&instr->instr, "Unimplemented NIR instr bit size");8642}8643break;8644}8645case nir_intrinsic_mbcnt_amd: {8646Temp src = get_ssa_temp(ctx, instr->src[0].ssa);8647Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));8648Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8649/* Fit 64-bit mask for wave32 */8650src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));8651Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src), Operand(add_src));8652emit_wqm(bld, wqm_tmp, dst);8653break;8654}8655case nir_intrinsic_byte_permute_amd: {8656Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8657assert(dst.regClass() == v1);8658assert(ctx->program->chip_class >= GFX8);8659bld.vop3(aco_opcode::v_perm_b32, Definition(dst), get_ssa_temp(ctx, instr->src[0].ssa),8660as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)),8661as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));8662break;8663}8664case nir_intrinsic_lane_permute_16_amd: {8665Temp src = get_ssa_temp(ctx, instr->src[0].ssa);8666Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8667assert(ctx->program->chip_class >= GFX10);86688669if (src.regClass() == s1) {8670bld.copy(Definition(dst), src);8671} else if (dst.regClass() == v1 && src.regClass() == v1) {8672bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,8673bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),8674bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));8675} else {8676isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");8677}8678break;8679}8680case nir_intrinsic_load_helper_invocation:8681case nir_intrinsic_is_helper_invocation: {8682/* load_helper() after demote() get lowered to is_helper().8683* Otherwise, these two behave the same. */8684Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8685bld.pseudo(aco_opcode::p_is_helper, Definition(dst));8686ctx->block->kind |= block_kind_needs_lowering;8687ctx->program->needs_exact = true;8688break;8689}8690case nir_intrinsic_demote:8691bld.pseudo(aco_opcode::p_demote_to_helper, Operand::c32(-1u));86928693if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)8694ctx->cf_info.exec_potentially_empty_discard = true;8695ctx->block->kind |= block_kind_uses_demote;8696ctx->program->needs_exact = true;8697break;8698case nir_intrinsic_demote_if: {8699Temp src = get_ssa_temp(ctx, instr->src[0].ssa);8700assert(src.regClass() == bld.lm);8701Temp cond =8702bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));8703bld.pseudo(aco_opcode::p_demote_to_helper, cond);87048705if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)8706ctx->cf_info.exec_potentially_empty_discard = true;8707ctx->block->kind |= block_kind_uses_demote;8708ctx->program->needs_exact = true;8709break;8710}8711case nir_intrinsic_first_invocation: {8712emit_wqm(bld, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),8713get_ssa_temp(ctx, &instr->dest.ssa));8714break;8715}8716case nir_intrinsic_last_invocation: {8717Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));8718Temp last = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc),8719Operand::c32(ctx->program->wave_size - 1u), flbit);8720emit_wqm(bld, last, get_ssa_temp(ctx, &instr->dest.ssa));8721break;8722}8723case nir_intrinsic_elect: {8724Temp first = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));8725emit_wqm(8726bld, bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc), Operand::c32(1u), first),8727get_ssa_temp(ctx, &instr->dest.ssa));8728break;8729}8730case nir_intrinsic_shader_clock: {8731Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8732if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP &&8733ctx->options->chip_class >= GFX10_3) {8734/* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */8735Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);8736bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());8737} else {8738aco_opcode opcode = nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE8739? aco_opcode::s_memrealtime8740: aco_opcode::s_memtime;8741bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));8742}8743emit_split_vector(ctx, dst, 2);8744break;8745}8746case nir_intrinsic_load_vertex_id_zero_base: {8747Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8748bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));8749break;8750}8751case nir_intrinsic_load_first_vertex: {8752Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8753bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));8754break;8755}8756case nir_intrinsic_load_base_instance: {8757Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8758bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));8759break;8760}8761case nir_intrinsic_load_instance_id: {8762Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8763bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));8764break;8765}8766case nir_intrinsic_load_draw_id: {8767Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8768bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));8769break;8770}8771case nir_intrinsic_load_invocation_id: {8772Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);87738774if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {8775if (ctx->options->chip_class >= GFX10)8776bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),8777get_arg(ctx, ctx->args->ac.gs_invocation_id));8778else8779bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));8780} else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {8781bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->ac.tcs_rel_ids),8782Operand::c32(8u), Operand::c32(5u));8783} else {8784unreachable("Unsupported stage for load_invocation_id");8785}87868787break;8788}8789case nir_intrinsic_load_primitive_id: {8790Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);87918792switch (ctx->shader->info.stage) {8793case MESA_SHADER_GEOMETRY:8794bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));8795break;8796case MESA_SHADER_TESS_CTRL:8797bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));8798break;8799case MESA_SHADER_TESS_EVAL:8800bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));8801break;8802default:8803if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) {8804/* In case of NGG, the GS threads always have the primitive ID8805* even if there is no SW GS. */8806bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));8807break;8808}8809unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");8810}88118812break;8813}8814case nir_intrinsic_load_patch_vertices_in: {8815assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||8816ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);88178818Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);8819bld.copy(Definition(dst), Operand::c32(ctx->args->options->key.tcs.input_vertices));8820break;8821}8822case nir_intrinsic_emit_vertex_with_counter: {8823assert(ctx->stage.hw == HWStage::GS);8824visit_emit_vertex_with_counter(ctx, instr);8825break;8826}8827case nir_intrinsic_end_primitive_with_counter: {8828if (ctx->stage.hw != HWStage::NGG) {8829unsigned stream = nir_intrinsic_stream_id(instr);8830bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1,8831sendmsg_gs(true, false, stream));8832}8833break;8834}8835case nir_intrinsic_set_vertex_and_primitive_count: {8836assert(ctx->stage.hw == HWStage::GS);8837/* unused in the legacy pipeline, the HW keeps track of this for us */8838break;8839}8840case nir_intrinsic_load_tess_rel_patch_id_amd: {8841bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_tess_rel_patch_id(ctx));8842break;8843}8844case nir_intrinsic_load_ring_tess_factors_amd: {8845bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8846ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_FACTOR * 16u));8847break;8848}8849case nir_intrinsic_load_ring_tess_factors_offset_amd: {8850bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8851get_arg(ctx, ctx->args->ac.tcs_factor_offset));8852break;8853}8854case nir_intrinsic_load_ring_tess_offchip_amd: {8855bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8856ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_OFFCHIP * 16u));8857break;8858}8859case nir_intrinsic_load_ring_tess_offchip_offset_amd: {8860bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8861get_arg(ctx, ctx->args->ac.tess_offchip_offset));8862break;8863}8864case nir_intrinsic_load_ring_esgs_amd: {8865unsigned ring = ctx->stage.hw == HWStage::ES ? RING_ESGS_VS : RING_ESGS_GS;8866bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8867ctx->program->private_segment_buffer, Operand::c32(ring * 16u));8868break;8869}8870case nir_intrinsic_load_ring_es2gs_offset_amd: {8871bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8872get_arg(ctx, ctx->args->ac.es2gs_offset));8873break;8874}8875case nir_intrinsic_load_gs_vertex_offset_amd: {8876unsigned b = nir_intrinsic_base(instr);8877bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8878get_arg(ctx, ctx->args->ac.gs_vtx_offset[b]));8879break;8880}8881case nir_intrinsic_has_input_vertex_amd:8882case nir_intrinsic_has_input_primitive_amd: {8883assert(ctx->stage.hw == HWStage::NGG);8884unsigned i = instr->intrinsic == nir_intrinsic_has_input_vertex_amd ? 0 : 1;8885bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), merged_wave_info_to_mask(ctx, i));8886break;8887}8888case nir_intrinsic_load_workgroup_num_input_vertices_amd:8889case nir_intrinsic_load_workgroup_num_input_primitives_amd: {8890assert(ctx->stage.hw == HWStage::NGG);8891unsigned pos =8892instr->intrinsic == nir_intrinsic_load_workgroup_num_input_vertices_amd ? 12 : 22;8893bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8894bld.def(s1, scc), get_arg(ctx, ctx->args->ac.gs_tg_info),8895Operand::c32(pos | (9u << 16u)));8896break;8897}8898case nir_intrinsic_load_initial_edgeflag_amd: {8899assert(ctx->stage.hw == HWStage::NGG);8900assert(nir_src_is_const(instr->src[0]));8901unsigned i = nir_src_as_uint(instr->src[0]);89028903Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);8904bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8905gs_invocation_id, Operand::c32(8u + i), Operand::c32(1u));8906break;8907}8908case nir_intrinsic_load_packed_passthrough_primitive_amd: {8909bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8910get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]));8911break;8912}8913case nir_intrinsic_export_vertex_amd: {8914ctx->block->kind |= block_kind_export_end;8915create_vs_exports(ctx);8916break;8917}8918case nir_intrinsic_export_primitive_amd: {8919assert(ctx->stage.hw == HWStage::NGG);8920Temp prim_exp_arg = get_ssa_temp(ctx, instr->src[0].ssa);8921bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),89221 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */,8923true /* done */, false /* valid mask */);8924break;8925}8926case nir_intrinsic_alloc_vertices_and_primitives_amd: {8927assert(ctx->stage.hw == HWStage::NGG);8928Temp num_vertices = get_ssa_temp(ctx, instr->src[0].ssa);8929Temp num_primitives = get_ssa_temp(ctx, instr->src[1].ssa);8930ngg_emit_sendmsg_gs_alloc_req(ctx, num_vertices, num_primitives);8931break;8932}8933case nir_intrinsic_gds_atomic_add_amd: {8934Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);8935Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);8936Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);8937Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));8938bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,8939true);8940break;8941}8942case nir_intrinsic_load_shader_query_enabled_amd: {8943unsigned cmp_bit = 0;8944Temp shader_query_enabled =8945bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),8946get_arg(ctx, ctx->args->ngg_gs_state), Operand::c32(cmp_bit));8947bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8948bool_to_vector_condition(ctx, shader_query_enabled));8949break;8950}8951case nir_intrinsic_load_cull_front_face_enabled_amd:8952case nir_intrinsic_load_cull_back_face_enabled_amd:8953case nir_intrinsic_load_cull_ccw_amd:8954case nir_intrinsic_load_cull_small_primitives_enabled_amd: {8955unsigned cmp_bit;8956if (instr->intrinsic == nir_intrinsic_load_cull_front_face_enabled_amd)8957cmp_bit = 0;8958else if (instr->intrinsic == nir_intrinsic_load_cull_back_face_enabled_amd)8959cmp_bit = 1;8960else if (instr->intrinsic == nir_intrinsic_load_cull_ccw_amd)8961cmp_bit = 2;8962else if (instr->intrinsic == nir_intrinsic_load_cull_small_primitives_enabled_amd)8963cmp_bit = 3;8964else8965unreachable("unimplemented culling intrinsic");89668967Builder::Result enabled =8968bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),8969get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(cmp_bit));8970enabled.instr->definitions[0].setNoCSE(true);8971bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8972bool_to_vector_condition(ctx, enabled));8973break;8974}8975case nir_intrinsic_load_sbt_amd: visit_load_sbt_amd(ctx, instr); break;8976case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;8977case nir_intrinsic_load_cull_any_enabled_amd: {8978Builder::Result cull_any_enabled =8979bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),8980get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(0x00ffffffu));8981cull_any_enabled.instr->definitions[1].setNoCSE(true);8982bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8983bool_to_vector_condition(ctx, cull_any_enabled.def(1).getTemp()));8984break;8985}8986case nir_intrinsic_load_cull_small_prim_precision_amd: {8987/* Exponent is 8-bit signed int, move that into a signed 32-bit int. */8988Temp exponent = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc),8989get_arg(ctx, ctx->args->ngg_gs_state), Operand::c32(24u));8990/* small_prim_precision = 1.0 * 2^X */8991bld.vop3(aco_opcode::v_ldexp_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8992Operand::c32(0x3f800000u), Operand(exponent));8993break;8994}8995case nir_intrinsic_load_viewport_x_scale: {8996bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),8997get_arg(ctx, ctx->args->ngg_viewport_scale[0]));8998break;8999}9000case nir_intrinsic_load_viewport_y_scale: {9001bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),9002get_arg(ctx, ctx->args->ngg_viewport_scale[1]));9003break;9004}9005case nir_intrinsic_load_viewport_x_offset: {9006bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),9007get_arg(ctx, ctx->args->ngg_viewport_translate[0]));9008break;9009}9010case nir_intrinsic_load_viewport_y_offset: {9011bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),9012get_arg(ctx, ctx->args->ngg_viewport_translate[1]));9013break;9014}9015case nir_intrinsic_overwrite_vs_arguments_amd: {9016ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);9017ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);9018break;9019}9020case nir_intrinsic_overwrite_tes_arguments_amd: {9021ctx->arg_temps[ctx->args->ac.tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);9022ctx->arg_temps[ctx->args->ac.tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);9023ctx->arg_temps[ctx->args->ac.tes_rel_patch_id.arg_index] =9024get_ssa_temp(ctx, instr->src[2].ssa);9025ctx->arg_temps[ctx->args->ac.tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);9026break;9027}9028case nir_intrinsic_overwrite_subgroup_num_vertices_and_primitives_amd: {9029Temp old_merged_wave_info = get_arg(ctx, ctx->args->ac.merged_wave_info);9030Temp num_vertices = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));9031Temp num_primitives = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));9032Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), num_primitives,9033Operand::c32(8u));9034tmp = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), tmp, num_vertices);9035ctx->arg_temps[ctx->args->ac.merged_wave_info.arg_index] =9036bld.sop2(aco_opcode::s_pack_lh_b32_b16, bld.def(s1), tmp, old_merged_wave_info);9037break;9038}9039default:9040isel_err(&instr->instr, "Unimplemented intrinsic instr");9041abort();90429043break;9044}9045}90469047void9048tex_fetch_ptrs(isel_context* ctx, nir_tex_instr* instr, Temp* res_ptr, Temp* samp_ptr,9049Temp* fmask_ptr, enum glsl_base_type* stype)9050{9051nir_deref_instr* texture_deref_instr = NULL;9052nir_deref_instr* sampler_deref_instr = NULL;9053int plane = -1;90549055for (unsigned i = 0; i < instr->num_srcs; i++) {9056switch (instr->src[i].src_type) {9057case nir_tex_src_texture_deref:9058texture_deref_instr = nir_src_as_deref(instr->src[i].src);9059break;9060case nir_tex_src_sampler_deref:9061sampler_deref_instr = nir_src_as_deref(instr->src[i].src);9062break;9063case nir_tex_src_plane: plane = nir_src_as_int(instr->src[i].src); break;9064default: break;9065}9066}90679068*stype = glsl_get_sampler_result_type(texture_deref_instr->type);90699070if (!sampler_deref_instr)9071sampler_deref_instr = texture_deref_instr;90729073if (plane >= 0) {9074assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical);9075assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);9076*res_ptr = get_sampler_desc(ctx, texture_deref_instr,9077(aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false);9078} else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {9079*res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false);9080} else if (instr->op == nir_texop_fragment_mask_fetch) {9081*res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false);9082} else {9083*res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false);9084}9085if (samp_ptr) {9086*samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false);90879088if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {9089/* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */9090Builder bld(ctx->program, ctx->block);90919092/* to avoid unnecessary moves, we split and recombine sampler and image */9093Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),9094bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};9095Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};9096bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),9097Definition(img[2]), Definition(img[3]), Definition(img[4]), Definition(img[5]),9098Definition(img[6]), Definition(img[7]), *res_ptr);9099bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),9100Definition(samp[2]), Definition(samp[3]), *samp_ptr);91019102samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);9103*res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), img[0], img[1], img[2],9104img[3], img[4], img[5], img[6], img[7]);9105*samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), samp[0], samp[1], samp[2],9106samp[3]);9107}9108}9109if (fmask_ptr && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_samples_identical))9110*fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false);9111}91129113void9114build_cube_select(isel_context* ctx, Temp ma, Temp id, Temp deriv, Temp* out_ma, Temp* out_sc,9115Temp* out_tc)9116{9117Builder bld(ctx->program, ctx->block);91189119Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);9120Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);9121Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);91229123Operand neg_one = Operand::c32(0xbf800000u);9124Operand one = Operand::c32(0x3f800000u);9125Operand two = Operand::c32(0x40000000u);9126Operand four = Operand::c32(0x40800000u);91279128Temp is_ma_positive =9129bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), ma);9130Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);9131Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::zero(), sgn_ma);91329133Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);9134Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);9135is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);9136Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)),9137bld.def(s1, scc), is_ma_z, is_ma_y);91389139/* select sc */9140Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);9141Temp sgn = bld.vop2_e64(9142aco_opcode::v_cndmask_b32, bld.def(v1),9143bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), one, is_ma_y);9144*out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);91459146/* select tc */9147tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);9148sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);9149*out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);91509151/* select ma */9152tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),9153bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),9154deriv_z, is_ma_z);9155tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffffu), tmp);9156*out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);9157}91589159void9160prepare_cube_coords(isel_context* ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy,9161bool is_deriv, bool is_array)9162{9163Builder bld(ctx->program, ctx->block);9164Temp ma, tc, sc, id;9165aco_opcode madak =9166ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32;9167aco_opcode madmk =9168ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32;91699170if (is_array) {9171coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);91729173/* see comment in ac_prepare_cube_coords() */9174if (ctx->options->chip_class <= GFX8)9175coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), coords[3]);9176}91779178ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);91799180aco_ptr<VOP3_instruction> vop3a{9181create_instruction<VOP3_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};9182vop3a->operands[0] = Operand(ma);9183vop3a->abs[0] = true;9184Temp invma = bld.tmp(v1);9185vop3a->definitions[0] = Definition(invma);9186ctx->block->instructions.emplace_back(std::move(vop3a));91879188sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);9189if (!is_deriv)9190sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand::c32(0x3fc00000u /*1.5*/));91919192tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);9193if (!is_deriv)9194tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand::c32(0x3fc00000u /*1.5*/));91959196id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);91979198if (is_deriv) {9199sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);9200tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);92019202for (unsigned i = 0; i < 2; i++) {9203/* see comment in ac_prepare_cube_coords() */9204Temp deriv_ma;9205Temp deriv_sc, deriv_tc;9206build_cube_select(ctx, ma, id, i ? *ddy : *ddx, &deriv_ma, &deriv_sc, &deriv_tc);92079208deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);92099210Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),9211bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),9212bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));9213Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),9214bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),9215bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));9216*(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);9217}92189219sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), sc);9220tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), tc);9221}92229223if (is_array)9224id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand::c32(0x41000000u /*8.0*/));9225coords.resize(3);9226coords[0] = sc;9227coords[1] = tc;9228coords[2] = id;9229}92309231void9232get_const_vec(nir_ssa_def* vec, nir_const_value* cv[4])9233{9234if (vec->parent_instr->type != nir_instr_type_alu)9235return;9236nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);9237if (vec_instr->op != nir_op_vec(vec->num_components))9238return;92399240for (unsigned i = 0; i < vec->num_components; i++) {9241cv[i] =9242vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;9243}9244}92459246void9247visit_tex(isel_context* ctx, nir_tex_instr* instr)9248{9249Builder bld(ctx->program, ctx->block);9250bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,9251has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,9252has_sample_index = false, has_clamped_lod = false;9253Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),9254lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(),9255clamped_lod = Temp();9256std::vector<Temp> coords;9257std::vector<Temp> derivs;9258nir_const_value* sample_index_cv = NULL;9259nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};9260enum glsl_base_type stype;9261tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);92629263bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&9264(stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);9265bool tg4_integer_cube_workaround =9266tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;92679268for (unsigned i = 0; i < instr->num_srcs; i++) {9269switch (instr->src[i].src_type) {9270case nir_tex_src_coord: {9271Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);9272for (unsigned j = 0; j < coord.size(); j++)9273coords.emplace_back(emit_extract_vector(ctx, coord, j, v1));9274break;9275}9276case nir_tex_src_bias:9277bias = get_ssa_temp(ctx, instr->src[i].src.ssa);9278has_bias = true;9279break;9280case nir_tex_src_lod: {9281if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {9282level_zero = true;9283} else {9284lod = get_ssa_temp(ctx, instr->src[i].src.ssa);9285has_lod = true;9286}9287break;9288}9289case nir_tex_src_min_lod:9290clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);9291has_clamped_lod = true;9292break;9293case nir_tex_src_comparator:9294if (instr->is_shadow) {9295compare = get_ssa_temp(ctx, instr->src[i].src.ssa);9296has_compare = true;9297}9298break;9299case nir_tex_src_offset:9300offset = get_ssa_temp(ctx, instr->src[i].src.ssa);9301get_const_vec(instr->src[i].src.ssa, const_offset);9302has_offset = true;9303break;9304case nir_tex_src_ddx:9305ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);9306has_ddx = true;9307break;9308case nir_tex_src_ddy:9309ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);9310has_ddy = true;9311break;9312case nir_tex_src_ms_index:9313sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);9314sample_index_cv = nir_src_as_const_value(instr->src[i].src);9315has_sample_index = true;9316break;9317case nir_tex_src_texture_offset:9318case nir_tex_src_sampler_offset:9319default: break;9320}9321}93229323if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)9324return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa));93259326if (instr->op == nir_texop_texture_samples) {9327get_image_samples(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), resource);9328return;9329}93309331if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {9332aco_ptr<Instruction> tmp_instr;9333Temp acc, pack = Temp();93349335uint32_t pack_const = 0;9336for (unsigned i = 0; i < offset.size(); i++) {9337if (!const_offset[i])9338continue;9339pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);9340}93419342if (offset.type() == RegType::sgpr) {9343for (unsigned i = 0; i < offset.size(); i++) {9344if (const_offset[i])9345continue;93469347acc = emit_extract_vector(ctx, offset, i, s1);9348acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,9349Operand::c32(0x3Fu));93509351if (i) {9352acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,9353Operand::c32(8u * i));9354}93559356if (pack == Temp()) {9357pack = acc;9358} else {9359pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);9360}9361}93629363if (pack_const && pack != Temp())9364pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),9365Operand::c32(pack_const), pack);9366} else {9367for (unsigned i = 0; i < offset.size(); i++) {9368if (const_offset[i])9369continue;93709371acc = emit_extract_vector(ctx, offset, i, v1);9372acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);93739374if (i) {9375acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);9376}93779378if (pack == Temp()) {9379pack = acc;9380} else {9381pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);9382}9383}93849385if (pack_const && pack != Temp())9386pack = bld.sop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);9387}9388if (pack_const && pack == Temp())9389offset = bld.copy(bld.def(v1), Operand::c32(pack_const));9390else if (pack == Temp())9391has_offset = false;9392else9393offset = pack;9394}93959396if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)9397prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd,9398instr->is_array && instr->op != nir_texop_lod);93999400/* pack derivatives */9401if (has_ddx || has_ddy) {9402if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {9403assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);9404Temp zero = bld.copy(bld.def(v1), Operand::zero());9405derivs = {ddx, zero, ddy, zero};9406} else {9407for (unsigned i = 0; has_ddx && i < ddx.size(); i++)9408derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));9409for (unsigned i = 0; has_ddy && i < ddy.size(); i++)9410derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));9411}9412has_derivs = true;9413}94149415if (instr->coord_components > 1 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&9416instr->is_array && instr->op != nir_texop_txf)9417coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);94189419if (instr->coord_components > 2 &&9420(instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||9421instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||9422instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&9423instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms &&9424instr->op != nir_texop_fragment_fetch && instr->op != nir_texop_fragment_mask_fetch)9425coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);94269427if (ctx->options->chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&9428instr->op != nir_texop_lod && instr->coord_components) {9429assert(coords.size() > 0 && coords.size() < 3);94309431coords.insert(std::next(coords.begin()),9432bld.copy(bld.def(v1), instr->op == nir_texop_txf ? Operand::c32(0)9433: Operand::c32(0x3f000000)));9434}94359436bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);94379438if (instr->op == nir_texop_samples_identical)9439resource = fmask_ptr;94409441else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||9442instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&9443instr->op != nir_texop_txs && instr->op != nir_texop_fragment_fetch &&9444instr->op != nir_texop_fragment_mask_fetch) {9445assert(has_sample_index);9446Operand op(sample_index);9447if (sample_index_cv)9448op = Operand::c32(sample_index_cv->u32);9449sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);9450}94519452if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {9453for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {9454Temp off = emit_extract_vector(ctx, offset, i, v1);9455coords[i] = bld.vadd32(bld.def(v1), coords[i], off);9456}9457has_offset = false;9458}94599460/* Build tex instruction */9461unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa) & 0xf;9462if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)9463dmask = u_bit_consecutive(0, util_last_bit(dmask));9464if (instr->is_sparse)9465dmask = MAX2(dmask, 1) | 0x10;9466unsigned dim =9467ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF9468? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)9469: 0;9470Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);9471Temp tmp_dst = dst;94729473/* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */9474if (instr->op == nir_texop_tg4) {9475assert(instr->dest.ssa.num_components == (4 + instr->is_sparse));9476if (instr->is_shadow)9477dmask = 1;9478else9479dmask = 1 << instr->component;9480if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)9481tmp_dst = bld.tmp(instr->is_sparse ? v5 : v4);9482} else if (instr->op == nir_texop_samples_identical) {9483tmp_dst = bld.tmp(v1);9484} else if (util_bitcount(dmask) != instr->dest.ssa.num_components ||9485dst.type() == RegType::sgpr) {9486tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));9487}94889489if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {9490if (!has_lod)9491lod = bld.copy(bld.def(v1), Operand::zero());94929493bool div_by_6 = instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&9494instr->is_array && (dmask & (1 << 2));9495if (tmp_dst.id() == dst.id() && div_by_6)9496tmp_dst = bld.tmp(tmp_dst.regClass());94979498MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(tmp_dst),9499resource, Operand(s4), std::vector<Temp>{lod});9500if (ctx->options->chip_class == GFX9 && instr->op == nir_texop_txs &&9501instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) {9502tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);9503} else if (instr->op == nir_texop_query_levels) {9504tex->dmask = 1 << 3;9505} else {9506tex->dmask = dmask;9507}9508tex->da = da;9509tex->dim = dim;95109511if (div_by_6) {9512/* divide 3rd value by 6 by multiplying with magic number */9513emit_split_vector(ctx, tmp_dst, tmp_dst.size());9514Temp c = bld.copy(bld.def(s1), Operand::c32(0x2AAAAAAB));9515Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1),9516emit_extract_vector(ctx, tmp_dst, 2, v1), c);9517assert(instr->dest.ssa.num_components == 3);9518Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);9519tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),9520emit_extract_vector(ctx, tmp_dst, 0, v1),9521emit_extract_vector(ctx, tmp_dst, 1, v1), by_6);9522}95239524expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);9525return;9526}95279528Temp tg4_compare_cube_wa64 = Temp();95299530if (tg4_integer_workarounds) {9531Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());9532Temp size = bld.tmp(v2);9533MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(size),9534resource, Operand(s4), std::vector<Temp>{tg4_lod});9535tex->dim = dim;9536tex->dmask = 0x3;9537tex->da = da;9538emit_split_vector(ctx, size, size.size());95399540Temp half_texel[2];9541for (unsigned i = 0; i < 2; i++) {9542half_texel[i] = emit_extract_vector(ctx, size, i, v1);9543half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);9544half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);9545half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),9546Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);9547}95489549if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {9550/* In vulkan, whether the sampler uses unnormalized9551* coordinates or not is a dynamic property of the9552* sampler. Hence, to figure out whether or not we9553* need to divide by the texture size, we need to test9554* the sampler at runtime. This tests the bit set by9555* radv_init_sampler().9556*/9557unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;9558Temp not_needed =9559bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand::c32(bit_idx));95609561not_needed = bool_to_vector_condition(ctx, not_needed);9562half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),9563Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);9564half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),9565Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);9566}95679568Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),9569bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};95709571if (tg4_integer_cube_workaround) {9572/* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */9573Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));9574aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(9575aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};9576split->operands[0] = Operand(resource);9577for (unsigned i = 0; i < resource.size(); i++) {9578desc[i] = bld.tmp(s1);9579split->definitions[i] = Definition(desc[i]);9580}9581ctx->block->instructions.emplace_back(std::move(split));95829583Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],9584Operand::c32(20u | (6u << 16)));9585Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,9586Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));95879588Temp nfmt;9589if (stype == GLSL_TYPE_UINT) {9590nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),9591Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),9592Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));9593} else {9594nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),9595Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),9596Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));9597}9598tg4_compare_cube_wa64 = bld.tmp(bld.lm);9599bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);96009601nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,9602Operand::c32(26u));96039604desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],9605Operand::c32(C_008F14_NUM_FORMAT));9606desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);96079608aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(9609aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};9610for (unsigned i = 0; i < resource.size(); i++)9611vec->operands[i] = Operand(desc[i]);9612resource = bld.tmp(resource.regClass());9613vec->definitions[0] = Definition(resource);9614ctx->block->instructions.emplace_back(std::move(vec));96159616new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],9617tg4_compare_cube_wa64);9618new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],9619tg4_compare_cube_wa64);9620}9621coords[0] = new_coords[0];9622coords[1] = new_coords[1];9623}96249625if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {9626// FIXME: if (ctx->abi->gfx9_stride_size_workaround) return9627// ac_build_buffer_load_format_gfx9_safe()96289629assert(coords.size() == 1);9630aco_opcode op;9631switch (util_last_bit(dmask & 0xf)) {9632case 1: op = aco_opcode::buffer_load_format_x; break;9633case 2: op = aco_opcode::buffer_load_format_xy; break;9634case 3: op = aco_opcode::buffer_load_format_xyz; break;9635case 4: op = aco_opcode::buffer_load_format_xyzw; break;9636default: unreachable("Tex instruction loads more than 4 components.");9637}96389639aco_ptr<MUBUF_instruction> mubuf{9640create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)};9641mubuf->operands[0] = Operand(resource);9642mubuf->operands[1] = Operand(coords[0]);9643mubuf->operands[2] = Operand::c32(0);9644mubuf->definitions[0] = Definition(tmp_dst);9645mubuf->idxen = true;9646mubuf->tfe = instr->is_sparse;9647if (mubuf->tfe)9648mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);9649ctx->block->instructions.emplace_back(std::move(mubuf));96509651expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);9652return;9653}96549655/* gather MIMG address components */9656std::vector<Temp> args;9657unsigned wqm_mask = 0;9658if (has_offset) {9659wqm_mask |= u_bit_consecutive(args.size(), 1);9660args.emplace_back(offset);9661}9662if (has_bias)9663args.emplace_back(bias);9664if (has_compare)9665args.emplace_back(compare);9666if (has_derivs)9667args.insert(args.end(), derivs.begin(), derivs.end());96689669wqm_mask |= u_bit_consecutive(args.size(), coords.size());9670args.insert(args.end(), coords.begin(), coords.end());96719672if (has_sample_index)9673args.emplace_back(sample_index);9674if (has_lod)9675args.emplace_back(lod);9676if (has_clamped_lod)9677args.emplace_back(clamped_lod);96789679if (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms ||9680instr->op == nir_texop_samples_identical || instr->op == nir_texop_fragment_fetch ||9681instr->op == nir_texop_fragment_mask_fetch) {9682aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||9683instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS9684? aco_opcode::image_load9685: aco_opcode::image_load_mip;9686Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);9687MIMG_instruction* tex =9688emit_mimg(bld, op, Definition(tmp_dst), resource, Operand(s4), args, 0, vdata);9689tex->dim = dim;9690tex->dmask = dmask & 0xf;9691tex->unrm = true;9692tex->da = da;9693tex->tfe = instr->is_sparse;96949695if (instr->op == nir_texop_samples_identical) {9696assert(dmask == 1 && dst.regClass() == bld.lm);9697assert(dst.id() != tmp_dst.id());96989699bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand::zero(), tmp_dst)9700.def(0)9701.setHint(vcc);9702} else {9703expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);9704}9705return;9706}97079708// TODO: would be better to do this by adding offsets, but needs the opcodes ordered.9709aco_opcode opcode = aco_opcode::image_sample;9710if (has_offset) { /* image_sample_*_o */9711if (has_clamped_lod) {9712if (has_compare) {9713opcode = aco_opcode::image_sample_c_cl_o;9714if (has_derivs)9715opcode = aco_opcode::image_sample_c_d_cl_o;9716if (has_bias)9717opcode = aco_opcode::image_sample_c_b_cl_o;9718} else {9719opcode = aco_opcode::image_sample_cl_o;9720if (has_derivs)9721opcode = aco_opcode::image_sample_d_cl_o;9722if (has_bias)9723opcode = aco_opcode::image_sample_b_cl_o;9724}9725} else if (has_compare) {9726opcode = aco_opcode::image_sample_c_o;9727if (has_derivs)9728opcode = aco_opcode::image_sample_c_d_o;9729if (has_bias)9730opcode = aco_opcode::image_sample_c_b_o;9731if (level_zero)9732opcode = aco_opcode::image_sample_c_lz_o;9733if (has_lod)9734opcode = aco_opcode::image_sample_c_l_o;9735} else {9736opcode = aco_opcode::image_sample_o;9737if (has_derivs)9738opcode = aco_opcode::image_sample_d_o;9739if (has_bias)9740opcode = aco_opcode::image_sample_b_o;9741if (level_zero)9742opcode = aco_opcode::image_sample_lz_o;9743if (has_lod)9744opcode = aco_opcode::image_sample_l_o;9745}9746} else if (has_clamped_lod) { /* image_sample_*_cl */9747if (has_compare) {9748opcode = aco_opcode::image_sample_c_cl;9749if (has_derivs)9750opcode = aco_opcode::image_sample_c_d_cl;9751if (has_bias)9752opcode = aco_opcode::image_sample_c_b_cl;9753} else {9754opcode = aco_opcode::image_sample_cl;9755if (has_derivs)9756opcode = aco_opcode::image_sample_d_cl;9757if (has_bias)9758opcode = aco_opcode::image_sample_b_cl;9759}9760} else { /* no offset */9761if (has_compare) {9762opcode = aco_opcode::image_sample_c;9763if (has_derivs)9764opcode = aco_opcode::image_sample_c_d;9765if (has_bias)9766opcode = aco_opcode::image_sample_c_b;9767if (level_zero)9768opcode = aco_opcode::image_sample_c_lz;9769if (has_lod)9770opcode = aco_opcode::image_sample_c_l;9771} else {9772opcode = aco_opcode::image_sample;9773if (has_derivs)9774opcode = aco_opcode::image_sample_d;9775if (has_bias)9776opcode = aco_opcode::image_sample_b;9777if (level_zero)9778opcode = aco_opcode::image_sample_lz;9779if (has_lod)9780opcode = aco_opcode::image_sample_l;9781}9782}97839784if (instr->op == nir_texop_tg4) {9785if (has_offset) { /* image_gather4_*_o */9786if (has_compare) {9787opcode = aco_opcode::image_gather4_c_lz_o;9788if (has_lod)9789opcode = aco_opcode::image_gather4_c_l_o;9790if (has_bias)9791opcode = aco_opcode::image_gather4_c_b_o;9792} else {9793opcode = aco_opcode::image_gather4_lz_o;9794if (has_lod)9795opcode = aco_opcode::image_gather4_l_o;9796if (has_bias)9797opcode = aco_opcode::image_gather4_b_o;9798}9799} else {9800if (has_compare) {9801opcode = aco_opcode::image_gather4_c_lz;9802if (has_lod)9803opcode = aco_opcode::image_gather4_c_l;9804if (has_bias)9805opcode = aco_opcode::image_gather4_c_b;9806} else {9807opcode = aco_opcode::image_gather4_lz;9808if (has_lod)9809opcode = aco_opcode::image_gather4_l;9810if (has_bias)9811opcode = aco_opcode::image_gather4_b;9812}9813}9814} else if (instr->op == nir_texop_lod) {9815opcode = aco_opcode::image_get_lod;9816}98179818bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&9819!level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&9820instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;98219822Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);9823MIMG_instruction* tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, Operand(sampler),9824args, implicit_derivs ? wqm_mask : 0, vdata);9825tex->dim = dim;9826tex->dmask = dmask & 0xf;9827tex->da = da;9828tex->tfe = instr->is_sparse;98299830if (tg4_integer_cube_workaround) {9831assert(tmp_dst.id() != dst.id());9832assert(tmp_dst.size() == dst.size());98339834emit_split_vector(ctx, tmp_dst, tmp_dst.size());9835Temp val[4];9836for (unsigned i = 0; i < 4; i++) {9837val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);9838Temp cvt_val;9839if (stype == GLSL_TYPE_UINT)9840cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);9841else9842cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);9843val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,9844tg4_compare_cube_wa64);9845}98469847Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());9848if (instr->is_sparse)9849tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],9850val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));9851else9852tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],9853val[3]);9854}9855unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;9856expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);9857}98589859Operand9860get_phi_operand(isel_context* ctx, nir_ssa_def* ssa, RegClass rc, bool logical)9861{9862Temp tmp = get_ssa_temp(ctx, ssa);9863if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {9864return Operand(rc);9865} else if (logical && ssa->bit_size == 1 &&9866ssa->parent_instr->type == nir_instr_type_load_const) {9867if (ctx->program->wave_size == 64)9868return Operand::c64(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX9869: 0u);9870else9871return Operand::c32(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX9872: 0u);9873} else {9874return Operand(tmp);9875}9876}98779878void9879visit_phi(isel_context* ctx, nir_phi_instr* instr)9880{9881aco_ptr<Pseudo_instruction> phi;9882Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);9883assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);98849885bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);9886logical |= (ctx->block->kind & block_kind_merge) != 0;9887aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;98889889/* we want a sorted list of sources, since the predecessor list is also sorted */9890std::map<unsigned, nir_ssa_def*> phi_src;9891nir_foreach_phi_src (src, instr)9892phi_src[src->pred->index] = src->src.ssa;98939894std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;9895unsigned num_operands = 0;9896Operand* const operands = (Operand*)alloca(9897(std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand));9898unsigned num_defined = 0;9899unsigned cur_pred_idx = 0;9900for (std::pair<unsigned, nir_ssa_def*> src : phi_src) {9901if (cur_pred_idx < preds.size()) {9902/* handle missing preds (IF merges with discard/break) and extra preds9903* (loop exit with discard) */9904unsigned block = ctx->cf_info.nir_to_aco[src.first];9905unsigned skipped = 0;9906while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)9907skipped++;9908if (cur_pred_idx + skipped < preds.size()) {9909for (unsigned i = 0; i < skipped; i++)9910operands[num_operands++] = Operand(dst.regClass());9911cur_pred_idx += skipped;9912} else {9913continue;9914}9915}9916/* Handle missing predecessors at the end. This shouldn't happen with loop9917* headers and we can't ignore these sources for loop header phis. */9918if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())9919continue;9920cur_pred_idx++;9921Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);9922operands[num_operands++] = op;9923num_defined += !op.isUndefined();9924}9925/* handle block_kind_continue_or_break at loop exit blocks */9926while (cur_pred_idx++ < preds.size())9927operands[num_operands++] = Operand(dst.regClass());99289929/* If the loop ends with a break, still add a linear continue edge in case9930* that break is divergent or continue_or_break is used. We'll either remove9931* this operand later in visit_loop() if it's not necessary or replace the9932* undef with something correct. */9933if (!logical && ctx->block->kind & block_kind_loop_header) {9934nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);9935nir_block* last = nir_loop_last_block(loop);9936if (last->successors[0] != instr->instr.block)9937operands[num_operands++] = Operand(RegClass());9938}99399940/* we can use a linear phi in some cases if one src is undef */9941if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {9942phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO,9943num_operands, 1));99449945Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];9946Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]];9947assert(invert->kind & block_kind_invert);99489949unsigned then_block = invert->linear_preds[0];99509951Block* insert_block = NULL;9952for (unsigned i = 0; i < num_operands; i++) {9953Operand op = operands[i];9954if (op.isUndefined())9955continue;9956insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;9957phi->operands[0] = op;9958break;9959}9960assert(insert_block); /* should be handled by the "num_defined == 0" case above */9961phi->operands[1] = Operand(dst.regClass());9962phi->definitions[0] = Definition(dst);9963insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));9964return;9965}99669967phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));9968for (unsigned i = 0; i < num_operands; i++)9969phi->operands[i] = operands[i];9970phi->definitions[0] = Definition(dst);9971ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));9972}99739974void9975visit_undef(isel_context* ctx, nir_ssa_undef_instr* instr)9976{9977Temp dst = get_ssa_temp(ctx, &instr->def);99789979assert(dst.type() == RegType::sgpr);99809981if (dst.size() == 1) {9982Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());9983} else {9984aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(9985aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};9986for (unsigned i = 0; i < dst.size(); i++)9987vec->operands[i] = Operand::zero();9988vec->definitions[0] = Definition(dst);9989ctx->block->instructions.emplace_back(std::move(vec));9990}9991}99929993void9994begin_loop(isel_context* ctx, loop_context* lc)9995{9996// TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true9997append_logical_end(ctx->block);9998ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;9999Builder bld(ctx->program, ctx->block);10000bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));10001unsigned loop_preheader_idx = ctx->block->index;1000210003lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));1000410005ctx->program->next_loop_depth++;1000610007Block* loop_header = ctx->program->create_and_insert_block();10008loop_header->kind |= block_kind_loop_header;10009add_edge(loop_preheader_idx, loop_header);10010ctx->block = loop_header;1001110012append_logical_start(ctx->block);1001310014lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);10015lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);10016lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);10017lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);10018lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);10019}1002010021void10022end_loop(isel_context* ctx, loop_context* lc)10023{10024// TODO: what if a loop ends with a unconditional or uniformly branched continue10025// and this branch is never taken?10026if (!ctx->cf_info.has_branch) {10027unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;10028Builder bld(ctx->program, ctx->block);10029append_logical_end(ctx->block);1003010031if (ctx->cf_info.exec_potentially_empty_discard ||10032ctx->cf_info.exec_potentially_empty_break) {10033/* Discards can result in code running with an empty exec mask.10034* This would result in divergent breaks not ever being taken. As a10035* workaround, break the loop when the loop mask is empty instead of10036* always continuing. */10037ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);10038unsigned block_idx = ctx->block->index;1003910040/* create helper blocks to avoid critical edges */10041Block* break_block = ctx->program->create_and_insert_block();10042break_block->kind = block_kind_uniform;10043bld.reset(break_block);10044bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));10045add_linear_edge(block_idx, break_block);10046add_linear_edge(break_block->index, &lc->loop_exit);1004710048Block* continue_block = ctx->program->create_and_insert_block();10049continue_block->kind = block_kind_uniform;10050bld.reset(continue_block);10051bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));10052add_linear_edge(block_idx, continue_block);10053add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);1005410055if (!ctx->cf_info.parent_loop.has_divergent_branch)10056add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);10057ctx->block = &ctx->program->blocks[block_idx];10058} else {10059ctx->block->kind |= (block_kind_continue | block_kind_uniform);10060if (!ctx->cf_info.parent_loop.has_divergent_branch)10061add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);10062else10063add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);10064}1006510066bld.reset(ctx->block);10067bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));10068}1006910070ctx->cf_info.has_branch = false;10071ctx->program->next_loop_depth--;1007210073// TODO: if the loop has not a single exit, we must add one °°10074/* emit loop successor block */10075ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));10076append_logical_start(ctx->block);1007710078#if 010079// TODO: check if it is beneficial to not branch on continues10080/* trim linear phis in loop header */10081for (auto&& instr : loop_entry->instructions) {10082if (instr->opcode == aco_opcode::p_linear_phi) {10083aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};10084new_phi->definitions[0] = instr->definitions[0];10085for (unsigned i = 0; i < new_phi->operands.size(); i++)10086new_phi->operands[i] = instr->operands[i];10087/* check that the remaining operands are all the same */10088for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)10089assert(instr->operands[i].tempId() == instr->operands.back().tempId());10090instr.swap(new_phi);10091} else if (instr->opcode == aco_opcode::p_phi) {10092continue;10093} else {10094break;10095}10096}10097#endif1009810099ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;10100ctx->cf_info.parent_loop.exit = lc->exit_old;10101ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;10102ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;10103ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;10104if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)10105ctx->cf_info.exec_potentially_empty_discard = false;10106}1010710108void10109emit_loop_jump(isel_context* ctx, bool is_break)10110{10111Builder bld(ctx->program, ctx->block);10112Block* logical_target;10113append_logical_end(ctx->block);10114unsigned idx = ctx->block->index;1011510116if (is_break) {10117logical_target = ctx->cf_info.parent_loop.exit;10118add_logical_edge(idx, logical_target);10119ctx->block->kind |= block_kind_break;1012010121if (!ctx->cf_info.parent_if.is_divergent &&10122!ctx->cf_info.parent_loop.has_divergent_continue) {10123/* uniform break - directly jump out of the loop */10124ctx->block->kind |= block_kind_uniform;10125ctx->cf_info.has_branch = true;10126bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));10127add_linear_edge(idx, logical_target);10128return;10129}10130ctx->cf_info.parent_loop.has_divergent_branch = true;10131} else {10132logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];10133add_logical_edge(idx, logical_target);10134ctx->block->kind |= block_kind_continue;1013510136if (!ctx->cf_info.parent_if.is_divergent) {10137/* uniform continue - directly jump to the loop header */10138ctx->block->kind |= block_kind_uniform;10139ctx->cf_info.has_branch = true;10140bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));10141add_linear_edge(idx, logical_target);10142return;10143}1014410145/* for potential uniform breaks after this continue,10146we must ensure that they are handled correctly */10147ctx->cf_info.parent_loop.has_divergent_continue = true;10148ctx->cf_info.parent_loop.has_divergent_branch = true;10149}1015010151if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {10152ctx->cf_info.exec_potentially_empty_break = true;10153ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth;10154}1015510156/* remove critical edges from linear CFG */10157bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));10158Block* break_block = ctx->program->create_and_insert_block();10159break_block->kind |= block_kind_uniform;10160add_linear_edge(idx, break_block);10161/* the loop_header pointer might be invalidated by this point */10162if (!is_break)10163logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];10164add_linear_edge(break_block->index, logical_target);10165bld.reset(break_block);10166bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));1016710168Block* continue_block = ctx->program->create_and_insert_block();10169add_linear_edge(idx, continue_block);10170append_logical_start(continue_block);10171ctx->block = continue_block;10172}1017310174void10175emit_loop_break(isel_context* ctx)10176{10177emit_loop_jump(ctx, true);10178}1017910180void10181emit_loop_continue(isel_context* ctx)10182{10183emit_loop_jump(ctx, false);10184}1018510186void10187visit_jump(isel_context* ctx, nir_jump_instr* instr)10188{10189/* visit_block() would usually do this but divergent jumps updates ctx->block */10190ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;1019110192switch (instr->type) {10193case nir_jump_break: emit_loop_break(ctx); break;10194case nir_jump_continue: emit_loop_continue(ctx); break;10195default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();10196}10197}1019810199void10200visit_block(isel_context* ctx, nir_block* block)10201{10202nir_foreach_instr (instr, block) {10203switch (instr->type) {10204case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;10205case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;10206case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;10207case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;10208case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;10209case nir_instr_type_ssa_undef: visit_undef(ctx, nir_instr_as_ssa_undef(instr)); break;10210case nir_instr_type_deref: break;10211case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;10212default: isel_err(instr, "Unknown NIR instr type");10213}10214}1021510216if (!ctx->cf_info.parent_loop.has_divergent_branch)10217ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;10218}1021910220static Operand10221create_continue_phis(isel_context* ctx, unsigned first, unsigned last,10222aco_ptr<Instruction>& header_phi, Operand* vals)10223{10224vals[0] = Operand(header_phi->definitions[0].getTemp());10225RegClass rc = vals[0].regClass();1022610227unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;1022810229unsigned next_pred = 1;1023010231for (unsigned idx = first + 1; idx <= last; idx++) {10232Block& block = ctx->program->blocks[idx];10233if (block.loop_nest_depth != loop_nest_depth) {10234vals[idx - first] = vals[idx - 1 - first];10235continue;10236}1023710238if ((block.kind & block_kind_continue) && block.index != last) {10239vals[idx - first] = header_phi->operands[next_pred];10240next_pred++;10241continue;10242}1024310244bool all_same = true;10245for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)10246all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];1024710248Operand val;10249if (all_same) {10250val = vals[block.linear_preds[0] - first];10251} else {10252aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(10253aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));10254for (unsigned i = 0; i < block.linear_preds.size(); i++)10255phi->operands[i] = vals[block.linear_preds[i] - first];10256val = Operand(ctx->program->allocateTmp(rc));10257phi->definitions[0] = Definition(val.getTemp());10258block.instructions.emplace(block.instructions.begin(), std::move(phi));10259}10260vals[idx - first] = val;10261}1026210263return vals[last - first];10264}1026510266static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);10267static void begin_uniform_if_else(isel_context* ctx, if_context* ic);10268static void end_uniform_if(isel_context* ctx, if_context* ic);1026910270static void10271visit_loop(isel_context* ctx, nir_loop* loop)10272{10273loop_context lc;10274begin_loop(ctx, &lc);1027510276/* NIR seems to allow this, and even though the loop exit has no predecessors, SSA defs from the10277* loop header are live. Handle this without complicating the ACO IR by creating a dummy break.10278*/10279if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) {10280Builder bld(ctx->program, ctx->block);10281Temp cond = bld.copy(bld.def(s1, scc), Operand::zero());10282if_context ic;10283begin_uniform_if_then(ctx, &ic, cond);10284emit_loop_break(ctx);10285begin_uniform_if_else(ctx, &ic);10286end_uniform_if(ctx, &ic);10287}1028810289bool unreachable = visit_cf_list(ctx, &loop->body);1029010291unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;1029210293/* Fixup phis in loop header from unreachable blocks.10294* has_branch/has_divergent_branch also indicates if the loop ends with a10295* break/continue instruction, but we don't emit those if unreachable=true */10296if (unreachable) {10297assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);10298bool linear = ctx->cf_info.has_branch;10299bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;10300for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {10301if ((logical && instr->opcode == aco_opcode::p_phi) ||10302(linear && instr->opcode == aco_opcode::p_linear_phi)) {10303/* the last operand should be the one that needs to be removed */10304instr->operands.pop_back();10305} else if (!is_phi(instr)) {10306break;10307}10308}10309}1031010311/* Fixup linear phis in loop header from expecting a continue. Both this fixup10312* and the previous one shouldn't both happen at once because a break in the10313* merge block would get CSE'd */10314if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {10315unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);10316Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand));10317for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {10318if (instr->opcode == aco_opcode::p_linear_phi) {10319if (ctx->cf_info.has_branch)10320instr->operands.pop_back();10321else10322instr->operands.back() =10323create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);10324} else if (!is_phi(instr)) {10325break;10326}10327}10328}1032910330end_loop(ctx, &lc);10331}1033210333static void10334begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)10335{10336ic->cond = cond;1033710338append_logical_end(ctx->block);10339ctx->block->kind |= block_kind_branch;1034010341/* branch to linear then block */10342assert(cond.regClass() == ctx->program->lane_mask);10343aco_ptr<Pseudo_branch_instruction> branch;10344branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z,10345Format::PSEUDO_BRANCH, 1, 1));10346branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));10347branch->definitions[0].setHint(vcc);10348branch->operands[0] = Operand(cond);10349ctx->block->instructions.push_back(std::move(branch));1035010351ic->BB_if_idx = ctx->block->index;10352ic->BB_invert = Block();10353/* Invert blocks are intentionally not marked as top level because they10354* are not part of the logical cfg. */10355ic->BB_invert.kind |= block_kind_invert;10356ic->BB_endif = Block();10357ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));1035810359ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;10360ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;10361ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;10362ic->divergent_old = ctx->cf_info.parent_if.is_divergent;10363ctx->cf_info.parent_if.is_divergent = true;1036410365/* divergent branches use cbranch_execz */10366ctx->cf_info.exec_potentially_empty_discard = false;10367ctx->cf_info.exec_potentially_empty_break = false;10368ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;1036910370/** emit logical then block */10371ctx->program->next_divergent_if_logical_depth++;10372Block* BB_then_logical = ctx->program->create_and_insert_block();10373add_edge(ic->BB_if_idx, BB_then_logical);10374ctx->block = BB_then_logical;10375append_logical_start(BB_then_logical);10376}1037710378static void10379begin_divergent_if_else(isel_context* ctx, if_context* ic)10380{10381Block* BB_then_logical = ctx->block;10382append_logical_end(BB_then_logical);10383/* branch from logical then block to invert block */10384aco_ptr<Pseudo_branch_instruction> branch;10385branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,10386Format::PSEUDO_BRANCH, 0, 1));10387branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));10388branch->definitions[0].setHint(vcc);10389BB_then_logical->instructions.emplace_back(std::move(branch));10390add_linear_edge(BB_then_logical->index, &ic->BB_invert);10391if (!ctx->cf_info.parent_loop.has_divergent_branch)10392add_logical_edge(BB_then_logical->index, &ic->BB_endif);10393BB_then_logical->kind |= block_kind_uniform;10394assert(!ctx->cf_info.has_branch);10395ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;10396ctx->cf_info.parent_loop.has_divergent_branch = false;10397ctx->program->next_divergent_if_logical_depth--;1039810399/** emit linear then block */10400Block* BB_then_linear = ctx->program->create_and_insert_block();10401BB_then_linear->kind |= block_kind_uniform;10402add_linear_edge(ic->BB_if_idx, BB_then_linear);10403/* branch from linear then block to invert block */10404branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,10405Format::PSEUDO_BRANCH, 0, 1));10406branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));10407branch->definitions[0].setHint(vcc);10408BB_then_linear->instructions.emplace_back(std::move(branch));10409add_linear_edge(BB_then_linear->index, &ic->BB_invert);1041010411/** emit invert merge block */10412ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));10413ic->invert_idx = ctx->block->index;1041410415/* branch to linear else block (skip else) */10416branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,10417Format::PSEUDO_BRANCH, 0, 1));10418branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));10419branch->definitions[0].setHint(vcc);10420ctx->block->instructions.push_back(std::move(branch));1042110422ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;10423ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;10424ic->exec_potentially_empty_break_depth_old = std::min(10425ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);10426/* divergent branches use cbranch_execz */10427ctx->cf_info.exec_potentially_empty_discard = false;10428ctx->cf_info.exec_potentially_empty_break = false;10429ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;1043010431/** emit logical else block */10432ctx->program->next_divergent_if_logical_depth++;10433Block* BB_else_logical = ctx->program->create_and_insert_block();10434add_logical_edge(ic->BB_if_idx, BB_else_logical);10435add_linear_edge(ic->invert_idx, BB_else_logical);10436ctx->block = BB_else_logical;10437append_logical_start(BB_else_logical);10438}1043910440static void10441end_divergent_if(isel_context* ctx, if_context* ic)10442{10443Block* BB_else_logical = ctx->block;10444append_logical_end(BB_else_logical);1044510446/* branch from logical else block to endif block */10447aco_ptr<Pseudo_branch_instruction> branch;10448branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,10449Format::PSEUDO_BRANCH, 0, 1));10450branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));10451branch->definitions[0].setHint(vcc);10452BB_else_logical->instructions.emplace_back(std::move(branch));10453add_linear_edge(BB_else_logical->index, &ic->BB_endif);10454if (!ctx->cf_info.parent_loop.has_divergent_branch)10455add_logical_edge(BB_else_logical->index, &ic->BB_endif);10456BB_else_logical->kind |= block_kind_uniform;10457ctx->program->next_divergent_if_logical_depth--;1045810459assert(!ctx->cf_info.has_branch);10460ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;1046110462/** emit linear else block */10463Block* BB_else_linear = ctx->program->create_and_insert_block();10464BB_else_linear->kind |= block_kind_uniform;10465add_linear_edge(ic->invert_idx, BB_else_linear);1046610467/* branch from linear else block to endif block */10468branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,10469Format::PSEUDO_BRANCH, 0, 1));10470branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));10471branch->definitions[0].setHint(vcc);10472BB_else_linear->instructions.emplace_back(std::move(branch));10473add_linear_edge(BB_else_linear->index, &ic->BB_endif);1047410475/** emit endif merge block */10476ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));10477append_logical_start(ctx->block);1047810479ctx->cf_info.parent_if.is_divergent = ic->divergent_old;10480ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;10481ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;10482ctx->cf_info.exec_potentially_empty_break_depth = std::min(10483ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);10484if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&10485!ctx->cf_info.parent_if.is_divergent) {10486ctx->cf_info.exec_potentially_empty_break = false;10487ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;10488}10489/* uniform control flow never has an empty exec-mask */10490if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {10491ctx->cf_info.exec_potentially_empty_discard = false;10492ctx->cf_info.exec_potentially_empty_break = false;10493ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;10494}10495}1049610497static void10498begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)10499{10500assert(cond.regClass() == s1);1050110502append_logical_end(ctx->block);10503ctx->block->kind |= block_kind_uniform;1050410505aco_ptr<Pseudo_branch_instruction> branch;10506aco_opcode branch_opcode = aco_opcode::p_cbranch_z;10507branch.reset(10508create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));10509branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));10510branch->definitions[0].setHint(vcc);10511branch->operands[0] = Operand(cond);10512branch->operands[0].setFixed(scc);10513ctx->block->instructions.emplace_back(std::move(branch));1051410515ic->BB_if_idx = ctx->block->index;10516ic->BB_endif = Block();10517ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;1051810519ctx->cf_info.has_branch = false;10520ctx->cf_info.parent_loop.has_divergent_branch = false;1052110522/** emit then block */10523ctx->program->next_uniform_if_depth++;10524Block* BB_then = ctx->program->create_and_insert_block();10525add_edge(ic->BB_if_idx, BB_then);10526append_logical_start(BB_then);10527ctx->block = BB_then;10528}1052910530static void10531begin_uniform_if_else(isel_context* ctx, if_context* ic)10532{10533Block* BB_then = ctx->block;1053410535ic->uniform_has_then_branch = ctx->cf_info.has_branch;10536ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;1053710538if (!ic->uniform_has_then_branch) {10539append_logical_end(BB_then);10540/* branch from then block to endif block */10541aco_ptr<Pseudo_branch_instruction> branch;10542branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,10543Format::PSEUDO_BRANCH, 0, 1));10544branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));10545branch->definitions[0].setHint(vcc);10546BB_then->instructions.emplace_back(std::move(branch));10547add_linear_edge(BB_then->index, &ic->BB_endif);10548if (!ic->then_branch_divergent)10549add_logical_edge(BB_then->index, &ic->BB_endif);10550BB_then->kind |= block_kind_uniform;10551}1055210553ctx->cf_info.has_branch = false;10554ctx->cf_info.parent_loop.has_divergent_branch = false;1055510556/** emit else block */10557Block* BB_else = ctx->program->create_and_insert_block();10558add_edge(ic->BB_if_idx, BB_else);10559append_logical_start(BB_else);10560ctx->block = BB_else;10561}1056210563static void10564end_uniform_if(isel_context* ctx, if_context* ic)10565{10566Block* BB_else = ctx->block;1056710568if (!ctx->cf_info.has_branch) {10569append_logical_end(BB_else);10570/* branch from then block to endif block */10571aco_ptr<Pseudo_branch_instruction> branch;10572branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,10573Format::PSEUDO_BRANCH, 0, 1));10574branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));10575branch->definitions[0].setHint(vcc);10576BB_else->instructions.emplace_back(std::move(branch));10577add_linear_edge(BB_else->index, &ic->BB_endif);10578if (!ctx->cf_info.parent_loop.has_divergent_branch)10579add_logical_edge(BB_else->index, &ic->BB_endif);10580BB_else->kind |= block_kind_uniform;10581}1058210583ctx->cf_info.has_branch &= ic->uniform_has_then_branch;10584ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;1058510586/** emit endif merge block */10587ctx->program->next_uniform_if_depth--;10588if (!ctx->cf_info.has_branch) {10589ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));10590append_logical_start(ctx->block);10591}10592}1059310594static bool10595visit_if(isel_context* ctx, nir_if* if_stmt)10596{10597Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);10598Builder bld(ctx->program, ctx->block);10599aco_ptr<Pseudo_branch_instruction> branch;10600if_context ic;1060110602if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */10603/**10604* Uniform conditionals are represented in the following way*) :10605*10606* The linear and logical CFG:10607* BB_IF10608* / \10609* BB_THEN (logical) BB_ELSE (logical)10610* \ /10611* BB_ENDIF10612*10613* *) Exceptions may be due to break and continue statements within loops10614* If a break/continue happens within uniform control flow, it branches10615* to the loop exit/entry block. Otherwise, it branches to the next10616* merge block.10617**/1061810619assert(cond.regClass() == ctx->program->lane_mask);10620cond = bool_to_scalar_condition(ctx, cond);1062110622begin_uniform_if_then(ctx, &ic, cond);10623visit_cf_list(ctx, &if_stmt->then_list);1062410625begin_uniform_if_else(ctx, &ic);10626visit_cf_list(ctx, &if_stmt->else_list);1062710628end_uniform_if(ctx, &ic);10629} else { /* non-uniform condition */10630/**10631* To maintain a logical and linear CFG without critical edges,10632* non-uniform conditionals are represented in the following way*) :10633*10634* The linear CFG:10635* BB_IF10636* / \10637* BB_THEN (logical) BB_THEN (linear)10638* \ /10639* BB_INVERT (linear)10640* / \10641* BB_ELSE (logical) BB_ELSE (linear)10642* \ /10643* BB_ENDIF10644*10645* The logical CFG:10646* BB_IF10647* / \10648* BB_THEN (logical) BB_ELSE (logical)10649* \ /10650* BB_ENDIF10651*10652* *) Exceptions may be due to break and continue statements within loops10653**/1065410655begin_divergent_if_then(ctx, &ic, cond);10656visit_cf_list(ctx, &if_stmt->then_list);1065710658begin_divergent_if_else(ctx, &ic);10659visit_cf_list(ctx, &if_stmt->else_list);1066010661end_divergent_if(ctx, &ic);10662}1066310664return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();10665}1066610667static bool10668visit_cf_list(isel_context* ctx, struct exec_list* list)10669{10670foreach_list_typed (nir_cf_node, node, node, list) {10671switch (node->type) {10672case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;10673case nir_cf_node_if:10674if (!visit_if(ctx, nir_cf_node_as_if(node)))10675return true;10676break;10677case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;10678default: unreachable("unimplemented cf list type");10679}10680}10681return false;10682}1068310684static void10685export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos)10686{10687assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);1068810689int offset = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))10690? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]10691: ctx->program->info->vs.outinfo.vs_output_param_offset[slot];10692unsigned mask = ctx->outputs.mask[slot];10693if (!is_pos && !mask)10694return;10695if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)10696return;10697aco_ptr<Export_instruction> exp{10698create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};10699exp->enabled_mask = mask;10700for (unsigned i = 0; i < 4; ++i) {10701if (mask & (1 << i))10702exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);10703else10704exp->operands[i] = Operand(v1);10705}10706/* GFX10 (Navi1x) skip POS0 exports if EXEC=0 and DONE=0, causing a hang.10707* Setting valid_mask=1 prevents it and has no other effect.10708*/10709exp->valid_mask = ctx->options->chip_class == GFX10 && is_pos && *next_pos == 0;10710exp->done = false;10711exp->compressed = false;10712if (is_pos)10713exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;10714else10715exp->dest = V_008DFC_SQ_EXP_PARAM + offset;10716ctx->block->instructions.emplace_back(std::move(exp));10717}1071810719static void10720export_vs_psiz_layer_viewport_vrs(isel_context* ctx, int* next_pos)10721{10722aco_ptr<Export_instruction> exp{10723create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};10724exp->enabled_mask = 0;10725for (unsigned i = 0; i < 4; ++i)10726exp->operands[i] = Operand(v1);10727if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {10728exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);10729exp->enabled_mask |= 0x1;10730}10731if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {10732exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);10733exp->enabled_mask |= 0x4;10734}10735if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {10736if (ctx->options->chip_class < GFX9) {10737exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);10738exp->enabled_mask |= 0x8;10739} else {10740Builder bld(ctx->program, ctx->block);1074110742Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u),10743Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));10744if (exp->operands[2].isTemp())10745out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);1074610747exp->operands[2] = Operand(out);10748exp->enabled_mask |= 0x4;10749}10750}10751if (ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_SHADING_RATE]) {10752exp->operands[1] = Operand(ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]);10753exp->enabled_mask |= 0x2;10754} else if (ctx->options->force_vrs_rates) {10755/* Bits [2:3] = VRS rate X10756* Bits [4:5] = VRS rate Y10757*10758* The range is [-2, 1]. Values:10759* 1: 2x coarser shading rate in that direction.10760* 0: normal shading rate10761* -1: 2x finer shading rate (sample shading, not directional)10762* -2: 4x finer shading rate (sample shading, not directional)10763*10764* Sample shading can't go above 8 samples, so both numbers can't be -210765* at the same time.10766*/10767Builder bld(ctx->program, ctx->block);10768Temp rates = bld.copy(bld.def(v1), Operand::c32((unsigned)ctx->options->force_vrs_rates));1076910770/* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */10771Temp cond = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), Operand::c32(0x3f800000u),10772Operand(ctx->outputs.temps[VARYING_SLOT_POS + 3]));10773rates = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),10774bld.copy(bld.def(v1), Operand::zero()), rates, cond);1077510776exp->operands[1] = Operand(rates);10777exp->enabled_mask |= 0x2;10778}1077910780exp->valid_mask = ctx->options->chip_class == GFX10 && *next_pos == 0;10781exp->done = false;10782exp->compressed = false;10783exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;10784ctx->block->instructions.emplace_back(std::move(exp));10785}1078610787static void10788create_vs_exports(isel_context* ctx)10789{10790assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);1079110792radv_vs_output_info* outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))10793? &ctx->program->info->tes.outinfo10794: &ctx->program->info->vs.outinfo;1079510796ctx->block->kind |= block_kind_export_end;1079710798if (outinfo->export_prim_id && ctx->stage.hw != HWStage::NGG) {10799ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;10800if (ctx->stage.has(SWStage::TES))10801ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =10802get_arg(ctx, ctx->args->ac.tes_patch_id);10803else10804ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =10805get_arg(ctx, ctx->args->ac.vs_prim_id);10806}1080710808if (ctx->options->key.has_multiview_view_index) {10809ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;10810ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] =10811as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));10812}1081310814/* Hardware requires position data to always be exported, even if the10815* application did not write gl_Position.10816*/10817ctx->outputs.mask[VARYING_SLOT_POS] = 0xf;1081810819/* the order these position exports are created is important */10820int next_pos = 0;10821export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);1082210823bool writes_primitive_shading_rate =10824outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates;10825if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index ||10826writes_primitive_shading_rate) {10827export_vs_psiz_layer_viewport_vrs(ctx, &next_pos);10828}10829if (ctx->num_clip_distances + ctx->num_cull_distances > 0)10830export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);10831if (ctx->num_clip_distances + ctx->num_cull_distances > 4)10832export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);1083310834if (ctx->export_clip_dists) {10835if (ctx->num_clip_distances + ctx->num_cull_distances > 0)10836export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);10837if (ctx->num_clip_distances + ctx->num_cull_distances > 4)10838export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);10839}1084010841for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {10842if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER && i != VARYING_SLOT_PRIMITIVE_ID &&10843i != VARYING_SLOT_VIEWPORT)10844continue;1084510846export_vs_varying(ctx, i, false, NULL);10847}10848}1084910850static bool10851export_fs_mrt_z(isel_context* ctx)10852{10853Builder bld(ctx->program, ctx->block);10854unsigned enabled_channels = 0;10855bool compr = false;10856Operand values[4];1085710858for (unsigned i = 0; i < 4; ++i) {10859values[i] = Operand(v1);10860}1086110862/* Both stencil and sample mask only need 16-bits. */10863if (!ctx->program->info->ps.writes_z &&10864(ctx->program->info->ps.writes_stencil || ctx->program->info->ps.writes_sample_mask)) {10865compr = true; /* COMPR flag */1086610867if (ctx->program->info->ps.writes_stencil) {10868/* Stencil should be in X[23:16]. */10869values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);10870values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), values[0]);10871enabled_channels |= 0x3;10872}1087310874if (ctx->program->info->ps.writes_sample_mask) {10875/* SampleMask should be in Y[15:0]. */10876values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);10877enabled_channels |= 0xc;10878}10879} else {10880if (ctx->program->info->ps.writes_z) {10881values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);10882enabled_channels |= 0x1;10883}1088410885if (ctx->program->info->ps.writes_stencil) {10886values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);10887enabled_channels |= 0x2;10888}1088910890if (ctx->program->info->ps.writes_sample_mask) {10891values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);10892enabled_channels |= 0x4;10893}10894}1089510896/* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X10897* writemask component.10898*/10899if (ctx->options->chip_class == GFX6 && ctx->options->family != CHIP_OLAND &&10900ctx->options->family != CHIP_HAINAN) {10901enabled_channels |= 0x1;10902}1090310904bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels,10905V_008DFC_SQ_EXP_MRTZ, compr);1090610907return true;10908}1090910910static bool10911export_fs_mrt_color(isel_context* ctx, int slot)10912{10913Builder bld(ctx->program, ctx->block);10914unsigned write_mask = ctx->outputs.mask[slot];10915Operand values[4];1091610917for (unsigned i = 0; i < 4; ++i) {10918if (write_mask & (1 << i)) {10919values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);10920} else {10921values[i] = Operand(v1);10922}10923}1092410925unsigned target, col_format;10926unsigned enabled_channels = 0;10927aco_opcode compr_op = (aco_opcode)0;10928bool compr = false;1092910930slot -= FRAG_RESULT_DATA0;10931target = V_008DFC_SQ_EXP_MRT + slot;10932col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf;1093310934bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;10935bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;10936bool is_16bit = values[0].regClass() == v2b;1093710938/* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */10939if (ctx->options->enable_mrt_output_nan_fixup && !is_16bit &&10940(col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||10941col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||10942col_format == V_028714_SPI_SHADER_FP16_ABGR)) {10943for (int i = 0; i < 4; i++) {10944if (!(write_mask & (1 << i)))10945continue;1094610947Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),10948values[i], bld.copy(bld.def(v1), Operand::c32(3u)));10949values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],10950bld.copy(bld.def(v1), Operand::zero()), isnan);10951}10952}1095310954switch (col_format) {10955case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;1095610957case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;1095810959case V_028714_SPI_SHADER_32_AR:10960if (ctx->options->chip_class >= GFX10) {10961/* Special case: on GFX10, the outputs are different for 32_AR */10962enabled_channels = 0x3;10963values[1] = values[3];10964values[3] = Operand(v1);10965} else {10966enabled_channels = 0x9;10967}10968break;1096910970case V_028714_SPI_SHADER_FP16_ABGR:10971for (int i = 0; i < 2; i++) {10972bool enabled = (write_mask >> (i * 2)) & 0x3;10973if (enabled) {10974enabled_channels |= 0x3 << (i * 2);10975if (is_16bit) {10976values[i] =10977bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),10978values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2],10979values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]);10980} else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9) {10981values[i] =10982bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),10983values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],10984values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);10985} else {10986values[i] =10987bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),10988values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],10989values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);10990}10991} else {10992values[i] = Operand(v1);10993}10994}10995values[2] = Operand(v1);10996values[3] = Operand(v1);10997compr = true;10998break;1099911000case V_028714_SPI_SHADER_UNORM16_ABGR:11001if (is_16bit && ctx->options->chip_class >= GFX9) {11002compr_op = aco_opcode::v_cvt_pknorm_u16_f16;11003} else {11004compr_op = aco_opcode::v_cvt_pknorm_u16_f32;11005}11006break;1100711008case V_028714_SPI_SHADER_SNORM16_ABGR:11009if (is_16bit && ctx->options->chip_class >= GFX9) {11010compr_op = aco_opcode::v_cvt_pknorm_i16_f16;11011} else {11012compr_op = aco_opcode::v_cvt_pknorm_i16_f32;11013}11014break;1101511016case V_028714_SPI_SHADER_UINT16_ABGR: {11017compr_op = aco_opcode::v_cvt_pk_u16_u32;11018if (is_int8 || is_int10) {11019/* clamp */11020uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;11021Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));1102211023for (unsigned i = 0; i < 4; i++) {11024if ((write_mask >> i) & 1) {11025values[i] =11026bld.vop2(aco_opcode::v_min_u32, bld.def(v1),11027i == 3 && is_int10 ? Operand::c32(3u) : Operand(max_rgb_val), values[i]);11028}11029}11030} else if (is_16bit) {11031for (unsigned i = 0; i < 4; i++) {11032if ((write_mask >> i) & 1) {11033Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);11034values[i] = Operand(tmp);11035}11036}11037}11038break;11039}1104011041case V_028714_SPI_SHADER_SINT16_ABGR:11042compr_op = aco_opcode::v_cvt_pk_i16_i32;11043if (is_int8 || is_int10) {11044/* clamp */11045uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;11046uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;11047Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));11048Temp min_rgb_val = bld.copy(bld.def(s1), Operand::c32(min_rgb));1104911050for (unsigned i = 0; i < 4; i++) {11051if ((write_mask >> i) & 1) {11052values[i] =11053bld.vop2(aco_opcode::v_min_i32, bld.def(v1),11054i == 3 && is_int10 ? Operand::c32(1u) : Operand(max_rgb_val), values[i]);11055values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),11056i == 3 && is_int10 ? Operand::c32(-2u) : Operand(min_rgb_val),11057values[i]);11058}11059}11060} else if (is_16bit) {11061for (unsigned i = 0; i < 4; i++) {11062if ((write_mask >> i) & 1) {11063Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);11064values[i] = Operand(tmp);11065}11066}11067}11068break;1106911070case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;1107111072case V_028714_SPI_SHADER_ZERO:11073default: return false;11074}1107511076if ((bool)compr_op) {11077for (int i = 0; i < 2; i++) {11078/* check if at least one of the values to be compressed is enabled */11079bool enabled = (write_mask >> (i * 2)) & 0x3;11080if (enabled) {11081enabled_channels |= 0x3 << (i * 2);11082values[i] = bld.vop3(11083compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],11084values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);11085} else {11086values[i] = Operand(v1);11087}11088}11089values[2] = Operand(v1);11090values[3] = Operand(v1);11091compr = true;11092} else if (!compr) {11093for (int i = 0; i < 4; i++)11094values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);11095}1109611097bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, target,11098compr);11099return true;11100}1110111102static void11103create_fs_null_export(isel_context* ctx)11104{11105/* FS must always have exports.11106* So when there are none, we need to add a null export.11107*/1110811109Builder bld(ctx->program, ctx->block);11110unsigned dest = V_008DFC_SQ_EXP_NULL;11111bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),11112/* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);11113}1111411115static void11116create_fs_exports(isel_context* ctx)11117{11118bool exported = false;1111911120/* Export depth, stencil and sample mask. */11121if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || ctx->outputs.mask[FRAG_RESULT_STENCIL] ||11122ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])11123exported |= export_fs_mrt_z(ctx);1112411125/* Export all color render targets. */11126for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)11127if (ctx->outputs.mask[i])11128exported |= export_fs_mrt_color(ctx, i);1112911130if (!exported)11131create_fs_null_export(ctx);1113211133ctx->block->kind |= block_kind_export_end;11134}1113511136static void11137create_workgroup_barrier(Builder& bld)11138{11139bld.barrier(aco_opcode::p_barrier,11140memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), scope_workgroup);11141}1114211143static void11144emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset,11145const struct radv_stream_output* output)11146{11147unsigned num_comps = util_bitcount(output->component_mask);11148unsigned writemask = (1 << num_comps) - 1;11149unsigned loc = output->location;11150unsigned buf = output->buffer;1115111152assert(num_comps && num_comps <= 4);11153if (!num_comps || num_comps > 4)11154return;1115511156unsigned first_comp = ffs(output->component_mask) - 1;1115711158Temp out[4];11159bool all_undef = true;11160assert(ctx->stage.hw == HWStage::VS);11161for (unsigned i = 0; i < num_comps; i++) {11162out[i] = ctx->outputs.temps[loc * 4 + first_comp + i];11163all_undef = all_undef && !out[i].id();11164}11165if (all_undef)11166return;1116711168while (writemask) {11169int start, count;11170u_bit_scan_consecutive_range(&writemask, &start, &count);11171if (count == 3 && ctx->options->chip_class == GFX6) {11172/* GFX6 doesn't support storing vec3, split it. */11173writemask |= 1u << (start + 2);11174count = 2;11175}1117611177unsigned offset = output->offset + start * 4;1117811179Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count));11180aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(11181aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};11182for (int i = 0; i < count; ++i)11183vec->operands[i] =11184(ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand::zero();11185vec->definitions[0] = Definition(write_data);11186ctx->block->instructions.emplace_back(std::move(vec));1118711188aco_opcode opcode;11189switch (count) {11190case 1: opcode = aco_opcode::buffer_store_dword; break;11191case 2: opcode = aco_opcode::buffer_store_dwordx2; break;11192case 3: opcode = aco_opcode::buffer_store_dwordx3; break;11193case 4: opcode = aco_opcode::buffer_store_dwordx4; break;11194default: unreachable("Unsupported dword count.");11195}1119611197aco_ptr<MUBUF_instruction> store{11198create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};11199store->operands[0] = Operand(so_buffers[buf]);11200store->operands[1] = Operand(so_write_offset[buf]);11201store->operands[2] = Operand::c32(0);11202store->operands[3] = Operand(write_data);11203if (offset > 4095) {11204/* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */11205Builder bld(ctx->program, ctx->block);11206store->operands[0] =11207bld.vadd32(bld.def(v1), Operand::c32(offset), Operand(so_write_offset[buf]));11208} else {11209store->offset = offset;11210}11211store->offen = true;11212store->glc = true;11213store->dlc = false;11214store->slc = true;11215ctx->block->instructions.emplace_back(std::move(store));11216}11217}1121811219static void11220emit_streamout(isel_context* ctx, unsigned stream)11221{11222Builder bld(ctx->program, ctx->block);1122311224Temp so_buffers[4];11225Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));11226for (unsigned i = 0; i < 4; i++) {11227unsigned stride = ctx->program->info->so.strides[i];11228if (!stride)11229continue;1123011231Operand off = bld.copy(bld.def(s1), Operand::c32(i * 16u));11232so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, off);11233}1123411235Temp so_vtx_count =11236bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),11237get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u));1123811239Temp tid = emit_mbcnt(ctx, bld.tmp(v1));1124011241Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);1124211243if_context ic;11244begin_divergent_if_then(ctx, &ic, can_emit);1124511246bld.reset(ctx->block);1124711248Temp so_write_index =11249bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid);1125011251Temp so_write_offset[4];1125211253for (unsigned i = 0; i < 4; i++) {11254unsigned stride = ctx->program->info->so.strides[i];11255if (!stride)11256continue;1125711258if (stride == 1) {11259Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),11260get_arg(ctx, ctx->args->ac.streamout_write_index),11261get_arg(ctx, ctx->args->ac.streamout_offset[i]));11262Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);1126311264so_write_offset[i] =11265bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), new_offset);11266} else {11267Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);11268Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(4u),11269get_arg(ctx, ctx->args->ac.streamout_offset[i]));11270so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);11271}11272}1127311274for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {11275struct radv_stream_output* output = &ctx->program->info->so.outputs[i];11276if (stream != output->stream)11277continue;1127811279emit_stream_output(ctx, so_buffers, so_write_offset, output);11280}1128111282begin_divergent_if_else(ctx, &ic);11283end_divergent_if(ctx, &ic);11284}1128511286Pseudo_instruction*11287add_startpgm(struct isel_context* ctx)11288{11289unsigned arg_count = ctx->args->ac.arg_count;11290if (ctx->stage == fragment_fs) {11291/* LLVM optimizes away unused FS inputs and computes spi_ps_input_addr11292* itself and then communicates the results back via the ELF binary.11293* Mirror what LLVM does by re-mapping the VGPR arguments here.11294*11295* TODO: If we made the FS input scanning code into a separate pass that11296* could run before argument setup, then this wouldn't be necessary11297* anymore.11298*/11299struct ac_shader_args* args = &ctx->args->ac;11300arg_count = 0;11301for (unsigned i = 0, vgpr_arg = 0, vgpr_reg = 0; i < args->arg_count; i++) {11302if (args->args[i].file != AC_ARG_VGPR) {11303arg_count++;11304continue;11305}1130611307if (!(ctx->program->config->spi_ps_input_addr & (1 << vgpr_arg))) {11308args->args[i].skip = true;11309} else {11310args->args[i].offset = vgpr_reg;11311vgpr_reg += args->args[i].size;11312arg_count++;11313}11314vgpr_arg++;11315}11316}1131711318aco_ptr<Pseudo_instruction> startpgm{11319create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, arg_count)};11320for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) {11321if (ctx->args->ac.args[i].skip)11322continue;1132311324enum ac_arg_regfile file = ctx->args->ac.args[i].file;11325unsigned size = ctx->args->ac.args[i].size;11326unsigned reg = ctx->args->ac.args[i].offset;11327RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);11328Temp dst = ctx->program->allocateTmp(type);11329ctx->arg_temps[i] = dst;11330startpgm->definitions[arg] = Definition(dst);11331startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});11332arg++;11333}11334Pseudo_instruction* instr = startpgm.get();11335ctx->block->instructions.push_back(std::move(startpgm));1133611337/* Stash these in the program so that they can be accessed later when11338* handling spilling.11339*/11340ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);11341ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset);1134211343return instr;11344}1134511346void11347fix_ls_vgpr_init_bug(isel_context* ctx, Pseudo_instruction* startpgm)11348{11349assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);11350Builder bld(ctx->program, ctx->block);11351constexpr unsigned hs_idx = 1u;11352Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),11353get_arg(ctx, ctx->args->ac.merged_wave_info),11354Operand::c32((8u << 16) | (hs_idx * 8u)));11355Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());1135611357/* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */1135811359Temp instance_id =11360bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.vertex_id),11361get_arg(ctx, ctx->args->ac.instance_id), ls_has_nonzero_hs_threads);11362Temp vs_rel_patch_id =11363bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),11364get_arg(ctx, ctx->args->ac.vs_rel_patch_id), ls_has_nonzero_hs_threads);11365Temp vertex_id =11366bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_patch_id),11367get_arg(ctx, ctx->args->ac.vertex_id), ls_has_nonzero_hs_threads);1136811369ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;11370ctx->arg_temps[ctx->args->ac.vs_rel_patch_id.arg_index] = vs_rel_patch_id;11371ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;11372}1137311374void11375split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)11376{11377/* Split all arguments except for the first (ring_offsets) and the last11378* (exec) so that the dead channels don't stay live throughout the program.11379*/11380for (int i = 1; i < startpgm->definitions.size(); i++) {11381if (startpgm->definitions[i].regClass().size() > 1) {11382emit_split_vector(ctx, startpgm->definitions[i].getTemp(),11383startpgm->definitions[i].regClass().size());11384}11385}11386}1138711388void11389handle_bc_optimize(isel_context* ctx)11390{11391/* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */11392Builder bld(ctx->program, ctx->block);11393uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;11394bool uses_center =11395G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);11396bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) ||11397G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);11398ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);11399ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);11400if (uses_center && uses_centroid) {11401Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),11402get_arg(ctx, ctx->args->ac.prim_mask), Operand::zero());1140311404if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {11405Temp new_coord[2];11406for (unsigned i = 0; i < 2; i++) {11407Temp persp_centroid =11408emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);11409Temp persp_center =11410emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);11411new_coord[i] =11412bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), persp_centroid, persp_center, sel);11413}11414ctx->persp_centroid = bld.tmp(v2);11415bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),11416Operand(new_coord[0]), Operand(new_coord[1]));11417emit_split_vector(ctx, ctx->persp_centroid, 2);11418}1141911420if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {11421Temp new_coord[2];11422for (unsigned i = 0; i < 2; i++) {11423Temp linear_centroid =11424emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);11425Temp linear_center =11426emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);11427new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), linear_centroid,11428linear_center, sel);11429}11430ctx->linear_centroid = bld.tmp(v2);11431bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),11432Operand(new_coord[0]), Operand(new_coord[1]));11433emit_split_vector(ctx, ctx->linear_centroid, 2);11434}11435}11436}1143711438void11439setup_fp_mode(isel_context* ctx, nir_shader* shader)11440{11441Program* program = ctx->program;1144211443unsigned float_controls = shader->info.float_controls_execution_mode;1144411445program->next_fp_mode.preserve_signed_zero_inf_nan32 =11446float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;11447program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =11448float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |11449FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);1145011451program->next_fp_mode.must_flush_denorms32 =11452float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;11453program->next_fp_mode.must_flush_denorms16_64 =11454float_controls &11455(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);1145611457program->next_fp_mode.care_about_round32 =11458float_controls &11459(FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);1146011461program->next_fp_mode.care_about_round16_64 =11462float_controls &11463(FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |11464FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);1146511466/* default to preserving fp16 and fp64 denorms, since it's free for fp64 and11467* the precision seems needed for Wolfenstein: Youngblood to render correctly */11468if (program->next_fp_mode.must_flush_denorms16_64)11469program->next_fp_mode.denorm16_64 = 0;11470else11471program->next_fp_mode.denorm16_64 = fp_denorm_keep;1147211473/* preserving fp32 denorms is expensive, so only do it if asked */11474if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)11475program->next_fp_mode.denorm32 = fp_denorm_keep;11476else11477program->next_fp_mode.denorm32 = 0;1147811479if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)11480program->next_fp_mode.round32 = fp_round_tz;11481else11482program->next_fp_mode.round32 = fp_round_ne;1148311484if (float_controls &11485(FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))11486program->next_fp_mode.round16_64 = fp_round_tz;11487else11488program->next_fp_mode.round16_64 = fp_round_ne;1148911490ctx->block->fp_mode = program->next_fp_mode;11491}1149211493void11494cleanup_cfg(Program* program)11495{11496/* create linear_succs/logical_succs */11497for (Block& BB : program->blocks) {11498for (unsigned idx : BB.linear_preds)11499program->blocks[idx].linear_succs.emplace_back(BB.index);11500for (unsigned idx : BB.logical_preds)11501program->blocks[idx].logical_succs.emplace_back(BB.index);11502}11503}1150411505Temp11506lanecount_to_mask(isel_context* ctx, Temp count, bool allow64 = true)11507{11508assert(count.regClass() == s1);1150911510Builder bld(ctx->program, ctx->block);11511Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());11512Temp cond;1151311514if (ctx->program->wave_size == 64) {11515/* If we know that all 64 threads can't be active at a time, we just use the mask as-is */11516if (!allow64)11517return mask;1151811519/* Special case for 64 active invocations, because 64 doesn't work with s_bfm */11520Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,11521Operand::c32(6u /* log2(64) */));11522cond =11523bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));11524} else {11525/* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of11526* the register */11527cond = emit_extract_vector(ctx, mask, 0, bld.lm);11528}1152911530return cond;11531}1153211533Temp11534merged_wave_info_to_mask(isel_context* ctx, unsigned i)11535{11536Builder bld(ctx->program, ctx->block);1153711538/* lanecount_to_mask() only cares about s0.u[6:0] so we don't need either s_bfe nor s_and here */11539Temp count = i == 011540? get_arg(ctx, ctx->args->ac.merged_wave_info)11541: bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),11542get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(i * 8u));1154311544return lanecount_to_mask(ctx, count);11545}1154611547void11548ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt)11549{11550assert(vtx_cnt.id() && prm_cnt.id());1155111552Builder bld(ctx->program, ctx->block);11553Temp prm_cnt_0;1155411555if (ctx->program->chip_class == GFX10 &&11556(ctx->stage.has(SWStage::GS) || ctx->program->info->has_ngg_culling)) {11557/* Navi 1x workaround: check whether the workgroup has no output.11558* If so, change the number of exported vertices and primitives to 1.11559*/11560prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand::zero());11561prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), prm_cnt,11562bld.scc(prm_cnt_0));11563vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), vtx_cnt,11564bld.scc(prm_cnt_0));11565}1156611567/* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */11568Temp tmp =11569bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand::c32(12u));11570tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);1157111572/* Request the SPI to allocate space for the primitives and vertices11573* that will be exported by the threadgroup.11574*/11575bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);1157611577if (prm_cnt_0.id()) {11578/* Navi 1x workaround: export a triangle with NaN coordinates when NGG has no output.11579* It can't have all-zero positions because that would render an undesired pixel with11580* conservative rasterization.11581*/11582Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));11583Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),11584Operand::c32_or_c64(1u, ctx->program->wave_size == 64), first_lane);11585cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond,11586Operand::zero(ctx->program->wave_size == 64 ? 8 : 4), bld.scc(prm_cnt_0));1158711588if_context ic_prim_0;11589begin_divergent_if_then(ctx, &ic_prim_0, cond);11590bld.reset(ctx->block);11591ctx->block->kind |= block_kind_export_end;1159211593/* Use zero: means that it's a triangle whose every vertex index is 0. */11594Temp zero = bld.copy(bld.def(v1), Operand::zero());11595/* Use NaN for the coordinates, so that the rasterizer allways culls it. */11596Temp nan_coord = bld.copy(bld.def(v1), Operand::c32(-1u));1159711598bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), 1 /* enabled mask */,11599V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, true /* done */,11600false /* valid mask */);11601bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, 0xf /* enabled mask */,11602V_008DFC_SQ_EXP_POS /* dest */, false /* compressed */, true /* done */,11603true /* valid mask */);1160411605begin_divergent_if_else(ctx, &ic_prim_0);11606end_divergent_if(ctx, &ic_prim_0);11607bld.reset(ctx->block);11608}11609}1161011611} /* end namespace */1161211613void11614select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,11615ac_shader_config* config, struct radv_shader_args* args)11616{11617isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);11618if_context ic_merged_wave_info;11619bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);1162011621for (unsigned i = 0; i < shader_count; i++) {11622nir_shader* nir = shaders[i];11623init_context(&ctx, nir);1162411625setup_fp_mode(&ctx, nir);1162611627if (!i) {11628/* needs to be after init_context() for FS */11629Pseudo_instruction* startpgm = add_startpgm(&ctx);11630append_logical_start(ctx.block);1163111632if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))11633fix_ls_vgpr_init_bug(&ctx, startpgm);1163411635split_arguments(&ctx, startpgm);1163611637if (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES)) {11638Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);11639}11640}1164111642/* In a merged VS+TCS HS, the VS implementation can be completely empty. */11643nir_function_impl* func = nir_shader_get_entrypoint(nir);11644bool empty_shader =11645nir_cf_list_is_empty_block(&func->body) &&11646((nir->info.stage == MESA_SHADER_VERTEX &&11647(ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||11648(nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));1164911650bool check_merged_wave_info =11651ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));11652bool endif_merged_wave_info =11653ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));1165411655if (program->chip_class == GFX10 && program->stage.hw == HWStage::NGG &&11656program->stage.num_sw_stages() == 1) {11657/* Workaround for Navi1x HW bug to ensure that all NGG waves launch before11658* s_sendmsg(GS_ALLOC_REQ). */11659Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);11660}1166111662if (check_merged_wave_info) {11663Temp cond = merged_wave_info_to_mask(&ctx, i);11664begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);11665}1166611667if (i) {11668Builder bld(ctx.program, ctx.block);1166911670/* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */11671bool tcs_skip_barrier = ctx.stage == vertex_tess_control_hs &&11672ctx.tcs_temp_only_inputs == nir->info.inputs_read;1167311674if (!ngg_gs && !tcs_skip_barrier)11675create_workgroup_barrier(bld);1167611677if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {11678ctx.gs_wave_id = bld.pseudo(aco_opcode::p_extract, bld.def(s1, m0), bld.def(s1, scc),11679get_arg(&ctx, args->ac.merged_wave_info), Operand::c32(2u),11680Operand::c32(8u), Operand::zero());11681}11682} else if (ctx.stage == geometry_gs)11683ctx.gs_wave_id = get_arg(&ctx, args->ac.gs_wave_id);1168411685if (ctx.stage == fragment_fs)11686handle_bc_optimize(&ctx);1168711688visit_cf_list(&ctx, &func->body);1168911690if (ctx.program->info->so.num_outputs && ctx.stage.hw == HWStage::VS)11691emit_streamout(&ctx, 0);1169211693if (ctx.stage.hw == HWStage::VS) {11694create_vs_exports(&ctx);11695} else if (nir->info.stage == MESA_SHADER_GEOMETRY && !ngg_gs) {11696Builder bld(ctx.program, ctx.block);11697bld.barrier(aco_opcode::p_barrier,11698memory_sync_info(storage_vmem_output, semantic_release, scope_device));11699bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1,11700sendmsg_gs_done(false, false, 0));11701}1170211703if (ctx.stage == fragment_fs) {11704create_fs_exports(&ctx);11705}1170611707if (endif_merged_wave_info) {11708begin_divergent_if_else(&ctx, &ic_merged_wave_info);11709end_divergent_if(&ctx, &ic_merged_wave_info);11710}1171111712if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {11713/* Outputs of the previous stage are inputs to the next stage */11714ctx.inputs = ctx.outputs;11715ctx.outputs = shader_io_state();11716}1171711718cleanup_context(&ctx);11719}1172011721program->config->float_mode = program->blocks[0].fp_mode.val;1172211723append_logical_end(ctx.block);11724ctx.block->kind |= block_kind_uniform;11725Builder bld(ctx.program, ctx.block);11726bld.sopp(aco_opcode::s_endpgm);1172711728cleanup_cfg(program);11729}1173011731void11732select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,11733struct radv_shader_args* args)11734{11735isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);1173611737ctx.block->fp_mode = program->next_fp_mode;1173811739add_startpgm(&ctx);11740append_logical_start(ctx.block);1174111742Builder bld(ctx.program, ctx.block);1174311744Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4),11745program->private_segment_buffer, Operand::c32(RING_GSVS_VS * 16u));1174611747Operand stream_id = Operand::zero();11748if (args->shader_info->so.num_outputs)11749stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),11750get_arg(&ctx, ctx.args->ac.streamout_config), Operand::c32(0x20018u));1175111752Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u),11753get_arg(&ctx, ctx.args->ac.vertex_id));1175411755std::stack<if_context> if_contexts;1175611757for (unsigned stream = 0; stream < 4; stream++) {11758if (stream_id.isConstant() && stream != stream_id.constantValue())11759continue;1176011761unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];11762if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))11763continue;1176411765memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));1176611767if (!stream_id.isConstant()) {11768Temp cond =11769bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand::c32(stream));11770if_contexts.emplace();11771begin_uniform_if_then(&ctx, &if_contexts.top(), cond);11772bld.reset(ctx.block);11773}1177411775unsigned offset = 0;11776for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {11777if (args->shader_info->gs.output_streams[i] != stream)11778continue;1177911780unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];11781unsigned length = util_last_bit(output_usage_mask);11782for (unsigned j = 0; j < length; ++j) {11783if (!(output_usage_mask & (1 << j)))11784continue;1178511786Temp val = bld.tmp(v1);11787unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;11788load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true,11789true, true);1179011791ctx.outputs.mask[i] |= 1 << j;11792ctx.outputs.temps[i * 4u + j] = val;1179311794offset++;11795}11796}1179711798if (args->shader_info->so.num_outputs) {11799emit_streamout(&ctx, stream);11800bld.reset(ctx.block);11801}1180211803if (stream == 0) {11804create_vs_exports(&ctx);11805}1180611807if (!stream_id.isConstant()) {11808begin_uniform_if_else(&ctx, &if_contexts.top());11809bld.reset(ctx.block);11810}11811}1181211813while (!if_contexts.empty()) {11814end_uniform_if(&ctx, &if_contexts.top());11815if_contexts.pop();11816}1181711818program->config->float_mode = program->blocks[0].fp_mode.val;1181911820append_logical_end(ctx.block);11821ctx.block->kind |= block_kind_uniform;11822bld.reset(ctx.block);11823bld.sopp(aco_opcode::s_endpgm);1182411825cleanup_cfg(program);11826}1182711828void11829select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,11830struct radv_shader_args* args)11831{11832assert(args->options->chip_class == GFX8);1183311834init_program(program, compute_cs, args->shader_info, args->options->chip_class,11835args->options->family, args->options->wgp_mode, config);1183611837isel_context ctx = {};11838ctx.program = program;11839ctx.args = args;11840ctx.options = args->options;11841ctx.stage = program->stage;1184211843ctx.block = ctx.program->create_and_insert_block();11844ctx.block->kind = block_kind_top_level;1184511846program->workgroup_size = 1; /* XXX */1184711848add_startpgm(&ctx);11849append_logical_start(ctx.block);1185011851Builder bld(ctx.program, ctx.block);1185211853/* Load the buffer descriptor from TMA. */11854bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),11855Operand::zero());1185611857/* Store TTMP0-TTMP1. */11858bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),11859Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);1186011861uint32_t hw_regs_idx[] = {118622, /* HW_REG_STATUS */118633, /* HW_REG_TRAP_STS */118644, /* HW_REG_HW_ID */118657, /* HW_REG_IB_STS */11866};1186711868/* Store some hardware registers. */11869for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {11870/* "((size - 1) << 11) | register" */11871bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),11872((20 - 1) << 11) | hw_regs_idx[i]);1187311874bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),11875Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);11876}1187711878program->config->float_mode = program->blocks[0].fp_mode.val;1187911880append_logical_end(ctx.block);11881ctx.block->kind |= block_kind_uniform;11882bld.sopp(aco_opcode::s_endpgm);1188311884cleanup_cfg(program);11885}11886} // namespace aco118871188811889