Path: blob/21.2-virgl/src/amd/llvm/ac_llvm_build.c
7237 views
/*1* Copyright 2014 Advanced Micro Devices, Inc.2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the5* "Software"), to deal in the Software without restriction, including6* without limitation the rights to use, copy, modify, merge, publish,7* distribute, sub license, and/or sell copies of the Software, and to8* permit persons to whom the Software is furnished to do so, subject to9* the following conditions:10*11* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR12* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,13* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL14* THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,15* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR16* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE17* USE OR OTHER DEALINGS IN THE SOFTWARE.18*19* The above copyright notice and this permission notice (including the20* next paragraph) shall be included in all copies or substantial portions21* of the Software.22*23*/24/* based on pieces from si_pipe.c and radeon_llvm_emit.c */25#include "ac_llvm_build.h"2627#include "ac_exp_param.h"28#include "ac_llvm_util.h"29#include "ac_shader_util.h"30#include "c11/threads.h"31#include "shader_enums.h"32#include "sid.h"33#include "util/bitscan.h"34#include "util/macros.h"35#include "util/u_atomic.h"36#include "util/u_math.h"37#include <llvm-c/Core.h>38#include <llvm/Config/llvm-config.h>3940#include <assert.h>41#include <stdio.h>4243#define AC_LLVM_INITIAL_CF_DEPTH 44445/* Data for if/else/endif and bgnloop/endloop control flow structures.46*/47struct ac_llvm_flow {48/* Loop exit or next part of if/else/endif. */49LLVMBasicBlockRef next_block;50LLVMBasicBlockRef loop_entry_block;51};5253/* Initialize module-independent parts of the context.54*55* The caller is responsible for initializing ctx::module and ctx::builder.56*/57void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,58enum chip_class chip_class, enum radeon_family family,59const struct radeon_info *info,60enum ac_float_mode float_mode, unsigned wave_size,61unsigned ballot_mask_bits)62{63ctx->context = LLVMContextCreate();6465ctx->chip_class = chip_class;66ctx->family = family;67ctx->info = info;68ctx->wave_size = wave_size;69ctx->ballot_mask_bits = ballot_mask_bits;70ctx->float_mode = float_mode;71ctx->module = ac_create_module(compiler->tm, ctx->context);72ctx->builder = ac_create_builder(ctx->context, float_mode);7374ctx->voidt = LLVMVoidTypeInContext(ctx->context);75ctx->i1 = LLVMInt1TypeInContext(ctx->context);76ctx->i8 = LLVMInt8TypeInContext(ctx->context);77ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);78ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);79ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);80ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);81ctx->intptr = ctx->i32;82ctx->f16 = LLVMHalfTypeInContext(ctx->context);83ctx->f32 = LLVMFloatTypeInContext(ctx->context);84ctx->f64 = LLVMDoubleTypeInContext(ctx->context);85ctx->v2i16 = LLVMVectorType(ctx->i16, 2);86ctx->v4i16 = LLVMVectorType(ctx->i16, 4);87ctx->v2f16 = LLVMVectorType(ctx->f16, 2);88ctx->v4f16 = LLVMVectorType(ctx->f16, 4);89ctx->v2i32 = LLVMVectorType(ctx->i32, 2);90ctx->v3i32 = LLVMVectorType(ctx->i32, 3);91ctx->v4i32 = LLVMVectorType(ctx->i32, 4);92ctx->v2f32 = LLVMVectorType(ctx->f32, 2);93ctx->v3f32 = LLVMVectorType(ctx->f32, 3);94ctx->v4f32 = LLVMVectorType(ctx->f32, 4);95ctx->v8i32 = LLVMVectorType(ctx->i32, 8);96ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);97ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);9899ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);100ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);101ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);102ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);103ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);104ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);105ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);106ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);107ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);108ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);109ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);110ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);111ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);112ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);113ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);114ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);115116ctx->i1false = LLVMConstInt(ctx->i1, 0, false);117ctx->i1true = LLVMConstInt(ctx->i1, 1, false);118119ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);120121ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);122123ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);124125ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);126ctx->flow = calloc(1, sizeof(*ctx->flow));127}128129void ac_llvm_context_dispose(struct ac_llvm_context *ctx)130{131free(ctx->flow->stack);132free(ctx->flow);133ctx->flow = NULL;134}135136int ac_get_llvm_num_components(LLVMValueRef value)137{138LLVMTypeRef type = LLVMTypeOf(value);139unsigned num_components =140LLVMGetTypeKind(type) == LLVMVectorTypeKind ? LLVMGetVectorSize(type) : 1;141return num_components;142}143144LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)145{146if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {147assert(index == 0);148return value;149}150151return LLVMBuildExtractElement(ac->builder, value, LLVMConstInt(ac->i32, index, false), "");152}153154int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)155{156if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)157type = LLVMGetElementType(type);158159if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)160return LLVMGetIntTypeWidth(type);161162if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {163if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)164return 32;165}166167if (type == ctx->f16)168return 16;169if (type == ctx->f32)170return 32;171if (type == ctx->f64)172return 64;173174unreachable("Unhandled type kind in get_elem_bits");175}176177unsigned ac_get_type_size(LLVMTypeRef type)178{179LLVMTypeKind kind = LLVMGetTypeKind(type);180181switch (kind) {182case LLVMIntegerTypeKind:183return LLVMGetIntTypeWidth(type) / 8;184case LLVMHalfTypeKind:185return 2;186case LLVMFloatTypeKind:187return 4;188case LLVMDoubleTypeKind:189return 8;190case LLVMPointerTypeKind:191if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)192return 4;193return 8;194case LLVMVectorTypeKind:195return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));196case LLVMArrayTypeKind:197return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));198default:199assert(0);200return 0;201}202}203204static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)205{206if (t == ctx->i1)207return ctx->i1;208else if (t == ctx->i8)209return ctx->i8;210else if (t == ctx->f16 || t == ctx->i16)211return ctx->i16;212else if (t == ctx->f32 || t == ctx->i32)213return ctx->i32;214else if (t == ctx->f64 || t == ctx->i64)215return ctx->i64;216else217unreachable("Unhandled integer size");218}219220LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)221{222if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {223LLVMTypeRef elem_type = LLVMGetElementType(t);224return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));225}226if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {227switch (LLVMGetPointerAddressSpace(t)) {228case AC_ADDR_SPACE_GLOBAL:229return ctx->i64;230case AC_ADDR_SPACE_CONST_32BIT:231case AC_ADDR_SPACE_LDS:232return ctx->i32;233default:234unreachable("unhandled address space");235}236}237return to_integer_type_scalar(ctx, t);238}239240LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)241{242LLVMTypeRef type = LLVMTypeOf(v);243if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {244return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");245}246return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");247}248249LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)250{251LLVMTypeRef type = LLVMTypeOf(v);252if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)253return v;254return ac_to_integer(ctx, v);255}256257static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)258{259if (t == ctx->i8)260return ctx->i8;261else if (t == ctx->i16 || t == ctx->f16)262return ctx->f16;263else if (t == ctx->i32 || t == ctx->f32)264return ctx->f32;265else if (t == ctx->i64 || t == ctx->f64)266return ctx->f64;267else268unreachable("Unhandled float size");269}270271LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)272{273if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {274LLVMTypeRef elem_type = LLVMGetElementType(t);275return LLVMVectorType(to_float_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));276}277return to_float_type_scalar(ctx, t);278}279280LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)281{282LLVMTypeRef type = LLVMTypeOf(v);283return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");284}285286LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,287LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,288unsigned attrib_mask)289{290LLVMValueRef function, call;291bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);292293function = LLVMGetNamedFunction(ctx->module, name);294if (!function) {295LLVMTypeRef param_types[32], function_type;296unsigned i;297298assert(param_count <= 32);299300for (i = 0; i < param_count; ++i) {301assert(params[i]);302param_types[i] = LLVMTypeOf(params[i]);303}304function_type = LLVMFunctionType(return_type, param_types, param_count, 0);305function = LLVMAddFunction(ctx->module, name, function_type);306307LLVMSetFunctionCallConv(function, LLVMCCallConv);308LLVMSetLinkage(function, LLVMExternalLinkage);309310if (!set_callsite_attrs)311ac_add_func_attributes(ctx->context, function, attrib_mask);312}313314call = LLVMBuildCall(ctx->builder, function, params, param_count, "");315if (set_callsite_attrs)316ac_add_func_attributes(ctx->context, call, attrib_mask);317return call;318}319320/**321* Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with322* intrinsic names).323*/324void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)325{326LLVMTypeRef elem_type = type;327328if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {329unsigned count = LLVMCountStructElementTypes(type);330int ret = snprintf(buf, bufsize, "sl_");331buf += ret;332bufsize -= ret;333334LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));335LLVMGetStructElementTypes(type, elems);336337for (unsigned i = 0; i < count; i++) {338ac_build_type_name_for_intr(elems[i], buf, bufsize);339ret = strlen(buf);340buf += ret;341bufsize -= ret;342}343344snprintf(buf, bufsize, "s");345return;346}347348assert(bufsize >= 8);349if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {350int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));351if (ret < 0) {352char *type_name = LLVMPrintTypeToString(type);353fprintf(stderr, "Error building type name for: %s\n", type_name);354LLVMDisposeMessage(type_name);355return;356}357elem_type = LLVMGetElementType(type);358buf += ret;359bufsize -= ret;360}361switch (LLVMGetTypeKind(elem_type)) {362default:363break;364case LLVMIntegerTypeKind:365snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));366break;367case LLVMHalfTypeKind:368snprintf(buf, bufsize, "f16");369break;370case LLVMFloatTypeKind:371snprintf(buf, bufsize, "f32");372break;373case LLVMDoubleTypeKind:374snprintf(buf, bufsize, "f64");375break;376}377}378379/**380* Helper function that builds an LLVM IR PHI node and immediately adds381* incoming edges.382*/383LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,384LLVMValueRef *values, LLVMBasicBlockRef *blocks)385{386LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");387LLVMAddIncoming(phi, values, blocks, count_incoming);388return phi;389}390391void ac_build_s_barrier(struct ac_llvm_context *ctx)392{393ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);394}395396/* Prevent optimizations (at least of memory accesses) across the current397* point in the program by emitting empty inline assembly that is marked as398* having side effects.399*400* Optionally, a value can be passed through the inline assembly to prevent401* LLVM from hoisting calls to ReadNone functions.402*/403void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)404{405static int counter = 0;406407LLVMBuilderRef builder = ctx->builder;408char code[16];409const char *constraint = sgpr ? "=s,0" : "=v,0";410411snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));412413if (!pgpr) {414LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);415LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);416LLVMBuildCall(builder, inlineasm, NULL, 0, "");417} else if (LLVMTypeOf(*pgpr) == ctx->i32) {418/* Simple version for i32 that allows the caller to set LLVM metadata on the call419* instruction. */420LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);421LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);422423*pgpr = LLVMBuildCall(builder, inlineasm, pgpr, 1, "");424} else if (LLVMTypeOf(*pgpr) == ctx->i16) {425/* Simple version for i16 that allows the caller to set LLVM metadata on the call426* instruction. */427LLVMTypeRef ftype = LLVMFunctionType(ctx->i16, &ctx->i16, 1, false);428LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);429430*pgpr = LLVMBuildCall(builder, inlineasm, pgpr, 1, "");431} else if (LLVMGetTypeKind(LLVMTypeOf(*pgpr)) == LLVMPointerTypeKind) {432LLVMTypeRef type = LLVMTypeOf(*pgpr);433LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);434LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);435436*pgpr = LLVMBuildCall(builder, inlineasm, pgpr, 1, "");437} else {438LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);439LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);440LLVMTypeRef type = LLVMTypeOf(*pgpr);441unsigned bitsize = ac_get_elem_bits(ctx, type);442LLVMValueRef vgpr = *pgpr;443LLVMTypeRef vgpr_type;444unsigned vgpr_size;445LLVMValueRef vgpr0;446447if (bitsize < 32)448vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");449450vgpr_type = LLVMTypeOf(vgpr);451vgpr_size = ac_get_type_size(vgpr_type);452453assert(vgpr_size % 4 == 0);454455vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");456vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");457vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");458vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");459vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");460461if (bitsize < 32)462vgpr = LLVMBuildTrunc(builder, vgpr, type, "");463464*pgpr = vgpr;465}466}467468LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)469{470const char *subgroup = "llvm.readcyclecounter";471const char *name = scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;472473LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);474return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");475}476477LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)478{479const char *name;480481if (LLVMTypeOf(value) == ctx->i1)482value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");483484if (ctx->wave_size == 64)485name = "llvm.amdgcn.icmp.i64.i32";486else487name = "llvm.amdgcn.icmp.i32.i32";488489LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};490491/* We currently have no other way to prevent LLVM from lifting the icmp492* calls to a dominating basic block.493*/494ac_build_optimization_barrier(ctx, &args[0], false);495496args[0] = ac_to_integer(ctx, args[0]);497498return ac_build_intrinsic(499ctx, name, ctx->iN_wavemask, args, 3,500AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);501}502503LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)504{505const char *name;506507if (ctx->wave_size == 64)508name = "llvm.amdgcn.icmp.i64.i1";509else510name = "llvm.amdgcn.icmp.i32.i1";511512LLVMValueRef args[3] = {513value,514ctx->i1false,515LLVMConstInt(ctx->i32, LLVMIntNE, 0),516};517518return ac_build_intrinsic(519ctx, name, ctx->iN_wavemask, args, 3,520AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);521}522523LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)524{525LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);526LLVMValueRef vote_set = ac_build_ballot(ctx, value);527return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");528}529530LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)531{532LLVMValueRef vote_set = ac_build_ballot(ctx, value);533return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0),534"");535}536537LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)538{539LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);540LLVMValueRef vote_set = ac_build_ballot(ctx, value);541542LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");543LLVMValueRef none =544LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0), "");545return LLVMBuildOr(ctx->builder, all, none, "");546}547548LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,549unsigned value_count, unsigned component)550{551LLVMValueRef vec = NULL;552553if (value_count == 1) {554return values[component];555} else if (!value_count)556unreachable("value_count is 0");557558for (unsigned i = component; i < value_count + component; i++) {559LLVMValueRef value = values[i];560561if (i == component)562vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));563LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);564vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");565}566return vec;567}568569LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,570unsigned value_count, unsigned value_stride, bool load,571bool always_vector)572{573LLVMBuilderRef builder = ctx->builder;574LLVMValueRef vec = NULL;575unsigned i;576577if (value_count == 1 && !always_vector) {578if (load)579return LLVMBuildLoad(builder, values[0], "");580return values[0];581} else if (!value_count)582unreachable("value_count is 0");583584for (i = 0; i < value_count; i++) {585LLVMValueRef value = values[i * value_stride];586if (load)587value = LLVMBuildLoad(builder, value, "");588589if (!i)590vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));591LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);592vec = LLVMBuildInsertElement(builder, vec, value, index, "");593}594return vec;595}596597LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,598unsigned value_count)599{600return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);601}602603LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)604{605unsigned a_size = ac_get_llvm_num_components(a);606unsigned b_size = ac_get_llvm_num_components(b);607608LLVMValueRef *elems = alloca((a_size + b_size) * sizeof(LLVMValueRef));609for (unsigned i = 0; i < a_size; i++)610elems[i] = ac_llvm_extract_elem(ctx, a, i);611for (unsigned i = 0; i < b_size; i++)612elems[a_size + i] = ac_llvm_extract_elem(ctx, b, i);613614return ac_build_gather_values(ctx, elems, a_size + b_size);615}616617/* Expand a scalar or vector to <dst_channels x type> by filling the remaining618* channels with undef. Extract at most src_channels components from the input.619*/620LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,621unsigned src_channels, unsigned dst_channels)622{623LLVMTypeRef elemtype;624LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));625626if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {627unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));628629if (src_channels == dst_channels && vec_size == dst_channels)630return value;631632src_channels = MIN2(src_channels, vec_size);633634for (unsigned i = 0; i < src_channels; i++)635chan[i] = ac_llvm_extract_elem(ctx, value, i);636637elemtype = LLVMGetElementType(LLVMTypeOf(value));638} else {639if (src_channels) {640assert(src_channels == 1);641chan[0] = value;642}643elemtype = LLVMTypeOf(value);644}645646for (unsigned i = src_channels; i < dst_channels; i++)647chan[i] = LLVMGetUndef(elemtype);648649return ac_build_gather_values(ctx, chan, dst_channels);650}651652/* Extract components [start, start + channels) from a vector.653*/654LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,655unsigned channels)656{657LLVMValueRef *const chan = alloca(channels * sizeof(LLVMValueRef));658659for (unsigned i = 0; i < channels; i++)660chan[i] = ac_llvm_extract_elem(ctx, value, i + start);661662return ac_build_gather_values(ctx, chan, channels);663}664665/* Expand a scalar or vector to <4 x type> by filling the remaining channels666* with undef. Extract at most num_channels components from the input.667*/668LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,669unsigned num_channels)670{671return ac_build_expand(ctx, value, num_channels, 4);672}673674LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)675{676unsigned type_size = ac_get_type_size(LLVMTypeOf(value));677const char *name;678679if (type_size == 2)680name = "llvm.rint.f16";681else if (type_size == 4)682name = "llvm.rint.f32";683else684name = "llvm.rint.f64";685686return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, AC_FUNC_ATTR_READNONE);687}688689LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)690{691unsigned type_size = ac_get_type_size(LLVMTypeOf(den));692const char *name;693694/* For doubles, we need precise division to pass GLCTS. */695if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && type_size == 8)696return LLVMBuildFDiv(ctx->builder, num, den, "");697698if (type_size == 2)699name = "llvm.amdgcn.rcp.f16";700else if (type_size == 4)701name = "llvm.amdgcn.rcp.f32";702else703name = "llvm.amdgcn.rcp.f64";704705LLVMValueRef rcp =706ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, AC_FUNC_ATTR_READNONE);707708return LLVMBuildFMul(ctx->builder, num, rcp, "");709}710711/* See fast_idiv_by_const.h. */712/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */713LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,714LLVMValueRef multiplier, LLVMValueRef pre_shift,715LLVMValueRef post_shift, LLVMValueRef increment)716{717LLVMBuilderRef builder = ctx->builder;718719num = LLVMBuildLShr(builder, num, pre_shift, "");720num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),721LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");722num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");723num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");724num = LLVMBuildTrunc(builder, num, ctx->i32, "");725return LLVMBuildLShr(builder, num, post_shift, "");726}727728/* See fast_idiv_by_const.h. */729/* If num != UINT_MAX, this more efficient version can be used. */730/* Set: increment = util_fast_udiv_info::increment; */731LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,732LLVMValueRef multiplier, LLVMValueRef pre_shift,733LLVMValueRef post_shift, LLVMValueRef increment)734{735LLVMBuilderRef builder = ctx->builder;736737num = LLVMBuildLShr(builder, num, pre_shift, "");738num = LLVMBuildNUWAdd(builder, num, increment, "");739num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),740LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");741num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");742num = LLVMBuildTrunc(builder, num, ctx->i32, "");743return LLVMBuildLShr(builder, num, post_shift, "");744}745746/* See fast_idiv_by_const.h. */747/* Both operands must fit in 31 bits and the divisor must not be 1. */748LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,749LLVMValueRef multiplier, LLVMValueRef post_shift)750{751LLVMBuilderRef builder = ctx->builder;752753num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),754LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");755num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");756num = LLVMBuildTrunc(builder, num, ctx->i32, "");757return LLVMBuildLShr(builder, num, post_shift, "");758}759760/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27761* of the OpenGL 4.5 (Compatibility Profile) specification, except ma is762* already multiplied by two. id is the cube face number.763*/764struct cube_selection_coords {765LLVMValueRef stc[2];766LLVMValueRef ma;767LLVMValueRef id;768};769770static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3],771struct cube_selection_coords *out)772{773LLVMTypeRef f32 = ctx->f32;774775out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", f32, in, 3, AC_FUNC_ATTR_READNONE);776out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", f32, in, 3, AC_FUNC_ATTR_READNONE);777out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", f32, in, 3, AC_FUNC_ATTR_READNONE);778out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", f32, in, 3, AC_FUNC_ATTR_READNONE);779}780781/**782* Build a manual selection sequence for cube face sc/tc coordinates and783* major axis vector (multiplied by 2 for consistency) for the given784* vec3 \p coords, for the face implied by \p selcoords.785*786* For the major axis, we always adjust the sign to be in the direction of787* selcoords.ma; i.e., a positive out_ma means that coords is pointed towards788* the selcoords major axis.789*/790static void build_cube_select(struct ac_llvm_context *ctx,791const struct cube_selection_coords *selcoords,792const LLVMValueRef *coords, LLVMValueRef *out_st,793LLVMValueRef *out_ma)794{795LLVMBuilderRef builder = ctx->builder;796LLVMTypeRef f32 = LLVMTypeOf(coords[0]);797LLVMValueRef is_ma_positive;798LLVMValueRef sgn_ma;799LLVMValueRef is_ma_z, is_not_ma_z;800LLVMValueRef is_ma_y;801LLVMValueRef is_ma_x;802LLVMValueRef sgn;803LLVMValueRef tmp;804805is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), "");806sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0),807LLVMConstReal(f32, -1.0), "");808809is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");810is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");811is_ma_y = LLVMBuildAnd(812builder, is_not_ma_z,813LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");814is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");815816/* Select sc */817tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");818sgn = LLVMBuildSelect(819builder, is_ma_y, LLVMConstReal(f32, 1.0),820LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");821out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");822823/* Select tc */824tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");825sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), "");826out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");827828/* Select ma */829tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],830LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");831tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);832*out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");833}834835void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,836LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)837{838839LLVMBuilderRef builder = ctx->builder;840struct cube_selection_coords selcoords;841LLVMValueRef coords[3];842LLVMValueRef invma;843844if (is_array && !is_lod) {845LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);846847/* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:848*849* "For Array forms, the array layer used will be850*851* max(0, min(d−1, floor(layer+0.5)))852*853* where d is the depth of the texture array and layer854* comes from the component indicated in the tables below.855* Workaroudn for an issue where the layer is taken from a856* helper invocation which happens to fall on a different857* layer due to extrapolation."858*859* GFX8 and earlier attempt to implement this in hardware by860* clamping the value of coords[2] = (8 * layer) + face.861* Unfortunately, this means that the we end up with the wrong862* face when clamping occurs.863*864* Clamp the layer earlier to work around the issue.865*/866if (ctx->chip_class <= GFX8) {867LLVMValueRef ge0;868ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");869tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");870}871872coords_arg[3] = tmp;873}874875build_cube_intrinsic(ctx, coords_arg, &selcoords);876877invma =878ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);879invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);880881for (int i = 0; i < 2; ++i)882coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");883884coords[2] = selcoords.id;885886if (is_deriv && derivs_arg) {887LLVMValueRef derivs[4];888int axis;889890/* Convert cube derivatives to 2D derivatives. */891for (axis = 0; axis < 2; axis++) {892LLVMValueRef deriv_st[2];893LLVMValueRef deriv_ma;894895/* Transform the derivative alongside the texture896* coordinate. Mathematically, the correct formula is897* as follows. Assume we're projecting onto the +Z face898* and denote by dx/dh the derivative of the (original)899* X texture coordinate with respect to horizontal900* window coordinates. The projection onto the +Z face901* plane is:902*903* f(x,z) = x/z904*905* Then df/dh = df/dx * dx/dh + df/dz * dz/dh906* = 1/z * dx/dh - x/z * 1/z * dz/dh.907*908* This motivatives the implementation below.909*910* Whether this actually gives the expected results for911* apps that might feed in derivatives obtained via912* finite differences is anyone's guess. The OpenGL spec913* seems awfully quiet about how textureGrad for cube914* maps should be handled.915*/916build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);917918deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");919920for (int i = 0; i < 2; ++i)921derivs[axis * 2 + i] =922LLVMBuildFSub(builder, LLVMBuildFMul(builder, deriv_st[i], invma, ""),923LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");924}925926memcpy(derivs_arg, derivs, sizeof(derivs));927}928929/* Shift the texture coordinate. This must be applied after the930* derivative calculation.931*/932for (int i = 0; i < 2; ++i)933coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");934935if (is_array) {936/* for cube arrays coord.z = coord.w(array_index) * 8 + face */937/* coords_arg.w component - array_index for cube arrays */938coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);939}940941memcpy(coords_arg, coords, sizeof(coords));942}943944LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,945LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,946LLVMValueRef j)947{948LLVMValueRef args[5];949LLVMValueRef p1;950951args[0] = i;952args[1] = llvm_chan;953args[2] = attr_number;954args[3] = params;955956p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);957958args[0] = p1;959args[1] = j;960args[2] = llvm_chan;961args[3] = attr_number;962args[4] = params;963964return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", ctx->f32, args, 5,965AC_FUNC_ATTR_READNONE);966}967968LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,969LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,970LLVMValueRef j, bool high_16bits)971{972LLVMValueRef args[6];973LLVMValueRef p1;974975args[0] = i;976args[1] = llvm_chan;977args[2] = attr_number;978args[3] = high_16bits ? ctx->i1true : ctx->i1false;979args[4] = params;980981p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5,982AC_FUNC_ATTR_READNONE);983984args[0] = p1;985args[1] = j;986args[2] = llvm_chan;987args[3] = attr_number;988args[4] = high_16bits ? ctx->i1true : ctx->i1false;989args[5] = params;990991return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6,992AC_FUNC_ATTR_READNONE);993}994995LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter,996LLVMValueRef llvm_chan, LLVMValueRef attr_number,997LLVMValueRef params)998{999LLVMValueRef args[4];10001001args[0] = parameter;1002args[1] = llvm_chan;1003args[2] = attr_number;1004args[3] = params;10051006return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, args, 4,1007AC_FUNC_ATTR_READNONE);1008}10091010LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,1011LLVMValueRef index)1012{1013return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");1014}10151016LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)1017{1018LLVMValueRef indices[2] = {1019ctx->i32_0,1020index,1021};1022return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");1023}10241025LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index)1026{1027return LLVMBuildPointerCast(ctx->builder, LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""),1028LLVMTypeOf(ptr), "");1029}10301031void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index,1032LLVMValueRef value)1033{1034LLVMBuildStore(ctx->builder, value, ac_build_gep0(ctx, base_ptr, index));1035}10361037/**1038* Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.1039* It's equivalent to doing a load from &base_ptr[index].1040*1041* \param base_ptr Where the array starts.1042* \param index The element index into the array.1043* \param uniform Whether the base_ptr and index can be assumed to be1044* dynamically uniform (i.e. load to an SGPR)1045* \param invariant Whether the load is invariant (no other opcodes affect it)1046* \param no_unsigned_wraparound1047* For all possible re-associations and re-distributions of an expression1048* "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs1049* without inbounds in base_ptr), this parameter is true if "addr + offset"1050* does not result in an unsigned integer wraparound. This is used for1051* optimal code generation of 32-bit pointer arithmetic.1052*1053* For example, a 32-bit immediate offset that causes a 32-bit unsigned1054* integer wraparound can't be an imm offset in s_load_dword, because1055* the instruction performs "addr + offset" in 64 bits.1056*1057* Expected usage for bindless textures by chaining GEPs:1058* // possible unsigned wraparound, don't use InBounds:1059* ptr1 = LLVMBuildGEP(base_ptr, index);1060* image = load(ptr1); // becomes "s_load ptr1, 0"1061*1062* ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);1063* sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds1064*/1065static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,1066LLVMValueRef index, bool uniform, bool invariant,1067bool no_unsigned_wraparound)1068{1069LLVMValueRef pointer, result;10701071if (no_unsigned_wraparound &&1072LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)1073pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");1074else1075pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");10761077if (uniform)1078LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);1079result = LLVMBuildLoad(ctx->builder, pointer, "");1080if (invariant)1081LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);1082LLVMSetAlignment(result, 4);1083return result;1084}10851086LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)1087{1088return ac_build_load_custom(ctx, base_ptr, index, false, false, false);1089}10901091LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,1092LLVMValueRef index)1093{1094return ac_build_load_custom(ctx, base_ptr, index, false, true, false);1095}10961097/* This assumes that there is no unsigned integer wraparound during the address1098* computation, excluding all GEPs within base_ptr. */1099LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,1100LLVMValueRef index)1101{1102return ac_build_load_custom(ctx, base_ptr, index, true, true, true);1103}11041105/* See ac_build_load_custom() documentation. */1106LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,1107LLVMValueRef base_ptr, LLVMValueRef index)1108{1109return ac_build_load_custom(ctx, base_ptr, index, true, true, false);1110}11111112static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)1113{1114return cache_policy | (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);1115}11161117static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,1118LLVMValueRef data, LLVMValueRef vindex,1119LLVMValueRef voffset, LLVMValueRef soffset,1120unsigned cache_policy, bool use_format, bool structurized)1121{1122LLVMValueRef args[6];1123int idx = 0;1124args[idx++] = data;1125args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");1126if (structurized)1127args[idx++] = vindex ? vindex : ctx->i32_0;1128args[idx++] = voffset ? voffset : ctx->i32_0;1129args[idx++] = soffset ? soffset : ctx->i32_0;1130args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);1131const char *indexing_kind = structurized ? "struct" : "raw";1132char name[256], type_name[8];11331134ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));11351136if (use_format) {1137snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,1138type_name);1139} else {1140snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);1141}11421143ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);1144}11451146void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,1147LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)1148{1149ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, cache_policy, true, true);1150}11511152/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.1153* The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),1154* or v4i32 (num_channels=3,4).1155*/1156void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,1157unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset,1158unsigned inst_offset, unsigned cache_policy)1159{1160/* Split 3 channel stores. */1161if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {1162LLVMValueRef v[3], v01;11631164for (int i = 0; i < 3; i++) {1165v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");1166}1167v01 = ac_build_gather_values(ctx, v, 2);11681169ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, soffset, inst_offset, cache_policy);1170ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, soffset, inst_offset + 8,1171cache_policy);1172return;1173}11741175/* SWIZZLE_ENABLE requires that soffset isn't folded into voffset1176* (voffset is swizzled, but soffset isn't swizzled).1177* llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.1178*/1179if (!(cache_policy & ac_swizzled)) {1180LLVMValueRef offset = soffset;11811182if (inst_offset)1183offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, inst_offset, 0), "");11841185ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), ctx->i32_0, voffset, offset,1186cache_policy, false, false);1187return;1188}11891190static const unsigned dfmts[] = {V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,1191V_008F0C_BUF_DATA_FORMAT_32_32_32,1192V_008F0C_BUF_DATA_FORMAT_32_32_32_32};1193unsigned dfmt = dfmts[num_channels - 1];1194unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;1195LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);11961197ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, immoffset, num_channels, dfmt,1198nfmt, cache_policy);1199}12001201static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,1202LLVMValueRef vindex, LLVMValueRef voffset,1203LLVMValueRef soffset, unsigned num_channels,1204LLVMTypeRef channel_type, unsigned cache_policy,1205bool can_speculate, bool use_format,1206bool structurized)1207{1208LLVMValueRef args[5];1209int idx = 0;1210args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");1211if (structurized)1212args[idx++] = vindex ? vindex : ctx->i32_0;1213args[idx++] = voffset ? voffset : ctx->i32_0;1214args[idx++] = soffset ? soffset : ctx->i32_0;1215args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);1216unsigned func =1217!ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;1218const char *indexing_kind = structurized ? "struct" : "raw";1219char name[256], type_name[8];12201221/* D16 is only supported on gfx8+ */1222assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||1223ctx->chip_class >= GFX8);12241225LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;1226ac_build_type_name_for_intr(type, type_name, sizeof(type_name));12271228if (use_format) {1229snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,1230type_name);1231} else {1232snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);1233}12341235return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));1236}12371238LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,1239LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,1240unsigned inst_offset, LLVMTypeRef channel_type,1241unsigned cache_policy, bool can_speculate, bool allow_smem)1242{1243LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);1244if (voffset)1245offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");1246if (soffset)1247offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");12481249if (allow_smem && !(cache_policy & ac_slc) &&1250(!(cache_policy & ac_glc) || ctx->chip_class >= GFX8)) {1251assert(vindex == NULL);12521253LLVMValueRef result[8];12541255for (int i = 0; i < num_channels; i++) {1256if (i) {1257offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, 4, 0), "");1258}1259LLVMValueRef args[3] = {1260rsrc,1261offset,1262LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),1263};1264result[i] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3,1265AC_FUNC_ATTR_READNONE);1266}1267if (num_channels == 1)1268return result[0];12691270if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))1271result[num_channels++] = LLVMGetUndef(ctx->f32);1272return ac_build_gather_values(ctx, result, num_channels);1273}12741275return ac_build_buffer_load_common(ctx, rsrc, vindex, offset, ctx->i32_0, num_channels,1276channel_type, cache_policy, can_speculate, false, false);1277}12781279LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,1280LLVMValueRef vindex, LLVMValueRef voffset,1281unsigned num_channels, unsigned cache_policy,1282bool can_speculate, bool d16, bool tfe)1283{1284if (tfe) {1285assert(!d16);12861287char code[256];1288/* The definition in the assembly and the one in the constraint string1289* differs because of an assembler bug.1290*/1291snprintf(code, sizeof(code),1292"v_mov_b32 v0, 0\n"1293"v_mov_b32 v1, 0\n"1294"v_mov_b32 v2, 0\n"1295"v_mov_b32 v3, 0\n"1296"v_mov_b32 v4, 0\n"1297"buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"1298"s_waitcnt vmcnt(0)",1299cache_policy & ac_glc ? "glc" : "",1300cache_policy & ac_slc ? "slc" : "",1301cache_policy & ac_dlc ? "dlc" : "");13021303LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};1304LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);1305LLVMValueRef inlineasm = LLVMConstInlineAsm(calltype, code, "=&{v[0:4]},v,s", false, false);13061307LLVMValueRef addr_comp[2] = {vindex ? vindex : ctx->i32_0,1308voffset ? voffset : ctx->i32_0};13091310LLVMValueRef args[] = {ac_build_gather_values(ctx, addr_comp, 2),1311LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "")};1312LLVMValueRef res = LLVMBuildCall(ctx->builder, inlineasm, args, 2, "");13131314return ac_build_concat(ctx, ac_trim_vector(ctx, res, num_channels),1315ac_llvm_extract_elem(ctx, res, 4));1316}13171318return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,1319d16 ? ctx->f16 : ctx->f32, cache_policy, can_speculate, true,1320true);1321}13221323static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,1324LLVMValueRef vindex, LLVMValueRef voffset,1325LLVMValueRef soffset, LLVMValueRef immoffset,1326unsigned num_channels, unsigned dfmt, unsigned nfmt,1327unsigned cache_policy, bool can_speculate,1328bool structurized)1329{1330voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");13311332LLVMValueRef args[6];1333int idx = 0;1334args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");1335if (structurized)1336args[idx++] = vindex ? vindex : ctx->i32_0;1337args[idx++] = voffset ? voffset : ctx->i32_0;1338args[idx++] = soffset ? soffset : ctx->i32_0;1339args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);1340args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);1341unsigned func =1342!ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;1343const char *indexing_kind = structurized ? "struct" : "raw";1344char name[256], type_name[8];13451346LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;1347ac_build_type_name_for_intr(type, type_name, sizeof(type_name));13481349snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);13501351return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));1352}13531354LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,1355LLVMValueRef vindex, LLVMValueRef voffset,1356LLVMValueRef soffset, LLVMValueRef immoffset,1357unsigned num_channels, unsigned dfmt, unsigned nfmt,1358unsigned cache_policy, bool can_speculate)1359{1360return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, immoffset, num_channels, dfmt,1361nfmt, cache_policy, can_speculate, true);1362}13631364LLVMValueRef ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,1365LLVMValueRef voffset, LLVMValueRef soffset,1366LLVMValueRef immoffset, unsigned cache_policy)1367{1368voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");13691370return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i16,1371cache_policy, false, false, false);1372}13731374LLVMValueRef ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,1375LLVMValueRef voffset, LLVMValueRef soffset,1376LLVMValueRef immoffset, unsigned cache_policy)1377{1378voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");13791380return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, cache_policy,1381false, false, false);1382}13831384/**1385* Convert an 11- or 10-bit unsigned floating point number to an f32.1386*1387* The input exponent is expected to be biased analogous to IEEE-754, i.e. by1388* 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).1389*/1390static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src,1391unsigned exp_bits, unsigned mant_bits)1392{1393assert(LLVMTypeOf(src) == ctx->i32);13941395LLVMValueRef tmp;1396LLVMValueRef mantissa;1397mantissa =1398LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");13991400/* Converting normal numbers is just a shift + correcting the exponent bias */1401unsigned normal_shift = 23 - mant_bits;1402unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);1403LLVMValueRef shifted, normal;14041405shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");1406normal =1407LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");14081409/* Converting nan/inf numbers is the same, but with a different exponent update */1410LLVMValueRef naninf;1411naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");14121413/* Converting denormals is the complex case: determine the leading zeros of the1414* mantissa to obtain the correct shift for the mantissa and exponent correction.1415*/1416LLVMValueRef denormal;1417LLVMValueRef params[2] = {1418mantissa, ctx->i1true, /* result can be undef when arg is 0 */1419};1420LLVMValueRef ctlz =1421ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);14221423/* Shift such that the leading 1 ends up as the LSB of the exponent field. */1424tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");1425denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");14261427unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;1428tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");1429tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");1430denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");14311432/* Select the final result. */1433LLVMValueRef result;14341435tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,1436LLVMConstInt(ctx->i32, ((1ULL << exp_bits) - 1) << mant_bits, false), "");1437result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");14381439tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,1440LLVMConstInt(ctx->i32, 1ULL << mant_bits, false), "");1441result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");14421443tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");1444result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");14451446return ac_to_float(ctx, result);1447}14481449/**1450* Generate a fully general open coded buffer format fetch with all required1451* fixups suitable for vertex fetch, using non-format buffer loads.1452*1453* Some combinations of argument values have special interpretations:1454* - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT1455* - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format1456*1457* \param log_size log(size of channel in bytes)1458* \param num_channels number of channels (1 to 4)1459* \param format AC_FETCH_FORMAT_xxx value1460* \param reverse whether XYZ channels are reversed1461* \param known_aligned whether the source is known to be aligned to hardware's1462* effective element size for loading the given format1463* (note: this means dword alignment for 8_8_8_8, 16_16, etc.)1464* \param rsrc buffer resource descriptor1465* \return the resulting vector of floats or integers bitcast to <4 x i32>1466*/1467LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size,1468unsigned num_channels, unsigned format, bool reverse,1469bool known_aligned, LLVMValueRef rsrc,1470LLVMValueRef vindex, LLVMValueRef voffset,1471LLVMValueRef soffset, unsigned cache_policy,1472bool can_speculate)1473{1474LLVMValueRef tmp;1475unsigned load_log_size = log_size;1476unsigned load_num_channels = num_channels;1477if (log_size == 3) {1478load_log_size = 2;1479if (format == AC_FETCH_FORMAT_FLOAT) {1480load_num_channels = 2 * num_channels;1481} else {1482load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */1483}1484}14851486int log_recombine = 0;1487if ((ctx->chip_class == GFX6 || ctx->chip_class >= GFX10) && !known_aligned) {1488/* Avoid alignment restrictions by loading one byte at a time. */1489load_num_channels <<= load_log_size;1490log_recombine = load_log_size;1491load_log_size = 0;1492} else if (load_num_channels == 2 || load_num_channels == 4) {1493log_recombine = -util_logbase2(load_num_channels);1494load_num_channels = 1;1495load_log_size += -log_recombine;1496}14971498LLVMValueRef loads[32]; /* up to 32 bytes */1499for (unsigned i = 0; i < load_num_channels; ++i) {1500tmp =1501LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), "");1502LLVMTypeRef channel_type =1503load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32;1504unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);1505loads[i] =1506ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type,1507cache_policy, can_speculate, false, true);1508if (load_log_size >= 2)1509loads[i] = ac_to_integer(ctx, loads[i]);1510}15111512if (log_recombine > 0) {1513/* Recombine bytes if necessary (GFX6 only) */1514LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;15151516for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {1517LLVMValueRef accum = NULL;1518for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {1519tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");1520if (i == 0) {1521accum = tmp;1522} else {1523tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), "");1524accum = LLVMBuildOr(ctx->builder, accum, tmp, "");1525}1526}1527loads[dst] = accum;1528}1529} else if (log_recombine < 0) {1530/* Split vectors of dwords */1531if (load_log_size > 2) {1532assert(load_num_channels == 1);1533LLVMValueRef loaded = loads[0];1534unsigned log_split = load_log_size - 2;1535log_recombine += log_split;1536load_num_channels = 1 << log_split;1537load_log_size = 2;1538for (unsigned i = 0; i < load_num_channels; ++i) {1539tmp = LLVMConstInt(ctx->i32, i, false);1540loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");1541}1542}15431544/* Further split dwords and shorts if required */1545if (log_recombine < 0) {1546for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0;1547--src) {1548unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);1549LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);1550LLVMValueRef loaded = loads[src - 1];1551LLVMTypeRef loaded_type = LLVMTypeOf(loaded);1552for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {1553tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);1554tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");1555loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");1556}1557}1558}1559}15601561if (log_size == 3) {1562if (format == AC_FETCH_FORMAT_FLOAT) {1563for (unsigned i = 0; i < num_channels; ++i) {1564tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);1565loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");1566}1567} else if (format == AC_FETCH_FORMAT_FIXED) {1568/* 10_11_11_FLOAT */1569LLVMValueRef data = loads[0];1570LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);1571LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");1572tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");1573LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");1574LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");15751576loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));1577loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));1578loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));15791580num_channels = 3;1581log_size = 2;1582format = AC_FETCH_FORMAT_FLOAT;1583} else {1584/* 2_10_10_10 data formats */1585LLVMValueRef data = loads[0];1586LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);1587LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);1588loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");1589tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");1590loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");1591tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");1592loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");1593tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");1594loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");15951596num_channels = 4;1597}1598}15991600if (format == AC_FETCH_FORMAT_FLOAT) {1601if (log_size != 2) {1602for (unsigned chan = 0; chan < num_channels; ++chan) {1603tmp = ac_to_float(ctx, loads[chan]);1604if (log_size == 3)1605tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");1606else if (log_size == 1)1607tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");1608loads[chan] = ac_to_integer(ctx, tmp);1609}1610}1611} else if (format == AC_FETCH_FORMAT_UINT) {1612if (log_size != 2) {1613for (unsigned chan = 0; chan < num_channels; ++chan)1614loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");1615}1616} else if (format == AC_FETCH_FORMAT_SINT) {1617if (log_size != 2) {1618for (unsigned chan = 0; chan < num_channels; ++chan)1619loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");1620}1621} else {1622bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED ||1623format == AC_FETCH_FORMAT_UINT;16241625for (unsigned chan = 0; chan < num_channels; ++chan) {1626if (unsign) {1627tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");1628} else {1629tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");1630}16311632LLVMValueRef scale = NULL;1633if (format == AC_FETCH_FORMAT_FIXED) {1634assert(log_size == 2);1635scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);1636} else if (format == AC_FETCH_FORMAT_UNORM) {1637unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));1638scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));1639} else if (format == AC_FETCH_FORMAT_SNORM) {1640unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));1641scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));1642}1643if (scale)1644tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");16451646if (format == AC_FETCH_FORMAT_SNORM) {1647/* Clamp to [-1, 1] */1648LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);1649LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");1650tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");1651}16521653loads[chan] = ac_to_integer(ctx, tmp);1654}1655}16561657while (num_channels < 4) {1658if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {1659loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;1660} else {1661loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);1662}1663num_channels++;1664}16651666if (reverse) {1667tmp = loads[0];1668loads[0] = loads[2];1669loads[2] = tmp;1670}16711672return ac_build_gather_values(ctx, loads, 4);1673}16741675static void ac_build_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,1676LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,1677LLVMValueRef soffset, LLVMValueRef immoffset,1678unsigned num_channels, unsigned dfmt, unsigned nfmt,1679unsigned cache_policy, bool structurized)1680{1681voffset = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, immoffset, "");16821683LLVMValueRef args[7];1684int idx = 0;1685args[idx++] = vdata;1686args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");1687if (structurized)1688args[idx++] = vindex ? vindex : ctx->i32_0;1689args[idx++] = voffset ? voffset : ctx->i32_0;1690args[idx++] = soffset ? soffset : ctx->i32_0;1691args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);1692args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);1693unsigned func =1694!ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;1695const char *indexing_kind = structurized ? "struct" : "raw";1696char name[256], type_name[8];16971698LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;1699ac_build_type_name_for_intr(type, type_name, sizeof(type_name));17001701snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", indexing_kind, type_name);17021703ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);1704}17051706void ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,1707LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,1708LLVMValueRef soffset, LLVMValueRef immoffset,1709unsigned num_channels, unsigned dfmt, unsigned nfmt,1710unsigned cache_policy)1711{1712ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, immoffset, num_channels, dfmt,1713nfmt, cache_policy, true);1714}17151716void ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,1717LLVMValueRef voffset, LLVMValueRef soffset, LLVMValueRef immoffset,1718unsigned num_channels, unsigned dfmt, unsigned nfmt,1719unsigned cache_policy)1720{1721ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, immoffset, num_channels, dfmt,1722nfmt, cache_policy, false);1723}17241725void ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,1726LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,1727unsigned cache_policy)1728{1729vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");17301731ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,1732false);1733}17341735void ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,1736LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)1737{1738vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");17391740ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,1741false);1742}17431744/**1745* Set range metadata on an instruction. This can only be used on load and1746* call instructions. If you know an instruction can only produce the values1747* 0, 1, 2, you would do set_range_metadata(value, 0, 3);1748* \p lo is the minimum value inclusive.1749* \p hi is the maximum value exclusive.1750*/1751void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,1752unsigned hi)1753{1754LLVMValueRef range_md, md_args[2];1755LLVMTypeRef type = LLVMTypeOf(value);1756LLVMContextRef context = LLVMGetTypeContext(type);17571758md_args[0] = LLVMConstInt(type, lo, false);1759md_args[1] = LLVMConstInt(type, hi, false);1760range_md = LLVMMDNodeInContext(context, md_args, 2);1761LLVMSetMetadata(value, ctx->range_md_kind, range_md);1762}17631764LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)1765{1766return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));1767}17681769/*1770* AMD GCN implements derivatives using the local data store (LDS)1771* All writes to the LDS happen in all executing threads at1772* the same time. TID is the Thread ID for the current1773* thread and is a value between 0 and 63, representing1774* the thread's position in the wavefront.1775*1776* For the pixel shader threads are grouped into quads of four pixels.1777* The TIDs of the pixels of a quad are:1778*1779* +------+------+1780* |4n + 0|4n + 1|1781* +------+------+1782* |4n + 2|4n + 3|1783* +------+------+1784*1785* So, masking the TID with 0xfffffffc yields the TID of the top left pixel1786* of the quad, masking with 0xfffffffd yields the TID of the top pixel of1787* the current pixel's column, and masking with 0xfffffffe yields the TID1788* of the left pixel of the current pixel's row.1789*1790* Adding 1 yields the TID of the pixel to the right of the left pixel, and1791* adding 2 yields the TID of the pixel below the top pixel.1792*/1793LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)1794{1795unsigned tl_lanes[4], trbl_lanes[4];1796char name[32], type[8];1797LLVMValueRef tl, trbl;1798LLVMTypeRef result_type;1799LLVMValueRef result;18001801result_type = ac_to_float_type(ctx, LLVMTypeOf(val));18021803if (result_type == ctx->f16)1804val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");1805else if (result_type == ctx->v2f16)1806val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");18071808for (unsigned i = 0; i < 4; ++i) {1809tl_lanes[i] = i & mask;1810trbl_lanes[i] = (i & mask) + idx;1811}18121813tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);1814trbl =1815ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);18161817if (result_type == ctx->f16) {1818tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");1819trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");1820}18211822tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");1823trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");1824result = LLVMBuildFSub(ctx->builder, trbl, tl, "");18251826ac_build_type_name_for_intr(result_type, type, sizeof(type));1827snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);18281829return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);1830}18311832void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id)1833{1834LLVMValueRef args[2];1835args[0] = LLVMConstInt(ctx->i32, msg, false);1836args[1] = wave_id;1837ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);1838}18391840LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)1841{1842LLVMValueRef msb =1843ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE);18441845/* The HW returns the last bit index from MSB, but NIR/TGSI wants1846* the index from LSB. Invert it by doing "31 - msb". */1847msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");18481849LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);1850LLVMValueRef cond =1851LLVMBuildOr(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, ""),1852LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, ""), "");18531854return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");1855}18561857LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)1858{1859const char *intrin_name;1860LLVMTypeRef type;1861LLVMValueRef highest_bit;1862LLVMValueRef zero;1863unsigned bitsize;18641865bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));1866switch (bitsize) {1867case 64:1868intrin_name = "llvm.ctlz.i64";1869type = ctx->i64;1870highest_bit = LLVMConstInt(ctx->i64, 63, false);1871zero = ctx->i64_0;1872break;1873case 32:1874intrin_name = "llvm.ctlz.i32";1875type = ctx->i32;1876highest_bit = LLVMConstInt(ctx->i32, 31, false);1877zero = ctx->i32_0;1878break;1879case 16:1880intrin_name = "llvm.ctlz.i16";1881type = ctx->i16;1882highest_bit = LLVMConstInt(ctx->i16, 15, false);1883zero = ctx->i16_0;1884break;1885case 8:1886intrin_name = "llvm.ctlz.i8";1887type = ctx->i8;1888highest_bit = LLVMConstInt(ctx->i8, 7, false);1889zero = ctx->i8_0;1890break;1891default:1892unreachable(!"invalid bitsize");1893break;1894}18951896LLVMValueRef params[2] = {1897arg,1898ctx->i1true,1899};19001901LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);19021903/* The HW returns the last bit index from MSB, but TGSI/NIR wants1904* the index from LSB. Invert it by doing "31 - msb". */1905msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");19061907if (bitsize == 64) {1908msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");1909} else if (bitsize < 32) {1910msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");1911}19121913/* check for zero */1914return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),1915LLVMConstInt(ctx->i32, -1, true), msb, "");1916}19171918LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)1919{1920char name[64], type[64];19211922ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));1923snprintf(name, sizeof(name), "llvm.minnum.%s", type);1924LLVMValueRef args[2] = {a, b};1925return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);1926}19271928LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)1929{1930char name[64], type[64];19311932ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));1933snprintf(name, sizeof(name), "llvm.maxnum.%s", type);1934LLVMValueRef args[2] = {a, b};1935return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);1936}19371938LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)1939{1940LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");1941return LLVMBuildSelect(ctx->builder, cmp, a, b, "");1942}19431944LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)1945{1946LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");1947return LLVMBuildSelect(ctx->builder, cmp, a, b, "");1948}19491950LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)1951{1952LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");1953return LLVMBuildSelect(ctx->builder, cmp, a, b, "");1954}19551956LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)1957{1958LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");1959return LLVMBuildSelect(ctx->builder, cmp, a, b, "");1960}19611962LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)1963{1964LLVMTypeRef t = LLVMTypeOf(value);1965return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),1966LLVMConstReal(t, 1.0));1967}19681969void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)1970{1971LLVMValueRef args[9];19721973args[0] = LLVMConstInt(ctx->i32, a->target, 0);1974args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);19751976if (a->compr) {1977args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");1978args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");1979args[4] = LLVMConstInt(ctx->i1, a->done, 0);1980args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);19811982ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);1983} else {1984args[2] = a->out[0];1985args[3] = a->out[1];1986args[4] = a->out[2];1987args[5] = a->out[3];1988args[6] = LLVMConstInt(ctx->i1, a->done, 0);1989args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);19901991ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);1992}1993}19941995void ac_build_export_null(struct ac_llvm_context *ctx)1996{1997struct ac_export_args args;19981999args.enabled_channels = 0x0; /* enabled channels */2000args.valid_mask = 1; /* whether the EXEC mask is valid */2001args.done = 1; /* DONE bit */2002args.target = V_008DFC_SQ_EXP_NULL;2003args.compr = 0; /* COMPR flag (0 = 32-bit export) */2004args.out[0] = LLVMGetUndef(ctx->f32); /* R */2005args.out[1] = LLVMGetUndef(ctx->f32); /* G */2006args.out[2] = LLVMGetUndef(ctx->f32); /* B */2007args.out[3] = LLVMGetUndef(ctx->f32); /* A */20082009ac_build_export(ctx, &args);2010}20112012static unsigned ac_num_coords(enum ac_image_dim dim)2013{2014switch (dim) {2015case ac_image_1d:2016return 1;2017case ac_image_2d:2018case ac_image_1darray:2019return 2;2020case ac_image_3d:2021case ac_image_cube:2022case ac_image_2darray:2023case ac_image_2dmsaa:2024return 3;2025case ac_image_2darraymsaa:2026return 4;2027default:2028unreachable("ac_num_coords: bad dim");2029}2030}20312032static unsigned ac_num_derivs(enum ac_image_dim dim)2033{2034switch (dim) {2035case ac_image_1d:2036case ac_image_1darray:2037return 2;2038case ac_image_2d:2039case ac_image_2darray:2040case ac_image_cube:2041return 4;2042case ac_image_3d:2043return 6;2044case ac_image_2dmsaa:2045case ac_image_2darraymsaa:2046default:2047unreachable("derivatives not supported");2048}2049}20502051static const char *get_atomic_name(enum ac_atomic_op op)2052{2053switch (op) {2054case ac_atomic_swap:2055return "swap";2056case ac_atomic_add:2057return "add";2058case ac_atomic_sub:2059return "sub";2060case ac_atomic_smin:2061return "smin";2062case ac_atomic_umin:2063return "umin";2064case ac_atomic_smax:2065return "smax";2066case ac_atomic_umax:2067return "umax";2068case ac_atomic_and:2069return "and";2070case ac_atomic_or:2071return "or";2072case ac_atomic_xor:2073return "xor";2074case ac_atomic_inc_wrap:2075return "inc";2076case ac_atomic_dec_wrap:2077return "dec";2078}2079unreachable("bad atomic op");2080}20812082LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)2083{2084const char *overload[3] = {"", "", ""};2085unsigned num_overloads = 0;2086LLVMValueRef args[18];2087unsigned num_args = 0;2088enum ac_image_dim dim = a->dim;20892090assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);2091assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&2092a->opcode != ac_image_store_mip) ||2093a->lod);2094assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||2095(!a->compare && !a->offset));2096assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||2097a->opcode == ac_image_get_lod) ||2098!a->bias);2099assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=21001);2101assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);2102assert(!a->d16 || (ctx->chip_class >= GFX8 && a->opcode != ac_image_atomic &&2103a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&2104a->opcode != ac_image_get_resinfo));2105assert(!a->a16 || ctx->chip_class >= GFX9);2106assert(a->g16 == a->a16 || ctx->chip_class >= GFX10);21072108assert(!a->offset ||2109ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);2110assert(!a->bias ||2111ac_get_elem_bits(ctx, LLVMTypeOf(a->bias)) == 32);2112assert(!a->compare ||2113ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);2114assert(!a->derivs[0] ||2115((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&2116(a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));2117assert(!a->coords[0] ||2118((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&2119(a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));2120assert(!a->lod ||2121((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&2122(a->opcode == ac_image_get_resinfo ||2123ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==2124ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));2125assert(!a->min_lod ||2126ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==2127ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));21282129if (a->opcode == ac_image_get_lod) {2130switch (dim) {2131case ac_image_1darray:2132dim = ac_image_1d;2133break;2134case ac_image_2darray:2135case ac_image_cube:2136dim = ac_image_2d;2137break;2138default:2139break;2140}2141}21422143bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||2144a->opcode == ac_image_get_lod;2145bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;2146bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||2147a->opcode == ac_image_load || a->opcode == ac_image_load_mip;2148LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);2149uint8_t dmask = a->dmask;2150LLVMTypeRef data_type;2151char data_type_str[32];21522153if (atomic) {2154data_type = LLVMTypeOf(a->data[0]);2155} else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {2156/* Image stores might have been shrinked using the format. */2157data_type = LLVMTypeOf(a->data[0]);2158dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;2159} else {2160data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;2161}21622163if (a->tfe) {2164data_type = LLVMStructTypeInContext(2165ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);2166}21672168if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {2169args[num_args++] = a->data[0];2170if (a->opcode == ac_image_atomic_cmpswap)2171args[num_args++] = a->data[1];2172}21732174if (!atomic)2175args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);21762177if (a->offset)2178args[num_args++] = ac_to_integer(ctx, a->offset);2179if (a->bias) {2180args[num_args++] = ac_to_float(ctx, a->bias);2181overload[num_overloads++] = ".f32";2182}2183if (a->compare)2184args[num_args++] = ac_to_float(ctx, a->compare);2185if (a->derivs[0]) {2186unsigned count = ac_num_derivs(dim);2187for (unsigned i = 0; i < count; ++i)2188args[num_args++] = ac_to_float(ctx, a->derivs[i]);2189overload[num_overloads++] = a->g16 ? ".f16" : ".f32";2190}2191unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;2192for (unsigned i = 0; i < num_coords; ++i)2193args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");2194if (a->lod)2195args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");2196if (a->min_lod)2197args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");21982199overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32");22002201args[num_args++] = a->resource;2202if (sample) {2203args[num_args++] = a->sampler;2204args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);2205}22062207args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */2208args[num_args++] = LLVMConstInt(2209ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);22102211const char *name;2212const char *atomic_subop = "";2213switch (a->opcode) {2214case ac_image_sample:2215name = "sample";2216break;2217case ac_image_gather4:2218name = "gather4";2219break;2220case ac_image_load:2221name = "load";2222break;2223case ac_image_load_mip:2224name = "load.mip";2225break;2226case ac_image_store:2227name = "store";2228break;2229case ac_image_store_mip:2230name = "store.mip";2231break;2232case ac_image_atomic:2233name = "atomic.";2234atomic_subop = get_atomic_name(a->atomic);2235break;2236case ac_image_atomic_cmpswap:2237name = "atomic.";2238atomic_subop = "cmpswap";2239break;2240case ac_image_get_lod:2241name = "getlod";2242break;2243case ac_image_get_resinfo:2244name = "getresinfo";2245break;2246default:2247unreachable("invalid image opcode");2248}22492250const char *dimname;2251switch (dim) {2252case ac_image_1d:2253dimname = "1d";2254break;2255case ac_image_2d:2256dimname = "2d";2257break;2258case ac_image_3d:2259dimname = "3d";2260break;2261case ac_image_cube:2262dimname = "cube";2263break;2264case ac_image_1darray:2265dimname = "1darray";2266break;2267case ac_image_2darray:2268dimname = "2darray";2269break;2270case ac_image_2dmsaa:2271dimname = "2dmsaa";2272break;2273case ac_image_2darraymsaa:2274dimname = "2darraymsaa";2275break;2276default:2277unreachable("invalid dim");2278}22792280ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));22812282bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);2283char intr_name[96];2284snprintf(intr_name, sizeof(intr_name),2285"llvm.amdgcn.image.%s%s" /* base name */2286"%s%s%s%s" /* sample/gather modifiers */2287".%s.%s%s%s%s", /* dimension and type overloads */2288name, atomic_subop, a->compare ? ".c" : "",2289a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",2290a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,2291data_type_str, overload[0], overload[1], overload[2]);22922293LLVMTypeRef retty;2294if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)2295retty = ctx->voidt;2296else2297retty = data_type;22982299LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);2300if (a->tfe) {2301LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");2302LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");2303result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));2304}23052306if (!sample && !atomic && retty != ctx->voidt)2307result = ac_to_integer(ctx, result);23082309return result;2310}23112312LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)2313{2314LLVMValueRef samples;23152316/* Read the samples from the descriptor directly.2317* Hardware doesn't have any instruction for this.2318*/2319samples = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");2320samples = LLVMBuildLShr(ctx->builder, samples, LLVMConstInt(ctx->i32, 16, 0), "");2321samples = LLVMBuildAnd(ctx->builder, samples, LLVMConstInt(ctx->i32, 0xf, 0), "");2322samples = LLVMBuildShl(ctx->builder, ctx->i32_1, samples, "");2323return samples;2324}23252326LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])2327{2328return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2,2329AC_FUNC_ATTR_READNONE);2330}23312332LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])2333{2334LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2,2335AC_FUNC_ATTR_READNONE);2336return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");2337}23382339LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])2340{2341LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2,2342AC_FUNC_ATTR_READNONE);2343return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");2344}23452346LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,2347LLVMValueRef args[2])2348{2349LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};2350LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);2351LLVMValueRef code = LLVMConstInlineAsm(calltype,2352"v_cvt_pknorm_i16_f16 $0, $1, $2", "=v,v,v",2353false, false);2354return LLVMBuildCall(ctx->builder, code, args, 2, "");2355}23562357LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,2358LLVMValueRef args[2])2359{2360LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};2361LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);2362LLVMValueRef code = LLVMConstInlineAsm(calltype,2363"v_cvt_pknorm_u16_f16 $0, $1, $2", "=v,v,v",2364false, false);2365return LLVMBuildCall(ctx->builder, code, args, 2, "");2366}23672368/* The 8-bit and 10-bit clamping is for HW workarounds. */2369LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,2370bool hi)2371{2372assert(bits == 8 || bits == 10 || bits == 16);23732374LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);2375LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);2376LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;2377LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);23782379/* Clamp. */2380if (bits != 16) {2381for (int i = 0; i < 2; i++) {2382bool alpha = hi && i == 1;2383args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);2384args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);2385}2386}23872388LLVMValueRef res =2389ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);2390return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");2391}23922393/* The 8-bit and 10-bit clamping is for HW workarounds. */2394LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,2395bool hi)2396{2397assert(bits == 8 || bits == 10 || bits == 16);23982399LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);2400LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);24012402/* Clamp. */2403if (bits != 16) {2404for (int i = 0; i < 2; i++) {2405bool alpha = hi && i == 1;2406args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);2407}2408}24092410LLVMValueRef res =2411ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);2412return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");2413}24142415LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)2416{2417return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, AC_FUNC_ATTR_READNONE);2418}24192420void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)2421{2422ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &i1, 1, 0);2423}24242425LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,2426LLVMValueRef width, bool is_signed)2427{2428LLVMValueRef args[] = {2429input,2430offset,2431width,2432};24332434return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32",2435ctx->i32, args, 3, AC_FUNC_ATTR_READNONE);2436}24372438LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,2439LLVMValueRef s2)2440{2441return LLVMBuildAdd(ctx->builder, LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");2442}24432444LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,2445LLVMValueRef s2)2446{2447/* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */2448if (ctx->chip_class >= GFX10) {2449return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, (LLVMValueRef[]){s0, s1, s2}, 3,2450AC_FUNC_ATTR_READNONE);2451}24522453return LLVMBuildFAdd(ctx->builder, LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");2454}24552456void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)2457{2458if (!wait_flags)2459return;24602461unsigned lgkmcnt = 63;2462unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;2463unsigned vscnt = 63;24642465if (wait_flags & AC_WAIT_LGKM)2466lgkmcnt = 0;2467if (wait_flags & AC_WAIT_VLOAD)2468vmcnt = 0;24692470if (wait_flags & AC_WAIT_VSTORE) {2471if (ctx->chip_class >= GFX10)2472vscnt = 0;2473else2474vmcnt = 0;2475}24762477/* There is no intrinsic for vscnt(0), so use a fence. */2478if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||2479vscnt == 0) {2480LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");2481return;2482}24832484unsigned simm16 = (lgkmcnt << 8) | (7 << 4) | /* expcnt */2485(vmcnt & 0xf) | ((vmcnt >> 4) << 14);24862487LLVMValueRef args[1] = {2488LLVMConstInt(ctx->i32, simm16, false),2489};2490ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);2491}24922493LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,2494LLVMTypeRef type)2495{2496unsigned bitsize = ac_get_elem_bits(ctx, type);2497LLVMValueRef zero = LLVMConstReal(type, 0.0);2498LLVMValueRef one = LLVMConstReal(type, 1.0);2499LLVMValueRef result;25002501if (bitsize == 64 || (bitsize == 16 && ctx->chip_class <= GFX8)) {2502/* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM2503* doesn't expose an intrinsic.2504*/2505result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);2506} else {2507LLVMTypeRef type;2508char *intr;25092510if (bitsize == 16) {2511intr = "llvm.amdgcn.fmed3.f16";2512type = ctx->f16;2513} else {2514assert(bitsize == 32);2515intr = "llvm.amdgcn.fmed3.f32";2516type = ctx->f32;2517}25182519LLVMValueRef params[] = {2520zero,2521one,2522src,2523};25242525result = ac_build_intrinsic(ctx, intr, type, params, 3,2526AC_FUNC_ATTR_READNONE);2527}25282529if (ctx->chip_class < GFX9 && bitsize == 32) {2530/* Only pre-GFX9 chips do not flush denorms. */2531result = ac_build_canonicalize(ctx, result, bitsize);2532}25332534return result;2535}25362537LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)2538{2539LLVMTypeRef type;2540char *intr;25412542if (bitsize == 16) {2543intr = "llvm.amdgcn.fract.f16";2544type = ctx->f16;2545} else if (bitsize == 32) {2546intr = "llvm.amdgcn.fract.f32";2547type = ctx->f32;2548} else {2549intr = "llvm.amdgcn.fract.f64";2550type = ctx->f64;2551}25522553LLVMValueRef params[] = {2554src0,2555};2556return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);2557}25582559LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)2560{25612562if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {2563LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);2564unsigned vec_size = LLVMGetVectorSize(type);2565LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));25662567for (unsigned i = 0; i < vec_size; i++)2568scalars[i] = scalar;2569return LLVMConstVector(scalars, vec_size);2570}2571return LLVMConstInt(type, value, 0);2572}25732574LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)2575{2576LLVMTypeRef type = LLVMTypeOf(src0);2577LLVMValueRef val;25782579/* v_med3 is selected only when max is first. (LLVM bug?) */2580val = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1));2581return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1));2582}25832584static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)2585{2586ac_enable_signed_zeros(ctx);2587/* (val + 0) converts negative zero to positive zero. */2588val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");2589ac_disable_signed_zeros(ctx);2590return val;2591}25922593LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)2594{2595LLVMTypeRef type = LLVMTypeOf(src);2596LLVMValueRef pos, neg, dw[2], val;2597unsigned bitsize = ac_get_elem_bits(ctx, type);25982599/* The standard version leads to this:2600* v_cmp_ngt_f32_e64 s[0:1], s4, 0 ; D40B0000 000100042601* v_cndmask_b32_e64 v4, 1.0, s4, s[0:1] ; D5010004 000008F22602* v_cmp_le_f32_e32 vcc, 0, v4 ; 7C0608802603* v_cndmask_b32_e32 v4, -1.0, v4, vcc ; 020808F32604*2605* The isign version:2606* v_add_f32_e64 v4, s4, 0 ; D5030004 000100042607* v_med3_i32 v4, v4, -1, 1 ; D5580004 020583042608* v_cvt_f32_i32_e32 v4, v4 ; 7E080B042609*2610* (src0 + 0) converts negative zero to positive zero.2611* After that, int(fsign(x)) == isign(floatBitsToInt(x)).2612*2613* For FP64, use the standard version, which doesn't suffer from the huge DP rate2614* reduction. (FP64 comparisons are as fast as int64 comparisons)2615*/2616if (bitsize == 16 || bitsize == 32) {2617val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));2618val = ac_build_isign(ctx, val);2619return LLVMBuildSIToFP(ctx->builder, val, type, "");2620}26212622assert(bitsize == 64);2623pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");2624neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");2625dw[0] = ctx->i32_0;2626dw[1] = LLVMBuildSelect(2627ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),2628LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),2629"");2630return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");2631}26322633LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)2634{2635LLVMValueRef result;2636unsigned bitsize;26372638bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));26392640switch (bitsize) {2641case 128:2642result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, (LLVMValueRef[]){src0}, 1,2643AC_FUNC_ATTR_READNONE);2644result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");2645break;2646case 64:2647result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,2648AC_FUNC_ATTR_READNONE);26492650result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");2651break;2652case 32:2653result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,2654AC_FUNC_ATTR_READNONE);2655break;2656case 16:2657result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,2658AC_FUNC_ATTR_READNONE);26592660result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");2661break;2662case 8:2663result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,2664AC_FUNC_ATTR_READNONE);26652666result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");2667break;2668default:2669unreachable(!"invalid bitsize");2670break;2671}26722673return result;2674}26752676LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)2677{2678LLVMValueRef result;2679unsigned bitsize;26802681bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));26822683switch (bitsize) {2684case 64:2685result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,2686AC_FUNC_ATTR_READNONE);26872688result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");2689break;2690case 32:2691result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,2692AC_FUNC_ATTR_READNONE);2693break;2694case 16:2695result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,2696AC_FUNC_ATTR_READNONE);26972698result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");2699break;2700case 8:2701result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,2702AC_FUNC_ATTR_READNONE);27032704result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");2705break;2706default:2707unreachable(!"invalid bitsize");2708break;2709}27102711return result;2712}27132714#define AC_EXP_TARGET 02715#define AC_EXP_ENABLED_CHANNELS 12716#define AC_EXP_OUT0 227172718enum ac_ir_type2719{2720AC_IR_UNDEF,2721AC_IR_CONST,2722AC_IR_VALUE,2723};27242725struct ac_vs_exp_chan {2726LLVMValueRef value;2727float const_float;2728enum ac_ir_type type;2729};27302731struct ac_vs_exp_inst {2732unsigned offset;2733LLVMValueRef inst;2734struct ac_vs_exp_chan chan[4];2735};27362737struct ac_vs_exports {2738unsigned num;2739struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];2740};27412742/* Return true if the PARAM export has been eliminated. */2743static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, uint32_t num_outputs,2744struct ac_vs_exp_inst *exp)2745{2746unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */2747bool is_zero[4] = {0}, is_one[4] = {0};27482749for (i = 0; i < 4; i++) {2750/* It's a constant expression. Undef outputs are eliminated too. */2751if (exp->chan[i].type == AC_IR_UNDEF) {2752is_zero[i] = true;2753is_one[i] = true;2754} else if (exp->chan[i].type == AC_IR_CONST) {2755if (exp->chan[i].const_float == 0)2756is_zero[i] = true;2757else if (exp->chan[i].const_float == 1)2758is_one[i] = true;2759else2760return false; /* other constant */2761} else2762return false;2763}27642765/* Only certain combinations of 0 and 1 can be eliminated. */2766if (is_zero[0] && is_zero[1] && is_zero[2])2767default_val = is_zero[3] ? 0 : 1;2768else if (is_one[0] && is_one[1] && is_one[2])2769default_val = is_zero[3] ? 2 : 3;2770else2771return false;27722773/* The PARAM export can be represented as DEFAULT_VAL. Kill it. */2774LLVMInstructionEraseFromParent(exp->inst);27752776/* Change OFFSET to DEFAULT_VAL. */2777for (i = 0; i < num_outputs; i++) {2778if (vs_output_param_offset[i] == exp->offset) {2779vs_output_param_offset[i] = AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;2780break;2781}2782}2783return true;2784}27852786static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,2787uint8_t *vs_output_param_offset, uint32_t num_outputs,2788struct ac_vs_exports *processed,2789struct ac_vs_exp_inst *exp)2790{2791unsigned p, copy_back_channels = 0;27922793/* See if the output is already in the list of processed outputs.2794* The LLVMValueRef comparison relies on SSA.2795*/2796for (p = 0; p < processed->num; p++) {2797bool different = false;27982799for (unsigned j = 0; j < 4; j++) {2800struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];2801struct ac_vs_exp_chan *c2 = &exp->chan[j];28022803/* Treat undef as a match. */2804if (c2->type == AC_IR_UNDEF)2805continue;28062807/* If c1 is undef but c2 isn't, we can copy c2 to c12808* and consider the instruction duplicated.2809*/2810if (c1->type == AC_IR_UNDEF) {2811copy_back_channels |= 1 << j;2812continue;2813}28142815/* Test whether the channels are not equal. */2816if (c1->type != c2->type ||2817(c1->type == AC_IR_CONST && c1->const_float != c2->const_float) ||2818(c1->type == AC_IR_VALUE && c1->value != c2->value)) {2819different = true;2820break;2821}2822}2823if (!different)2824break;28252826copy_back_channels = 0;2827}2828if (p == processed->num)2829return false;28302831/* If a match was found, but the matching export has undef where the new2832* one has a normal value, copy the normal value to the undef channel.2833*/2834struct ac_vs_exp_inst *match = &processed->exp[p];28352836/* Get current enabled channels mask. */2837LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);2838unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);28392840while (copy_back_channels) {2841unsigned chan = u_bit_scan(©_back_channels);28422843assert(match->chan[chan].type == AC_IR_UNDEF);2844LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, exp->chan[chan].value);2845match->chan[chan] = exp->chan[chan];28462847/* Update number of enabled channels because the original mask2848* is not always 0xf.2849*/2850enabled_channels |= (1 << chan);2851LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,2852LLVMConstInt(ctx->i32, enabled_channels, 0));2853}28542855/* The PARAM export is duplicated. Kill it. */2856LLVMInstructionEraseFromParent(exp->inst);28572858/* Change OFFSET to the matching export. */2859for (unsigned i = 0; i < num_outputs; i++) {2860if (vs_output_param_offset[i] == exp->offset) {2861vs_output_param_offset[i] = match->offset;2862break;2863}2864}2865return true;2866}28672868void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, LLVMValueRef main_fn,2869uint8_t *vs_output_param_offset, uint32_t num_outputs,2870uint32_t skip_output_mask, uint8_t *num_param_exports)2871{2872LLVMBasicBlockRef bb;2873bool removed_any = false;2874struct ac_vs_exports exports;28752876exports.num = 0;28772878/* Process all LLVM instructions. */2879bb = LLVMGetFirstBasicBlock(main_fn);2880while (bb) {2881LLVMValueRef inst = LLVMGetFirstInstruction(bb);28822883while (inst) {2884LLVMValueRef cur = inst;2885inst = LLVMGetNextInstruction(inst);2886struct ac_vs_exp_inst exp;28872888if (LLVMGetInstructionOpcode(cur) != LLVMCall)2889continue;28902891LLVMValueRef callee = ac_llvm_get_called_value(cur);28922893if (!ac_llvm_is_function(callee))2894continue;28952896const char *name = LLVMGetValueName(callee);2897unsigned num_args = LLVMCountParams(callee);28982899/* Check if this is an export instruction. */2900if ((num_args != 9 && num_args != 8) ||2901(strcmp(name, "llvm.SI.export") && strcmp(name, "llvm.amdgcn.exp.f32")))2902continue;29032904LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);2905unsigned target = LLVMConstIntGetZExtValue(arg);29062907if (target < V_008DFC_SQ_EXP_PARAM)2908continue;29092910target -= V_008DFC_SQ_EXP_PARAM;29112912/* Parse the instruction. */2913memset(&exp, 0, sizeof(exp));2914exp.offset = target;2915exp.inst = cur;29162917for (unsigned i = 0; i < 4; i++) {2918LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);29192920exp.chan[i].value = v;29212922if (LLVMIsUndef(v)) {2923exp.chan[i].type = AC_IR_UNDEF;2924} else if (LLVMIsAConstantFP(v)) {2925LLVMBool loses_info;2926exp.chan[i].type = AC_IR_CONST;2927exp.chan[i].const_float = LLVMConstRealGetDouble(v, &loses_info);2928} else {2929exp.chan[i].type = AC_IR_VALUE;2930}2931}29322933/* Eliminate constant and duplicated PARAM exports. */2934if (!((1u << target) & skip_output_mask) &&2935(ac_eliminate_const_output(vs_output_param_offset, num_outputs, &exp) ||2936ac_eliminate_duplicated_output(ctx, vs_output_param_offset, num_outputs, &exports,2937&exp))) {2938removed_any = true;2939} else {2940exports.exp[exports.num++] = exp;2941}2942}2943bb = LLVMGetNextBasicBlock(bb);2944}29452946/* Remove holes in export memory due to removed PARAM exports.2947* This is done by renumbering all PARAM exports.2948*/2949if (removed_any) {2950uint8_t old_offset[VARYING_SLOT_MAX];2951unsigned out, i;29522953/* Make a copy of the offsets. We need the old version while2954* we are modifying some of them. */2955memcpy(old_offset, vs_output_param_offset, sizeof(old_offset));29562957for (i = 0; i < exports.num; i++) {2958unsigned offset = exports.exp[i].offset;29592960/* Update vs_output_param_offset. Multiple outputs can2961* have the same offset.2962*/2963for (out = 0; out < num_outputs; out++) {2964if (old_offset[out] == offset)2965vs_output_param_offset[out] = i;2966}29672968/* Change the PARAM offset in the instruction. */2969LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,2970LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_PARAM + i, 0));2971}2972*num_param_exports = exports.num;2973}2974}29752976void ac_init_exec_full_mask(struct ac_llvm_context *ctx)2977{2978LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);2979ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1,2980AC_FUNC_ATTR_CONVERGENT);2981}29822983void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)2984{2985unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;2986ctx->lds = LLVMBuildIntToPtr(2987ctx->builder, ctx->i32_0,2988LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds");2989}29902991LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)2992{2993return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), "");2994}29952996void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)2997{2998value = ac_to_integer(ctx, value);2999ac_build_indexed_store(ctx, ctx->lds, dw_addr, value);3000}30013002LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)3003{3004unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));3005const char *intrin_name;3006LLVMTypeRef type;3007LLVMValueRef zero;30083009switch (src0_bitsize) {3010case 64:3011intrin_name = "llvm.cttz.i64";3012type = ctx->i64;3013zero = ctx->i64_0;3014break;3015case 32:3016intrin_name = "llvm.cttz.i32";3017type = ctx->i32;3018zero = ctx->i32_0;3019break;3020case 16:3021intrin_name = "llvm.cttz.i16";3022type = ctx->i16;3023zero = ctx->i16_0;3024break;3025case 8:3026intrin_name = "llvm.cttz.i8";3027type = ctx->i8;3028zero = ctx->i8_0;3029break;3030default:3031unreachable(!"invalid bitsize");3032}30333034LLVMValueRef params[2] = {3035src0,30363037/* The value of 1 means that ffs(x=0) = undef, so LLVM won't3038* add special code to check for x=0. The reason is that3039* the LLVM behavior for x=0 is different from what we3040* need here. However, LLVM also assumes that ffs(x) is3041* in [0, 31], but GLSL expects that ffs(0) = -1, so3042* a conditional assignment to handle 0 is still required.3043*3044* The hardware already implements the correct behavior.3045*/3046ctx->i1true,3047};30483049LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);30503051if (src0_bitsize == 64) {3052lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");3053} else if (src0_bitsize < 32) {3054lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");3055}30563057/* TODO: We need an intrinsic to skip this conditional. */3058/* Check for zero: */3059return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""),3060LLVMConstInt(ctx->i32, -1, 0), lsb, "");3061}30623063LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)3064{3065return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);3066}30673068LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)3069{3070return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);3071}30723073static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)3074{3075if (ctx->flow->depth > 0)3076return &ctx->flow->stack[ctx->flow->depth - 1];3077return NULL;3078}30793080static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)3081{3082for (unsigned i = ctx->flow->depth; i > 0; --i) {3083if (ctx->flow->stack[i - 1].loop_entry_block)3084return &ctx->flow->stack[i - 1];3085}3086return NULL;3087}30883089static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)3090{3091struct ac_llvm_flow *flow;30923093if (ctx->flow->depth >= ctx->flow->depth_max) {3094unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);30953096ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));3097ctx->flow->depth_max = new_max;3098}30993100flow = &ctx->flow->stack[ctx->flow->depth];3101ctx->flow->depth++;31023103flow->next_block = NULL;3104flow->loop_entry_block = NULL;3105return flow;3106}31073108static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)3109{3110char buf[32];3111snprintf(buf, sizeof(buf), "%s%d", base, label_id);3112LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);3113}31143115/* Append a basic block at the level of the parent flow.3116*/3117static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)3118{3119assert(ctx->flow->depth >= 1);31203121if (ctx->flow->depth >= 2) {3122struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];31233124return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);3125}31263127LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));3128return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);3129}31303131/* Emit a branch to the given default target for the current block if3132* applicable -- that is, if the current block does not already contain a3133* branch from a break or continue.3134*/3135static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)3136{3137if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))3138LLVMBuildBr(builder, target);3139}31403141void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)3142{3143struct ac_llvm_flow *flow = push_flow(ctx);3144flow->loop_entry_block = append_basic_block(ctx, "LOOP");3145flow->next_block = append_basic_block(ctx, "ENDLOOP");3146set_basicblock_name(flow->loop_entry_block, "loop", label_id);3147LLVMBuildBr(ctx->builder, flow->loop_entry_block);3148LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);3149}31503151void ac_build_break(struct ac_llvm_context *ctx)3152{3153struct ac_llvm_flow *flow = get_innermost_loop(ctx);3154LLVMBuildBr(ctx->builder, flow->next_block);3155}31563157void ac_build_continue(struct ac_llvm_context *ctx)3158{3159struct ac_llvm_flow *flow = get_innermost_loop(ctx);3160LLVMBuildBr(ctx->builder, flow->loop_entry_block);3161}31623163void ac_build_else(struct ac_llvm_context *ctx, int label_id)3164{3165struct ac_llvm_flow *current_branch = get_current_flow(ctx);3166LLVMBasicBlockRef endif_block;31673168assert(!current_branch->loop_entry_block);31693170endif_block = append_basic_block(ctx, "ENDIF");3171emit_default_branch(ctx->builder, endif_block);31723173LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);3174set_basicblock_name(current_branch->next_block, "else", label_id);31753176current_branch->next_block = endif_block;3177}31783179/* Invoked after a branch is exited. */3180static void ac_branch_exited(struct ac_llvm_context *ctx)3181{3182if (ctx->flow->depth == 0 && ctx->conditional_demote_seen) {3183/* The previous conditional branch contained demote. Kill threads3184* after all conditional blocks because amdgcn.wqm.vote doesn't3185* return usable values inside the blocks.3186*3187* This is an optional optimization that only kills whole inactive quads.3188*/3189LLVMValueRef cond = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, "");3190ac_build_kill_if_false(ctx, ac_build_wqm_vote(ctx, cond));3191ctx->conditional_demote_seen = false;3192}3193}31943195void ac_build_endif(struct ac_llvm_context *ctx, int label_id)3196{3197struct ac_llvm_flow *current_branch = get_current_flow(ctx);31983199assert(!current_branch->loop_entry_block);32003201emit_default_branch(ctx->builder, current_branch->next_block);3202LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);3203set_basicblock_name(current_branch->next_block, "endif", label_id);32043205ctx->flow->depth--;3206ac_branch_exited(ctx);3207}32083209void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)3210{3211struct ac_llvm_flow *current_loop = get_current_flow(ctx);32123213assert(current_loop->loop_entry_block);32143215emit_default_branch(ctx->builder, current_loop->loop_entry_block);32163217LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);3218set_basicblock_name(current_loop->next_block, "endloop", label_id);3219ctx->flow->depth--;3220ac_branch_exited(ctx);3221}32223223void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)3224{3225struct ac_llvm_flow *flow = push_flow(ctx);3226LLVMBasicBlockRef if_block;32273228if_block = append_basic_block(ctx, "IF");3229flow->next_block = append_basic_block(ctx, "ELSE");3230set_basicblock_name(if_block, "if", label_id);3231LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);3232LLVMPositionBuilderAtEnd(ctx->builder, if_block);3233}32343235LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)3236{3237LLVMBuilderRef builder = ac->builder;3238LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);3239LLVMValueRef function = LLVMGetBasicBlockParent(current_block);3240LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);3241LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);3242LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);3243LLVMValueRef res;32443245if (first_instr) {3246LLVMPositionBuilderBefore(first_builder, first_instr);3247} else {3248LLVMPositionBuilderAtEnd(first_builder, first_block);3249}32503251res = LLVMBuildAlloca(first_builder, type, name);3252LLVMDisposeBuilder(first_builder);3253return res;3254}32553256LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)3257{3258LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);3259LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);3260return ptr;3261}32623263LLVMValueRef ac_build_alloca_init(struct ac_llvm_context *ac, LLVMValueRef val, const char *name)3264{3265LLVMValueRef ptr = ac_build_alloca_undef(ac, LLVMTypeOf(val), name);3266LLVMBuildStore(ac->builder, val, ptr);3267return ptr;3268}32693270LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)3271{3272int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));3273return LLVMBuildBitCast(ctx->builder, ptr, LLVMPointerType(type, addr_space), "");3274}32753276LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)3277{3278unsigned num_components = ac_get_llvm_num_components(value);3279if (count == num_components)3280return value;32813282LLVMValueRef *const masks = alloca(MAX2(count, 2) * sizeof(LLVMValueRef));3283masks[0] = ctx->i32_0;3284masks[1] = ctx->i32_1;3285for (unsigned i = 2; i < count; i++)3286masks[i] = LLVMConstInt(ctx->i32, i, false);32873288if (count == 1)3289return LLVMBuildExtractElement(ctx->builder, value, masks[0], "");32903291LLVMValueRef swizzle = LLVMConstVector(masks, count);3292return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");3293}32943295/* If param is i64 and bitwidth <= 32, the return value will be i32. */3296LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,3297unsigned bitwidth)3298{3299LLVMValueRef value = param;3300if (rshift)3301value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), "");33023303if (rshift + bitwidth < 32) {3304uint64_t mask = (1ull << bitwidth) - 1;3305value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), "");3306}33073308if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64)3309value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, "");3310return value;3311}33123313/* Adjust the sample index according to FMASK.3314*3315* For uncompressed MSAA surfaces, FMASK should return 0x76543210,3316* which is the identity mapping. Each nibble says which physical sample3317* should be fetched to get that sample.3318*3319* For example, 0x11111100 means there are only 2 samples stored and3320* the second sample covers 3/4 of the pixel. When reading samples 03321* and 1, return physical sample 0 (determined by the first two 0s3322* in FMASK), otherwise return physical sample 1.3323*3324* The sample index should be adjusted as follows:3325* addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;3326*/3327void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr,3328bool is_array_tex)3329{3330struct ac_image_args fmask_load = {0};3331fmask_load.opcode = ac_image_load;3332fmask_load.resource = fmask;3333fmask_load.dmask = 0xf;3334fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;3335fmask_load.attributes = AC_FUNC_ATTR_READNONE;33363337fmask_load.coords[0] = addr[0];3338fmask_load.coords[1] = addr[1];3339if (is_array_tex)3340fmask_load.coords[2] = addr[2];3341fmask_load.a16 = ac_get_elem_bits(ac, LLVMTypeOf(addr[0])) == 16;33423343LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);3344fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, "");33453346/* Apply the formula. */3347unsigned sample_chan = is_array_tex ? 3 : 2;3348LLVMValueRef final_sample;3349final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],3350LLVMConstInt(LLVMTypeOf(addr[0]), 4, 0), "");3351final_sample = LLVMBuildLShr(ac->builder, fmask_value,3352LLVMBuildZExt(ac->builder, final_sample, ac->i32, ""), "");3353/* Mask the sample index by 0x7, because 0x8 means an unknown value3354* with EQAA, so those will map to 0. */3355final_sample = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), "");3356if (fmask_load.a16)3357final_sample = LLVMBuildTrunc(ac->builder, final_sample, ac->i16, "");33583359/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK3360* resource descriptor is 0 (invalid).3361*/3362LLVMValueRef tmp;3363tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");3364tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");3365tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");33663367/* Replace the MSAA sample index. */3368addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, addr[sample_chan], "");3369}33703371static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,3372LLVMValueRef lane, bool with_opt_barrier)3373{3374LLVMTypeRef type = LLVMTypeOf(src);3375LLVMValueRef result;33763377if (with_opt_barrier)3378ac_build_optimization_barrier(ctx, &src, false);33793380src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");3381if (lane)3382lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");33833384result =3385ac_build_intrinsic(ctx, lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",3386ctx->i32, (LLVMValueRef[]){src, lane}, lane == NULL ? 1 : 2,3387AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);33883389return LLVMBuildTrunc(ctx->builder, result, type, "");3390}33913392static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,3393LLVMValueRef lane, bool with_opt_barrier)3394{3395LLVMTypeRef src_type = LLVMTypeOf(src);3396src = ac_to_integer(ctx, src);3397unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));3398LLVMValueRef ret;33993400if (bits > 32) {3401assert(bits % 32 == 0);3402LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);3403LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");3404ret = LLVMGetUndef(vec_type);3405for (unsigned i = 0; i < bits / 32; i++) {3406LLVMValueRef ret_comp;34073408src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");34093410ret_comp = _ac_build_readlane(ctx, src, lane, with_opt_barrier);34113412ret =3413LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");3414}3415} else {3416ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier);3417}34183419if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)3420return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");3421return LLVMBuildBitCast(ctx->builder, ret, src_type, "");3422}34233424/**3425* Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.3426*3427* The optimization barrier is not needed if the value is the same in all lanes3428* or if this is called in the outermost block.3429*3430* @param ctx3431* @param src3432* @param lane - id of the lane or NULL for the first active lane3433* @return value of the lane3434*/3435LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,3436LLVMValueRef lane)3437{3438return ac_build_readlane_common(ctx, src, lane, false);3439}34403441LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)3442{3443return ac_build_readlane_common(ctx, src, lane, true);3444}34453446LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,3447LLVMValueRef lane)3448{3449return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32,3450(LLVMValueRef[]){value, lane, src}, 3,3451AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);3452}34533454LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src)3455{3456if (ctx->wave_size == 32) {3457LLVMValueRef val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,3458(LLVMValueRef[]){mask, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);3459ac_set_range_metadata(ctx, val, 0, ctx->wave_size);3460return val;3461}3462LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");3463LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, "");3464LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, "");3465LLVMValueRef val =3466ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,3467(LLVMValueRef[]){mask_lo, add_src}, 2, AC_FUNC_ATTR_READNONE);3468val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, (LLVMValueRef[]){mask_hi, val},34692, AC_FUNC_ATTR_READNONE);3470ac_set_range_metadata(ctx, val, 0, ctx->wave_size);3471return val;3472}34733474LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)3475{3476return ac_build_mbcnt_add(ctx, mask, ctx->i32_0);3477}34783479enum dpp_ctrl3480{3481_dpp_quad_perm = 0x000,3482_dpp_row_sl = 0x100,3483_dpp_row_sr = 0x110,3484_dpp_row_rr = 0x120,3485dpp_wf_sl1 = 0x130,3486dpp_wf_rl1 = 0x134,3487dpp_wf_sr1 = 0x138,3488dpp_wf_rr1 = 0x13C,3489dpp_row_mirror = 0x140,3490dpp_row_half_mirror = 0x141,3491dpp_row_bcast15 = 0x142,3492dpp_row_bcast31 = 0x1433493};34943495static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,3496unsigned lane3)3497{3498assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);3499return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);3500}35013502static inline enum dpp_ctrl dpp_row_sl(unsigned amount)3503{3504assert(amount > 0 && amount < 16);3505return _dpp_row_sl | amount;3506}35073508static inline enum dpp_ctrl dpp_row_sr(unsigned amount)3509{3510assert(amount > 0 && amount < 16);3511return _dpp_row_sr | amount;3512}35133514static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,3515enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,3516bool bound_ctrl)3517{3518LLVMTypeRef type = LLVMTypeOf(src);3519LLVMValueRef res;35203521old = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");3522src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");35233524res = ac_build_intrinsic(3525ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32,3526(LLVMValueRef[]){old, src, LLVMConstInt(ctx->i32, dpp_ctrl, 0),3527LLVMConstInt(ctx->i32, row_mask, 0), LLVMConstInt(ctx->i32, bank_mask, 0),3528LLVMConstInt(ctx->i1, bound_ctrl, 0)},35296, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);35303531return LLVMBuildTrunc(ctx->builder, res, type, "");3532}35333534static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,3535enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,3536bool bound_ctrl)3537{3538LLVMTypeRef src_type = LLVMTypeOf(src);3539src = ac_to_integer(ctx, src);3540old = ac_to_integer(ctx, old);3541unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));3542LLVMValueRef ret;3543if (bits > 32) {3544assert(bits % 32 == 0);3545LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);3546LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");3547LLVMValueRef old_vector = LLVMBuildBitCast(ctx->builder, old, vec_type, "");3548ret = LLVMGetUndef(vec_type);3549for (unsigned i = 0; i < bits / 32; i++) {3550src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");3551old = LLVMBuildExtractElement(ctx->builder, old_vector, LLVMConstInt(ctx->i32, i, 0), "");3552LLVMValueRef ret_comp =3553_ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);3554ret =3555LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");3556}3557} else {3558ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);3559}3560return LLVMBuildBitCast(ctx->builder, ret, src_type, "");3561}35623563static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,3564uint64_t sel, bool exchange_rows, bool bound_ctrl)3565{3566LLVMTypeRef type = LLVMTypeOf(src);3567LLVMValueRef result;35683569src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");35703571LLVMValueRef args[6] = {3572src,3573src,3574LLVMConstInt(ctx->i32, sel, false),3575LLVMConstInt(ctx->i32, sel >> 32, false),3576ctx->i1true, /* fi */3577bound_ctrl ? ctx->i1true : ctx->i1false,3578};35793580result =3581ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16",3582ctx->i32, args, 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);35833584return LLVMBuildTrunc(ctx->builder, result, type, "");3585}35863587static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,3588bool exchange_rows, bool bound_ctrl)3589{3590LLVMTypeRef src_type = LLVMTypeOf(src);3591src = ac_to_integer(ctx, src);3592unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));3593LLVMValueRef ret;3594if (bits > 32) {3595assert(bits % 32 == 0);3596LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);3597LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");3598ret = LLVMGetUndef(vec_type);3599for (unsigned i = 0; i < bits / 32; i++) {3600src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");3601LLVMValueRef ret_comp = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);3602ret =3603LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");3604}3605} else {3606ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);3607}3608return LLVMBuildBitCast(ctx->builder, ret, src_type, "");3609}36103611static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)3612{3613assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);3614return and_mask | (or_mask << 5) | (xor_mask << 10);3615}36163617static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,3618unsigned mask)3619{3620LLVMTypeRef src_type = LLVMTypeOf(src);3621LLVMValueRef ret;36223623src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");36243625ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32,3626(LLVMValueRef[]){src, LLVMConstInt(ctx->i32, mask, 0)}, 2,3627AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);36283629return LLVMBuildTrunc(ctx->builder, ret, src_type, "");3630}36313632LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)3633{3634LLVMTypeRef src_type = LLVMTypeOf(src);3635src = ac_to_integer(ctx, src);3636unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));3637LLVMValueRef ret;3638if (bits > 32) {3639assert(bits % 32 == 0);3640LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);3641LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");3642ret = LLVMGetUndef(vec_type);3643for (unsigned i = 0; i < bits / 32; i++) {3644src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");3645LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, mask);3646ret =3647LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");3648}3649} else {3650ret = _ac_build_ds_swizzle(ctx, src, mask);3651}3652return LLVMBuildBitCast(ctx->builder, ret, src_type, "");3653}36543655static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)3656{3657LLVMTypeRef src_type = LLVMTypeOf(src);3658unsigned bitsize = ac_get_elem_bits(ctx, src_type);3659char name[32], type[8];3660LLVMValueRef ret;36613662src = ac_to_integer(ctx, src);36633664if (bitsize < 32)3665src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");36663667ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));3668snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);3669ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1,3670AC_FUNC_ATTR_READNONE);36713672if (bitsize < 32)3673ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");36743675return LLVMBuildBitCast(ctx->builder, ret, src_type, "");3676}36773678static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,3679LLVMValueRef inactive)3680{3681char name[33], type[8];3682LLVMTypeRef src_type = LLVMTypeOf(src);3683unsigned bitsize = ac_get_elem_bits(ctx, src_type);3684src = ac_to_integer(ctx, src);3685inactive = ac_to_integer(ctx, inactive);36863687if (bitsize < 32) {3688src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");3689inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");3690}36913692ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));3693snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);3694LLVMValueRef ret =3695ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2,3696AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);3697if (bitsize < 32)3698ret = LLVMBuildTrunc(ctx->builder, ret, src_type, "");36993700return ret;3701}37023703static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,3704unsigned type_size)3705{37063707if (type_size == 0) {3708switch (op) {3709case nir_op_ior:3710case nir_op_ixor:3711return LLVMConstInt(ctx->i1, 0, 0);3712case nir_op_iand:3713return LLVMConstInt(ctx->i1, 1, 0);3714default:3715unreachable("bad reduction intrinsic");3716}3717} else if (type_size == 1) {3718switch (op) {3719case nir_op_iadd:3720return ctx->i8_0;3721case nir_op_imul:3722return ctx->i8_1;3723case nir_op_imin:3724return LLVMConstInt(ctx->i8, INT8_MAX, 0);3725case nir_op_umin:3726return LLVMConstInt(ctx->i8, UINT8_MAX, 0);3727case nir_op_imax:3728return LLVMConstInt(ctx->i8, INT8_MIN, 0);3729case nir_op_umax:3730return ctx->i8_0;3731case nir_op_iand:3732return LLVMConstInt(ctx->i8, -1, 0);3733case nir_op_ior:3734return ctx->i8_0;3735case nir_op_ixor:3736return ctx->i8_0;3737default:3738unreachable("bad reduction intrinsic");3739}3740} else if (type_size == 2) {3741switch (op) {3742case nir_op_iadd:3743return ctx->i16_0;3744case nir_op_fadd:3745return ctx->f16_0;3746case nir_op_imul:3747return ctx->i16_1;3748case nir_op_fmul:3749return ctx->f16_1;3750case nir_op_imin:3751return LLVMConstInt(ctx->i16, INT16_MAX, 0);3752case nir_op_umin:3753return LLVMConstInt(ctx->i16, UINT16_MAX, 0);3754case nir_op_fmin:3755return LLVMConstReal(ctx->f16, INFINITY);3756case nir_op_imax:3757return LLVMConstInt(ctx->i16, INT16_MIN, 0);3758case nir_op_umax:3759return ctx->i16_0;3760case nir_op_fmax:3761return LLVMConstReal(ctx->f16, -INFINITY);3762case nir_op_iand:3763return LLVMConstInt(ctx->i16, -1, 0);3764case nir_op_ior:3765return ctx->i16_0;3766case nir_op_ixor:3767return ctx->i16_0;3768default:3769unreachable("bad reduction intrinsic");3770}3771} else if (type_size == 4) {3772switch (op) {3773case nir_op_iadd:3774return ctx->i32_0;3775case nir_op_fadd:3776return ctx->f32_0;3777case nir_op_imul:3778return ctx->i32_1;3779case nir_op_fmul:3780return ctx->f32_1;3781case nir_op_imin:3782return LLVMConstInt(ctx->i32, INT32_MAX, 0);3783case nir_op_umin:3784return LLVMConstInt(ctx->i32, UINT32_MAX, 0);3785case nir_op_fmin:3786return LLVMConstReal(ctx->f32, INFINITY);3787case nir_op_imax:3788return LLVMConstInt(ctx->i32, INT32_MIN, 0);3789case nir_op_umax:3790return ctx->i32_0;3791case nir_op_fmax:3792return LLVMConstReal(ctx->f32, -INFINITY);3793case nir_op_iand:3794return LLVMConstInt(ctx->i32, -1, 0);3795case nir_op_ior:3796return ctx->i32_0;3797case nir_op_ixor:3798return ctx->i32_0;3799default:3800unreachable("bad reduction intrinsic");3801}3802} else { /* type_size == 64bit */3803switch (op) {3804case nir_op_iadd:3805return ctx->i64_0;3806case nir_op_fadd:3807return ctx->f64_0;3808case nir_op_imul:3809return ctx->i64_1;3810case nir_op_fmul:3811return ctx->f64_1;3812case nir_op_imin:3813return LLVMConstInt(ctx->i64, INT64_MAX, 0);3814case nir_op_umin:3815return LLVMConstInt(ctx->i64, UINT64_MAX, 0);3816case nir_op_fmin:3817return LLVMConstReal(ctx->f64, INFINITY);3818case nir_op_imax:3819return LLVMConstInt(ctx->i64, INT64_MIN, 0);3820case nir_op_umax:3821return ctx->i64_0;3822case nir_op_fmax:3823return LLVMConstReal(ctx->f64, -INFINITY);3824case nir_op_iand:3825return LLVMConstInt(ctx->i64, -1, 0);3826case nir_op_ior:3827return ctx->i64_0;3828case nir_op_ixor:3829return ctx->i64_0;3830default:3831unreachable("bad reduction intrinsic");3832}3833}3834}38353836static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,3837nir_op op)3838{3839bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;3840bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4;3841switch (op) {3842case nir_op_iadd:3843return LLVMBuildAdd(ctx->builder, lhs, rhs, "");3844case nir_op_fadd:3845return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");3846case nir_op_imul:3847return LLVMBuildMul(ctx->builder, lhs, rhs, "");3848case nir_op_fmul:3849return LLVMBuildFMul(ctx->builder, lhs, rhs, "");3850case nir_op_imin:3851return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),3852lhs, rhs, "");3853case nir_op_umin:3854return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),3855lhs, rhs, "");3856case nir_op_fmin:3857return ac_build_intrinsic(3858ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",3859_64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,3860AC_FUNC_ATTR_READNONE);3861case nir_op_imax:3862return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),3863lhs, rhs, "");3864case nir_op_umax:3865return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),3866lhs, rhs, "");3867case nir_op_fmax:3868return ac_build_intrinsic(3869ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",3870_64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,3871AC_FUNC_ATTR_READNONE);3872case nir_op_iand:3873return LLVMBuildAnd(ctx->builder, lhs, rhs, "");3874case nir_op_ior:3875return LLVMBuildOr(ctx->builder, lhs, rhs, "");3876case nir_op_ixor:3877return LLVMBuildXor(ctx->builder, lhs, rhs, "");3878default:3879unreachable("bad reduction intrinsic");3880}3881}38823883/**3884* \param src The value to shift.3885* \param identity The value to use the first lane.3886* \param maxprefix specifies that the result only needs to be correct for a3887* prefix of this many threads3888* \return src, shifted 1 lane up, and identity shifted into lane 0.3889*/3890static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,3891LLVMValueRef identity, unsigned maxprefix)3892{3893if (ctx->chip_class >= GFX10) {3894/* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */3895LLVMValueRef active, tmp1, tmp2;3896LLVMValueRef tid = ac_get_thread_id(ctx);38973898tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);38993900tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);39013902if (maxprefix > 32) {3903active =3904LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");39053906tmp2 = LLVMBuildSelect(ctx->builder, active,3907ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),3908tmp2, "");39093910active = LLVMBuildOr(3911ctx->builder, active,3912LLVMBuildICmp(ctx->builder, LLVMIntEQ,3913LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),3914LLVMConstInt(ctx->i32, 0x10, false), ""),3915"");3916return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");3917} else if (maxprefix > 16) {3918active =3919LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");39203921return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");3922}3923} else if (ctx->chip_class >= GFX8) {3924return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);3925}39263927/* wavefront shift_right by 1 on SI/CI */3928LLVMValueRef active, tmp1, tmp2;3929LLVMValueRef tid = ac_get_thread_id(ctx);3930tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));3931tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));3932active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,3933LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),3934LLVMConstInt(ctx->i32, 0x4, 0), "");3935tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");3936tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));3937active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,3938LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),3939LLVMConstInt(ctx->i32, 0x8, 0), "");3940tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");3941tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));3942active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,3943LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),3944LLVMConstInt(ctx->i32, 0x10, 0), "");3945tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");3946tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));3947active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");3948tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");3949active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");3950return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");3951}39523953/**3954* \param maxprefix specifies that the result only needs to be correct for a3955* prefix of this many threads3956*/3957static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,3958LLVMValueRef identity, unsigned maxprefix, bool inclusive)3959{3960LLVMValueRef result, tmp;39613962if (!inclusive)3963src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);39643965result = src;39663967if (ctx->chip_class <= GFX7) {3968assert(maxprefix == 64);3969LLVMValueRef tid = ac_get_thread_id(ctx);3970LLVMValueRef active;3971tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));3972active = LLVMBuildICmp(ctx->builder, LLVMIntNE,3973LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");3974tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");3975result = ac_build_alu_op(ctx, result, tmp, op);3976tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));3977active = LLVMBuildICmp(ctx->builder, LLVMIntNE,3978LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),3979ctx->i32_0, "");3980tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");3981result = ac_build_alu_op(ctx, result, tmp, op);3982tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));3983active = LLVMBuildICmp(ctx->builder, LLVMIntNE,3984LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),3985ctx->i32_0, "");3986tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");3987result = ac_build_alu_op(ctx, result, tmp, op);3988tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));3989active = LLVMBuildICmp(ctx->builder, LLVMIntNE,3990LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),3991ctx->i32_0, "");3992tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");3993result = ac_build_alu_op(ctx, result, tmp, op);3994tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));3995active = LLVMBuildICmp(ctx->builder, LLVMIntNE,3996LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),3997ctx->i32_0, "");3998tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");3999result = ac_build_alu_op(ctx, result, tmp, op);4000tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));4001active = LLVMBuildICmp(ctx->builder, LLVMIntNE,4002LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),4003ctx->i32_0, "");4004tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");4005result = ac_build_alu_op(ctx, result, tmp, op);4006return result;4007}40084009if (maxprefix <= 1)4010return result;4011tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);4012result = ac_build_alu_op(ctx, result, tmp, op);4013if (maxprefix <= 2)4014return result;4015tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);4016result = ac_build_alu_op(ctx, result, tmp, op);4017if (maxprefix <= 3)4018return result;4019tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);4020result = ac_build_alu_op(ctx, result, tmp, op);4021if (maxprefix <= 4)4022return result;4023tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);4024result = ac_build_alu_op(ctx, result, tmp, op);4025if (maxprefix <= 8)4026return result;4027tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);4028result = ac_build_alu_op(ctx, result, tmp, op);4029if (maxprefix <= 16)4030return result;40314032if (ctx->chip_class >= GFX10) {4033LLVMValueRef tid = ac_get_thread_id(ctx);4034LLVMValueRef active;40354036tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);40374038active = LLVMBuildICmp(ctx->builder, LLVMIntNE,4039LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),4040ctx->i32_0, "");40414042tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");40434044result = ac_build_alu_op(ctx, result, tmp, op);40454046if (maxprefix <= 32)4047return result;40484049tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));40504051active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");40524053tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");40544055result = ac_build_alu_op(ctx, result, tmp, op);4056return result;4057}40584059tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);4060result = ac_build_alu_op(ctx, result, tmp, op);4061if (maxprefix <= 32)4062return result;4063tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);4064result = ac_build_alu_op(ctx, result, tmp, op);4065return result;4066}40674068LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)4069{4070LLVMValueRef result;40714072if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {4073LLVMBuilderRef builder = ctx->builder;4074src = LLVMBuildZExt(builder, src, ctx->i32, "");4075result = ac_build_ballot(ctx, src);4076result = ac_build_mbcnt(ctx, result);4077result = LLVMBuildAdd(builder, result, src, "");4078return result;4079}40804081ac_build_optimization_barrier(ctx, &src, false);40824083LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));4084result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),4085LLVMTypeOf(identity), "");4086result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);40874088return ac_build_wwm(ctx, result);4089}40904091LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)4092{4093LLVMValueRef result;40944095if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {4096LLVMBuilderRef builder = ctx->builder;4097src = LLVMBuildZExt(builder, src, ctx->i32, "");4098result = ac_build_ballot(ctx, src);4099result = ac_build_mbcnt(ctx, result);4100return result;4101}41024103ac_build_optimization_barrier(ctx, &src, false);41044105LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));4106result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),4107LLVMTypeOf(identity), "");4108result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);41094110return ac_build_wwm(ctx, result);4111}41124113LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,4114unsigned cluster_size)4115{4116if (cluster_size == 1)4117return src;4118ac_build_optimization_barrier(ctx, &src, false);4119LLVMValueRef result, swap;4120LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));4121result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),4122LLVMTypeOf(identity), "");4123swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);4124result = ac_build_alu_op(ctx, result, swap, op);4125if (cluster_size == 2)4126return ac_build_wwm(ctx, result);41274128swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);4129result = ac_build_alu_op(ctx, result, swap, op);4130if (cluster_size == 4)4131return ac_build_wwm(ctx, result);41324133if (ctx->chip_class >= GFX8)4134swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);4135else4136swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));4137result = ac_build_alu_op(ctx, result, swap, op);4138if (cluster_size == 8)4139return ac_build_wwm(ctx, result);41404141if (ctx->chip_class >= GFX8)4142swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);4143else4144swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));4145result = ac_build_alu_op(ctx, result, swap, op);4146if (cluster_size == 16)4147return ac_build_wwm(ctx, result);41484149if (ctx->chip_class >= GFX10)4150swap = ac_build_permlane16(ctx, result, 0, true, false);4151else if (ctx->chip_class >= GFX8 && cluster_size != 32)4152swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);4153else4154swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));4155result = ac_build_alu_op(ctx, result, swap, op);4156if (cluster_size == 32)4157return ac_build_wwm(ctx, result);41584159if (ctx->chip_class >= GFX8) {4160if (ctx->wave_size == 64) {4161if (ctx->chip_class >= GFX10)4162swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));4163else4164swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);4165result = ac_build_alu_op(ctx, result, swap, op);4166result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));4167}41684169return ac_build_wwm(ctx, result);4170} else {4171swap = ac_build_readlane(ctx, result, ctx->i32_0);4172result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));4173result = ac_build_alu_op(ctx, result, swap, op);4174return ac_build_wwm(ctx, result);4175}4176}41774178/**4179* "Top half" of a scan that reduces per-wave values across an entire4180* workgroup.4181*4182* The source value must be present in the highest lane of the wave, and the4183* highest lane must be live.4184*/4185void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)4186{4187if (ws->maxwaves <= 1)4188return;41894190const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);4191LLVMBuilderRef builder = ctx->builder;4192LLVMValueRef tid = ac_get_thread_id(ctx);4193LLVMValueRef tmp;41944195tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");4196ac_build_ifcc(ctx, tmp, 1000);4197LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));4198ac_build_endif(ctx, 1000);4199}42004201/**4202* "Bottom half" of a scan that reduces per-wave values across an entire4203* workgroup.4204*4205* The caller must place a barrier between the top and bottom halves.4206*/4207void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)4208{4209const LLVMTypeRef type = LLVMTypeOf(ws->src);4210const LLVMValueRef identity = get_reduction_identity(ctx, ws->op, ac_get_type_size(type));42114212if (ws->maxwaves <= 1) {4213ws->result_reduce = ws->src;4214ws->result_inclusive = ws->src;4215ws->result_exclusive = identity;4216return;4217}4218assert(ws->maxwaves <= 32);42194220LLVMBuilderRef builder = ctx->builder;4221LLVMValueRef tid = ac_get_thread_id(ctx);4222LLVMBasicBlockRef bbs[2];4223LLVMValueRef phivalues_scan[2];4224LLVMValueRef tmp, tmp2;42254226bbs[0] = LLVMGetInsertBlock(builder);4227phivalues_scan[0] = LLVMGetUndef(type);42284229if (ws->enable_reduce)4230tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");4231else if (ws->enable_inclusive)4232tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");4233else4234tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");4235ac_build_ifcc(ctx, tmp, 1001);4236{4237tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");42384239ac_build_optimization_barrier(ctx, &tmp, false);42404241bbs[1] = LLVMGetInsertBlock(builder);4242phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);4243}4244ac_build_endif(ctx, 1001);42454246const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);42474248if (ws->enable_reduce) {4249tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");4250ws->result_reduce = ac_build_readlane(ctx, scan, tmp);4251}4252if (ws->enable_inclusive)4253ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);4254if (ws->enable_exclusive) {4255tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");4256tmp = ac_build_readlane(ctx, scan, tmp);4257tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");4258ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");4259}4260}42614262/**4263* Inclusive scan of a per-wave value across an entire workgroup.4264*4265* This implies an s_barrier instruction.4266*4267* Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads4268* of the workgroup are live. (This requirement cannot easily be relaxed in a4269* useful manner because of the barrier in the algorithm.)4270*/4271void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)4272{4273ac_build_wg_wavescan_top(ctx, ws);4274ac_build_s_barrier(ctx);4275ac_build_wg_wavescan_bottom(ctx, ws);4276}42774278/**4279* "Top half" of a scan that reduces per-thread values across an entire4280* workgroup.4281*4282* All lanes must be active when this code runs.4283*/4284void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)4285{4286if (ws->enable_exclusive) {4287ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);4288if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)4289ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");4290ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);4291} else {4292ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);4293}42944295bool enable_inclusive = ws->enable_inclusive;4296bool enable_exclusive = ws->enable_exclusive;4297ws->enable_inclusive = false;4298ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;4299ac_build_wg_wavescan_top(ctx, ws);4300ws->enable_inclusive = enable_inclusive;4301ws->enable_exclusive = enable_exclusive;4302}43034304/**4305* "Bottom half" of a scan that reduces per-thread values across an entire4306* workgroup.4307*4308* The caller must place a barrier between the top and bottom halves.4309*/4310void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)4311{4312bool enable_inclusive = ws->enable_inclusive;4313bool enable_exclusive = ws->enable_exclusive;4314ws->enable_inclusive = false;4315ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;4316ac_build_wg_wavescan_bottom(ctx, ws);4317ws->enable_inclusive = enable_inclusive;4318ws->enable_exclusive = enable_exclusive;43194320/* ws->result_reduce is already the correct value */4321if (ws->enable_inclusive)4322ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);4323if (ws->enable_exclusive)4324ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);4325}43264327/**4328* A scan that reduces per-thread values across an entire workgroup.4329*4330* The caller must ensure that all lanes are active when this code runs4331* (WWM is insufficient!), because there is an implied barrier.4332*/4333void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)4334{4335ac_build_wg_scan_top(ctx, ws);4336ac_build_s_barrier(ctx);4337ac_build_wg_scan_bottom(ctx, ws);4338}43394340LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,4341unsigned lane1, unsigned lane2, unsigned lane3)4342{4343unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);4344if (ctx->chip_class >= GFX8) {4345return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);4346} else {4347return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);4348}4349}43504351LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)4352{4353LLVMTypeRef type = LLVMTypeOf(src);4354LLVMValueRef result;43554356index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");4357src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");43584359result =4360ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, (LLVMValueRef[]){index, src}, 2,4361AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);4362return LLVMBuildTrunc(ctx->builder, result, type, "");4363}43644365LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)4366{4367LLVMTypeRef type;4368char *intr;43694370if (bitsize == 16) {4371intr = "llvm.amdgcn.frexp.exp.i16.f16";4372type = ctx->i16;4373} else if (bitsize == 32) {4374intr = "llvm.amdgcn.frexp.exp.i32.f32";4375type = ctx->i32;4376} else {4377intr = "llvm.amdgcn.frexp.exp.i32.f64";4378type = ctx->i32;4379}43804381LLVMValueRef params[] = {4382src0,4383};4384return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);4385}4386LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)4387{4388LLVMTypeRef type;4389char *intr;43904391if (bitsize == 16) {4392intr = "llvm.amdgcn.frexp.mant.f16";4393type = ctx->f16;4394} else if (bitsize == 32) {4395intr = "llvm.amdgcn.frexp.mant.f32";4396type = ctx->f32;4397} else {4398intr = "llvm.amdgcn.frexp.mant.f64";4399type = ctx->f64;4400}44014402LLVMValueRef params[] = {4403src0,4404};4405return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);4406}44074408LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)4409{4410LLVMTypeRef type;4411char *intr;44124413if (bitsize == 16) {4414intr = "llvm.canonicalize.f16";4415type = ctx->f16;4416} else if (bitsize == 32) {4417intr = "llvm.canonicalize.f32";4418type = ctx->f32;4419} else {4420intr = "llvm.canonicalize.f64";4421type = ctx->f64;4422}44234424LLVMValueRef params[] = {4425src0,4426};4427return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);4428}44294430/*4431* this takes an I,J coordinate pair,4432* and works out the X and Y derivatives.4433* it returns DDX(I), DDX(J), DDY(I), DDY(J).4434*/4435LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)4436{4437LLVMValueRef result[4], a;4438unsigned i;44394440for (i = 0; i < 2; i++) {4441a = LLVMBuildExtractElement(ctx->builder, interp_ij, LLVMConstInt(ctx->i32, i, false), "");4442result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);4443result[2 + i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);4444}4445return ac_build_gather_values(ctx, result, 4);4446}44474448LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)4449{4450LLVMValueRef result;44514452if (LLVM_VERSION_MAJOR >= 13) {4453result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0,4454AC_FUNC_ATTR_READONLY | AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);4455} else {4456result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0,4457AC_FUNC_ATTR_READNONE);4458}4459return LLVMBuildNot(ctx->builder, result, "");4460}44614462LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx)4463{4464if (!ctx->postponed_kill)4465return ac_build_load_helper_invocation(ctx);44664467/* postponed_kill should be NULL on LLVM 13+ */4468assert(LLVM_VERSION_MAJOR < 13);44694470/* !(exact && postponed) */4471LLVMValueRef exact =4472ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);44734474LLVMValueRef postponed = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, "");4475return LLVMBuildNot(ctx->builder, LLVMBuildAnd(ctx->builder, exact, postponed, ""), "");4476}44774478LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args,4479unsigned num_args)4480{4481LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, "");4482LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func));4483return ret;4484}44854486void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,4487LLVMValueRef samplemask, struct ac_export_args *args)4488{4489unsigned mask = 0;4490unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL);44914492assert(depth || stencil || samplemask);44934494memset(args, 0, sizeof(*args));44954496args->valid_mask = 1; /* whether the EXEC mask is valid */4497args->done = 1; /* DONE bit */44984499/* Specify the target we are exporting */4500args->target = V_008DFC_SQ_EXP_MRTZ;45014502args->compr = 0; /* COMP flag */4503args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */4504args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */4505args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */4506args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */45074508if (format == V_028710_SPI_SHADER_UINT16_ABGR) {4509assert(!depth);4510args->compr = 1; /* COMPR flag */45114512if (stencil) {4513/* Stencil should be in X[23:16]. */4514stencil = ac_to_integer(ctx, stencil);4515stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");4516args->out[0] = ac_to_float(ctx, stencil);4517mask |= 0x3;4518}4519if (samplemask) {4520/* SampleMask should be in Y[15:0]. */4521args->out[1] = samplemask;4522mask |= 0xc;4523}4524} else {4525if (depth) {4526args->out[0] = depth;4527mask |= 0x1;4528}4529if (stencil) {4530args->out[1] = stencil;4531mask |= 0x2;4532}4533if (samplemask) {4534args->out[2] = samplemask;4535mask |= 0x4;4536}4537}45384539/* GFX6 (except OLAND and HAINAN) has a bug that it only looks4540* at the X writemask component. */4541if (ctx->chip_class == GFX6 && ctx->family != CHIP_OLAND && ctx->family != CHIP_HAINAN)4542mask |= 0x1;45434544/* Specify which components to enable */4545args->enabled_channels = mask;4546}45474548/* Send GS Alloc Req message from the first wave of the group to SPI.4549* Message payload is:4550* - bits 0..10: vertices in group4551* - bits 12..22: primitives in group4552*/4553void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,4554LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt)4555{4556LLVMBuilderRef builder = ctx->builder;4557LLVMValueRef tmp;4558bool export_dummy_prim = false;45594560/* HW workaround for a GPU hang with 100% culling.4561* We always have to export at least 1 primitive.4562* Export a degenerate triangle using vertex 0 for all 3 vertices.4563*/4564if (prim_cnt == ctx->i32_0 && ctx->chip_class == GFX10) {4565assert(vtx_cnt == ctx->i32_0);4566prim_cnt = ctx->i32_1;4567vtx_cnt = ctx->i32_1;4568export_dummy_prim = true;4569}45704571ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);45724573tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), "");4574tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");4575ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp);45764577if (export_dummy_prim) {4578struct ac_ngg_prim prim = {0};4579/* The vertex indices are 0,0,0. */4580prim.passthrough = ctx->i32_0;45814582struct ac_export_args pos = {0};4583/* The hw culls primitives with NaN. */4584pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = LLVMConstReal(ctx->f32, NAN);4585pos.target = V_008DFC_SQ_EXP_POS;4586pos.enabled_channels = 0xf;4587pos.done = true;45884589ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""),45905021);4591ac_build_export_prim(ctx, &prim);4592ac_build_export(ctx, &pos);4593ac_build_endif(ctx, 5021);4594}45954596ac_build_endif(ctx, 5020);4597}45984599LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)4600{4601/* The prim export format is:4602* - bits 0..8: index 04603* - bit 9: edge flag 04604* - bits 10..18: index 14605* - bit 19: edge flag 14606* - bits 20..28: index 24607* - bit 29: edge flag 24608* - bit 31: null primitive (skip)4609*/4610LLVMBuilderRef builder = ctx->builder;4611LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, "");4612LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), "");46134614for (unsigned i = 0; i < prim->num_vertices; ++i) {4615tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), "");4616result = LLVMBuildOr(builder, result, tmp, "");4617tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->i32, "");4618tmp = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 10 * i + 9, false), "");4619result = LLVMBuildOr(builder, result, tmp, "");4620}4621return result;4622}46234624void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)4625{4626struct ac_export_args args;46274628if (prim->passthrough) {4629args.out[0] = prim->passthrough;4630} else {4631args.out[0] = ac_pack_prim_export(ctx, prim);4632}46334634args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, "");4635args.out[1] = LLVMGetUndef(ctx->f32);4636args.out[2] = LLVMGetUndef(ctx->f32);4637args.out[3] = LLVMGetUndef(ctx->f32);46384639args.target = V_008DFC_SQ_EXP_PRIM;4640args.enabled_channels = 1;4641args.done = true;4642args.valid_mask = false;4643args.compr = false;46444645ac_build_export(ctx, &args);4646}46474648static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)4649{4650if (type == AC_ARG_FLOAT) {4651return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);4652} else if (type == AC_ARG_INT) {4653return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);4654} else {4655LLVMTypeRef ptr_type;4656switch (type) {4657case AC_ARG_CONST_PTR:4658ptr_type = ctx->i8;4659break;4660case AC_ARG_CONST_FLOAT_PTR:4661ptr_type = ctx->f32;4662break;4663case AC_ARG_CONST_PTR_PTR:4664ptr_type = ac_array_in_const32_addr_space(ctx->i8);4665break;4666case AC_ARG_CONST_DESC_PTR:4667ptr_type = ctx->v4i32;4668break;4669case AC_ARG_CONST_IMAGE_PTR:4670ptr_type = ctx->v8i32;4671break;4672default:4673unreachable("unknown arg type");4674}4675if (size == 1) {4676return ac_array_in_const32_addr_space(ptr_type);4677} else {4678assert(size == 2);4679return ac_array_in_const_addr_space(ptr_type);4680}4681}4682}46834684LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,4685enum ac_llvm_calling_convention convention, const char *name,4686LLVMTypeRef ret_type, LLVMModuleRef module)4687{4688LLVMTypeRef arg_types[AC_MAX_ARGS];46894690for (unsigned i = 0; i < args->arg_count; i++) {4691arg_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);4692}46934694LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, args->arg_count, 0);46954696LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);4697LLVMBasicBlockRef main_function_body =4698LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");4699LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);47004701LLVMSetFunctionCallConv(main_function, convention);4702for (unsigned i = 0; i < args->arg_count; ++i) {4703LLVMValueRef P = LLVMGetParam(main_function, i);47044705if (args->args[i].file != AC_ARG_SGPR)4706continue;47074708ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG);47094710if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {4711ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);4712ac_add_attr_dereferenceable(P, UINT64_MAX);4713ac_add_attr_alignment(P, 4);4714}4715}47164717ctx->main_function = main_function;47184719/* Enable denormals for FP16 and FP64: */4720LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");4721/* Disable denormals for FP32: */4722LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",4723"preserve-sign,preserve-sign");4724return main_function;4725}47264727void ac_build_s_endpgm(struct ac_llvm_context *ctx)4728{4729LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);4730LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);4731LLVMBuildCall(ctx->builder, code, NULL, 0, "");4732}47334734/**4735* Convert triangle strip indices to triangle indices. This is used to decompose4736* triangle strips into triangles.4737*/4738void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd,4739LLVMValueRef flatshade_first,4740LLVMValueRef index[3])4741{4742LLVMBuilderRef builder = ctx->builder;4743LLVMValueRef out[3];47444745/* We need to change the vertex order for odd triangles to get correct4746* front/back facing by swapping 2 vertex indices, but we also have to4747* keep the provoking vertex in the same place.4748*4749* If the first vertex is provoking, swap index 1 and 2.4750* If the last vertex is provoking, swap index 0 and 1.4751*/4752out[0] = LLVMBuildSelect(builder, flatshade_first, index[0],4753LLVMBuildSelect(builder, is_odd, index[1], index[0], ""), "");4754out[1] = LLVMBuildSelect(builder, flatshade_first,4755LLVMBuildSelect(builder, is_odd, index[2], index[1], ""),4756LLVMBuildSelect(builder, is_odd, index[0], index[1], ""), "");4757out[2] = LLVMBuildSelect(builder, flatshade_first,4758LLVMBuildSelect(builder, is_odd, index[1], index[2], ""), index[2], "");4759memcpy(index, out, sizeof(out));4760}476147624763