Path: blob/21.2-virgl/src/amd/vulkan/radv_nir_to_llvm.c
7204 views
/*1* Copyright © 2016 Red Hat.2* Copyright © 2016 Bas Nieuwenhuizen3*4* based in part on anv driver which is:5* Copyright © 2015 Intel Corporation6*7* Permission is hereby granted, free of charge, to any person obtaining a8* copy of this software and associated documentation files (the "Software"),9* to deal in the Software without restriction, including without limitation10* the rights to use, copy, modify, merge, publish, distribute, sublicense,11* and/or sell copies of the Software, and to permit persons to whom the12* Software is furnished to do so, subject to the following conditions:13*14* The above copyright notice and this permission notice (including the next15* paragraph) shall be included in all copies or substantial portions of the16* Software.17*18* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR19* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,20* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL21* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER22* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING23* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS24* IN THE SOFTWARE.25*/2627#include "nir/nir.h"28#include "radv_debug.h"29#include "radv_private.h"30#include "radv_shader.h"31#include "radv_shader_args.h"32#include "radv_shader_helper.h"3334#include "ac_binary.h"35#include "ac_exp_param.h"36#include "ac_llvm_build.h"37#include "ac_nir_to_llvm.h"38#include "ac_shader_abi.h"39#include "ac_shader_util.h"40#include "sid.h"4142#define RADEON_LLVM_MAX_INPUTS (VARYING_SLOT_VAR31 + 1)4344struct radv_shader_context {45struct ac_llvm_context ac;46const struct nir_shader *shader;47struct ac_shader_abi abi;48const struct radv_shader_args *args;4950gl_shader_stage stage;5152unsigned max_workgroup_size;53LLVMContextRef context;54LLVMValueRef main_function;5556LLVMValueRef descriptor_sets[MAX_SETS];5758LLVMValueRef ring_offsets;5960LLVMValueRef vs_rel_patch_id;6162LLVMValueRef gs_wave_id;63LLVMValueRef gs_vtx_offset[6];6465LLVMValueRef esgs_ring;66LLVMValueRef gsvs_ring[4];67LLVMValueRef hs_ring_tess_offchip;68LLVMValueRef hs_ring_tess_factor;6970LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];7172uint64_t output_mask;7374LLVMValueRef gs_next_vertex[4];75LLVMValueRef gs_curprim_verts[4];76LLVMValueRef gs_generated_prims[4];77LLVMValueRef gs_ngg_emit;78LLVMValueRef gs_ngg_scratch;7980LLVMValueRef vertexptr; /* GFX10 only */81};8283struct radv_shader_output_values {84LLVMValueRef values[4];85unsigned slot_name;86unsigned slot_index;87unsigned usage_mask;88};8990static inline struct radv_shader_context *91radv_shader_context_from_abi(struct ac_shader_abi *abi)92{93return container_of(abi, struct radv_shader_context, abi);94}9596static LLVMValueRef97create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module, LLVMBuilderRef builder,98const struct ac_shader_args *args, enum ac_llvm_calling_convention convention,99unsigned max_workgroup_size, const struct radv_nir_compiler_options *options)100{101LLVMValueRef main_function = ac_build_main(args, ctx, convention, "main", ctx->voidt, module);102103if (options->address32_hi) {104ac_llvm_add_target_dep_function_attr(main_function, "amdgpu-32bit-address-high-bits",105options->address32_hi);106}107108ac_llvm_set_workgroup_size(main_function, max_workgroup_size);109ac_llvm_set_target_features(main_function, ctx);110111return main_function;112}113114static void115load_descriptor_sets(struct radv_shader_context *ctx)116{117uint32_t mask = ctx->args->shader_info->desc_set_used_mask;118if (ctx->args->shader_info->need_indirect_descriptor_sets) {119LLVMValueRef desc_sets = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]);120while (mask) {121int i = u_bit_scan(&mask);122123ctx->descriptor_sets[i] =124ac_build_load_to_sgpr(&ctx->ac, desc_sets, LLVMConstInt(ctx->ac.i32, i, false));125LLVMSetAlignment(ctx->descriptor_sets[i], 4);126}127} else {128while (mask) {129int i = u_bit_scan(&mask);130131ctx->descriptor_sets[i] = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[i]);132}133}134}135136static enum ac_llvm_calling_convention137get_llvm_calling_convention(LLVMValueRef func, gl_shader_stage stage)138{139switch (stage) {140case MESA_SHADER_VERTEX:141case MESA_SHADER_TESS_EVAL:142return AC_LLVM_AMDGPU_VS;143break;144case MESA_SHADER_GEOMETRY:145return AC_LLVM_AMDGPU_GS;146break;147case MESA_SHADER_TESS_CTRL:148return AC_LLVM_AMDGPU_HS;149break;150case MESA_SHADER_FRAGMENT:151return AC_LLVM_AMDGPU_PS;152break;153case MESA_SHADER_COMPUTE:154return AC_LLVM_AMDGPU_CS;155break;156default:157unreachable("Unhandle shader type");158}159}160161/* Returns whether the stage is a stage that can be directly before the GS */162static bool163is_pre_gs_stage(gl_shader_stage stage)164{165return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;166}167168static void169create_function(struct radv_shader_context *ctx, gl_shader_stage stage, bool has_previous_stage)170{171if (ctx->ac.chip_class >= GFX10) {172if (is_pre_gs_stage(stage) && ctx->args->options->key.vs_common_out.as_ngg) {173/* On GFX10, VS is merged into GS for NGG. */174stage = MESA_SHADER_GEOMETRY;175has_previous_stage = true;176}177}178179ctx->main_function =180create_llvm_function(&ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac,181get_llvm_calling_convention(ctx->main_function, stage),182ctx->max_workgroup_size, ctx->args->options);183184ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",185LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), NULL, 0,186AC_FUNC_ATTR_READNONE);187ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets,188ac_array_in_const_addr_space(ctx->ac.v4i32), "");189190load_descriptor_sets(ctx);191192if (stage == MESA_SHADER_TESS_CTRL ||193(stage == MESA_SHADER_VERTEX && ctx->args->options->key.vs_common_out.as_ls) ||194/* GFX9 has the ESGS ring buffer in LDS. */195(stage == MESA_SHADER_GEOMETRY && has_previous_stage)) {196ac_declare_lds_as_pointer(&ctx->ac);197}198}199200static LLVMValueRef201radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, unsigned desc_set,202unsigned binding)203{204struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);205LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set];206struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout;207struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;208unsigned base_offset = layout->binding[binding].offset;209LLVMValueRef offset, stride;210211if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||212layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {213unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +214layout->binding[binding].dynamic_offset_offset;215desc_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.push_constants);216base_offset = pipeline_layout->push_constant_size + 16 * idx;217stride = LLVMConstInt(ctx->ac.i32, 16, false);218} else219stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false);220221offset = LLVMConstInt(ctx->ac.i32, base_offset, false);222223if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {224offset = ac_build_imad(&ctx->ac, index, stride, offset);225}226227desc_ptr = LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.i32, "");228229LLVMValueRef res[] = {desc_ptr, offset, ctx->ac.i32_0};230return ac_build_gather_values(&ctx->ac, res, 3);231}232233static uint32_t234radv_get_sample_pos_offset(uint32_t num_samples)235{236uint32_t sample_pos_offset = 0;237238switch (num_samples) {239case 2:240sample_pos_offset = 1;241break;242case 4:243sample_pos_offset = 3;244break;245case 8:246sample_pos_offset = 7;247break;248default:249break;250}251return sample_pos_offset;252}253254static LLVMValueRef255load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)256{257struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);258259LLVMValueRef result;260LLVMValueRef index = LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false);261LLVMValueRef ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ring_offsets, &index, 1, "");262263ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ac_array_in_const_addr_space(ctx->ac.v2f32), "");264265uint32_t sample_pos_offset = radv_get_sample_pos_offset(ctx->args->options->key.fs.num_samples);266267sample_id = LLVMBuildAdd(ctx->ac.builder, sample_id,268LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), "");269result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);270271return result;272}273274static LLVMValueRef275load_sample_mask_in(struct ac_shader_abi *abi)276{277struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);278uint8_t log2_ps_iter_samples;279280if (ctx->args->shader_info->ps.uses_sample_shading) {281log2_ps_iter_samples = util_logbase2(ctx->args->options->key.fs.num_samples);282} else {283log2_ps_iter_samples = ctx->args->options->key.fs.log2_ps_iter_samples;284}285286LLVMValueRef result, sample_id;287if (log2_ps_iter_samples) {288/* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). */289sample_id = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.ancillary), 8, 4);290sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, 1, false), sample_id, "");291result = LLVMBuildAnd(ctx->ac.builder, sample_id,292ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage), "");293} else {294result = ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage);295}296297return result;298}299300static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream,301LLVMValueRef vertexidx, LLVMValueRef *addrs);302303static void304visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef vertexidx,305LLVMValueRef *addrs)306{307unsigned offset = 0;308struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);309310if (ctx->args->options->key.vs_common_out.as_ngg) {311gfx10_ngg_gs_emit_vertex(ctx, stream, vertexidx, addrs);312return;313}314315for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {316unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];317uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];318LLVMValueRef *out_ptr = &addrs[i * 4];319int length = util_last_bit(output_usage_mask);320321if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)322continue;323324for (unsigned j = 0; j < length; j++) {325if (!(output_usage_mask & (1 << j)))326continue;327328LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");329LLVMValueRef voffset =330LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out, false);331332offset++;333334voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, "");335voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");336337out_val = ac_to_integer(&ctx->ac, out_val);338out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");339340ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset,341ac_get_arg(&ctx->ac, ctx->args->ac.gs2vs_offset), 0,342ac_glc | ac_slc | ac_swizzled);343}344}345346ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),347ctx->gs_wave_id);348}349350static void351visit_end_primitive(struct ac_shader_abi *abi, unsigned stream)352{353struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);354355if (ctx->args->options->key.vs_common_out.as_ngg) {356LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);357return;358}359360ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),361ctx->gs_wave_id);362}363364static LLVMValueRef365load_tess_coord(struct ac_shader_abi *abi)366{367struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);368369LLVMValueRef coord[4] = {370ac_get_arg(&ctx->ac, ctx->args->ac.tes_u),371ac_get_arg(&ctx->ac, ctx->args->ac.tes_v),372ctx->ac.f32_0,373ctx->ac.f32_0,374};375376if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES)377coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,378LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), "");379380return ac_build_gather_values(&ctx->ac, coord, 3);381}382383static LLVMValueRef384load_ring_tess_factors(struct ac_shader_abi *abi)385{386struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);387assert(ctx->stage == MESA_SHADER_TESS_CTRL);388389return ctx->hs_ring_tess_factor;390}391392static LLVMValueRef393load_ring_tess_offchip(struct ac_shader_abi *abi)394{395struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);396assert(ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL);397398return ctx->hs_ring_tess_offchip;399}400401static LLVMValueRef402load_ring_esgs(struct ac_shader_abi *abi)403{404struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);405assert(ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL ||406ctx->stage == MESA_SHADER_GEOMETRY);407408return ctx->esgs_ring;409}410411static LLVMValueRef412radv_load_base_vertex(struct ac_shader_abi *abi, bool non_indexed_is_zero)413{414struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);415return ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex);416}417418static LLVMValueRef419get_desc_ptr(struct radv_shader_context *ctx, LLVMValueRef ptr, bool non_uniform)420{421LLVMValueRef set_ptr = ac_llvm_extract_elem(&ctx->ac, ptr, 0);422LLVMValueRef offset = ac_llvm_extract_elem(&ctx->ac, ptr, 1);423ptr = LLVMBuildNUWAdd(ctx->ac.builder, set_ptr, offset, "");424425unsigned addr_space = AC_ADDR_SPACE_CONST_32BIT;426if (non_uniform) {427/* 32-bit seems to always use SMEM. addrspacecast from 32-bit -> 64-bit is broken. */428LLVMValueRef dwords[] = {ptr,429LLVMConstInt(ctx->ac.i32, ctx->args->options->address32_hi, false)};430ptr = ac_build_gather_values(&ctx->ac, dwords, 2);431ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, "");432addr_space = AC_ADDR_SPACE_CONST;433}434return LLVMBuildIntToPtr(ctx->ac.builder, ptr, LLVMPointerType(ctx->ac.v4i32, addr_space), "");435}436437static LLVMValueRef438radv_load_ssbo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr, bool write, bool non_uniform)439{440struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);441LLVMValueRef result;442443buffer_ptr = get_desc_ptr(ctx, buffer_ptr, non_uniform);444if (!non_uniform)445LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);446447result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");448LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);449LLVMSetAlignment(result, 4);450451return result;452}453454static LLVMValueRef455radv_load_ubo(struct ac_shader_abi *abi, unsigned desc_set, unsigned binding, bool valid_binding,456LLVMValueRef buffer_ptr)457{458struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);459LLVMValueRef result;460461if (valid_binding) {462struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout;463struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;464465if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {466LLVMValueRef set_ptr = ac_llvm_extract_elem(&ctx->ac, buffer_ptr, 0);467LLVMValueRef offset = ac_llvm_extract_elem(&ctx->ac, buffer_ptr, 1);468buffer_ptr = LLVMBuildNUWAdd(ctx->ac.builder, set_ptr, offset, "");469470uint32_t desc_type =471S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |472S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);473474if (ctx->ac.chip_class >= GFX10) {475desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |476S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);477} else {478desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |479S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);480}481482LLVMValueRef desc_components[4] = {483LLVMBuildPtrToInt(ctx->ac.builder, buffer_ptr, ctx->ac.intptr, ""),484LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->args->options->address32_hi),485false),486LLVMConstInt(ctx->ac.i32, 0xffffffff, false),487LLVMConstInt(ctx->ac.i32, desc_type, false),488};489490return ac_build_gather_values(&ctx->ac, desc_components, 4);491}492}493494buffer_ptr = get_desc_ptr(ctx, buffer_ptr, false);495LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);496497result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");498LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);499LLVMSetAlignment(result, 4);500501return result;502}503504static LLVMValueRef505radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsigned base_index,506unsigned constant_index, LLVMValueRef index,507enum ac_descriptor_type desc_type, bool image, bool write, bool bindless)508{509struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);510LLVMValueRef list = ctx->descriptor_sets[descriptor_set];511struct radv_descriptor_set_layout *layout =512ctx->args->options->layout->set[descriptor_set].layout;513struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;514unsigned offset = binding->offset;515unsigned stride = binding->size;516unsigned type_size;517LLVMBuilderRef builder = ctx->ac.builder;518LLVMTypeRef type;519520assert(base_index < layout->binding_count);521522switch (desc_type) {523case AC_DESC_IMAGE:524type = ctx->ac.v8i32;525type_size = 32;526break;527case AC_DESC_FMASK:528type = ctx->ac.v8i32;529offset += 32;530type_size = 32;531break;532case AC_DESC_SAMPLER:533type = ctx->ac.v4i32;534if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {535offset += radv_combined_image_descriptor_sampler_offset(binding);536}537538type_size = 16;539break;540case AC_DESC_BUFFER:541type = ctx->ac.v4i32;542type_size = 16;543break;544case AC_DESC_PLANE_0:545case AC_DESC_PLANE_1:546case AC_DESC_PLANE_2:547type = ctx->ac.v8i32;548type_size = 32;549offset += 32 * (desc_type - AC_DESC_PLANE_0);550break;551default:552unreachable("invalid desc_type\n");553}554555offset += constant_index * stride;556557if (desc_type == AC_DESC_SAMPLER && binding->immutable_samplers_offset &&558(!index || binding->immutable_samplers_equal)) {559if (binding->immutable_samplers_equal)560constant_index = 0;561562const uint32_t *samplers = radv_immutable_samplers(layout, binding);563564LLVMValueRef constants[] = {565LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 0], 0),566LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 1], 0),567LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 2], 0),568LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 3], 0),569};570return ac_build_gather_values(&ctx->ac, constants, 4);571}572573assert(stride % type_size == 0);574575LLVMValueRef adjusted_index = index;576if (!adjusted_index)577adjusted_index = ctx->ac.i32_0;578579adjusted_index =580LLVMBuildMul(builder, adjusted_index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");581582LLVMValueRef val_offset = LLVMConstInt(ctx->ac.i32, offset, 0);583list = LLVMBuildGEP(builder, list, &val_offset, 1, "");584list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(type), "");585586LLVMValueRef descriptor = ac_build_load_to_sgpr(&ctx->ac, list, adjusted_index);587588/* 3 plane formats always have same size and format for plane 1 & 2, so589* use the tail from plane 1 so that we can store only the first 16 bytes590* of the last plane. */591if (desc_type == AC_DESC_PLANE_2) {592LLVMValueRef descriptor2 =593radv_get_sampler_desc(abi, descriptor_set, base_index, constant_index, index,594AC_DESC_PLANE_1, image, write, bindless);595596LLVMValueRef components[8];597for (unsigned i = 0; i < 4; ++i)598components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);599600for (unsigned i = 4; i < 8; ++i)601components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i);602descriptor = ac_build_gather_values(&ctx->ac, components, 8);603} else if (desc_type == AC_DESC_IMAGE &&604ctx->args->options->has_image_load_dcc_bug &&605image && !write) {606LLVMValueRef components[8];607608for (unsigned i = 0; i < 8; i++)609components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);610611/* WRITE_COMPRESS_ENABLE must be 0 for all image loads to workaround a hardware bug. */612components[6] = LLVMBuildAnd(ctx->ac.builder, components[6],613LLVMConstInt(ctx->ac.i32, C_00A018_WRITE_COMPRESS_ENABLE, false), "");614615descriptor = ac_build_gather_values(&ctx->ac, components, 8);616}617618return descriptor;619}620621/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.622* so we may need to fix it up. */623static LLVMValueRef624adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, unsigned adjustment, LLVMValueRef alpha)625{626if (adjustment == AC_FETCH_FORMAT_NONE)627return alpha;628629LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);630631alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, "");632633if (adjustment == AC_FETCH_FORMAT_SSCALED)634alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");635else636alpha = ac_to_integer(&ctx->ac, alpha);637638/* For the integer-like cases, do a natural sign extension.639*640* For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0641* and happen to contain 0, 1, 2, 3 as the two LSBs of the642* exponent.643*/644alpha =645LLVMBuildShl(ctx->ac.builder, alpha,646adjustment == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");647alpha = LLVMBuildAShr(ctx->ac.builder, alpha, c30, "");648649/* Convert back to the right type. */650if (adjustment == AC_FETCH_FORMAT_SNORM) {651LLVMValueRef clamp;652LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);653alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");654clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, alpha, neg_one, "");655alpha = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, alpha, "");656} else if (adjustment == AC_FETCH_FORMAT_SSCALED) {657alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");658}659660return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, "");661}662663static LLVMValueRef664radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value,665unsigned num_channels, bool is_float)666{667LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;668LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;669LLVMValueRef chan[4];670671if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {672unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));673674if (num_channels == 4 && num_channels == vec_size)675return value;676677num_channels = MIN2(num_channels, vec_size);678679for (unsigned i = 0; i < num_channels; i++)680chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);681} else {682assert(num_channels == 1);683chan[0] = value;684}685686for (unsigned i = num_channels; i < 4; i++) {687chan[i] = i == 3 ? one : zero;688chan[i] = ac_to_integer(&ctx->ac, chan[i]);689}690691return ac_build_gather_values(&ctx->ac, chan, 4);692}693694static void695handle_vs_input_decl(struct radv_shader_context *ctx, struct nir_variable *variable)696{697LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.vertex_buffers);698LLVMValueRef t_offset;699LLVMValueRef t_list;700LLVMValueRef input;701LLVMValueRef buffer_index;702unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);703704enum glsl_base_type type = glsl_get_base_type(variable->type);705for (unsigned i = 0; i < attrib_count; ++i) {706LLVMValueRef output[4];707unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0;708unsigned attrib_format = ctx->args->options->key.vs.vertex_attribute_formats[attrib_index];709unsigned data_format = attrib_format & 0x0f;710unsigned num_format = (attrib_format >> 4) & 0x07;711bool is_float =712num_format != V_008F0C_BUF_NUM_FORMAT_UINT && num_format != V_008F0C_BUF_NUM_FORMAT_SINT;713uint8_t input_usage_mask =714ctx->args->shader_info->vs.input_usage_mask[variable->data.location + i];715unsigned num_input_channels = util_last_bit(input_usage_mask);716717if (num_input_channels == 0)718continue;719720if (ctx->args->options->key.vs.instance_rate_inputs & (1u << attrib_index)) {721uint32_t divisor = ctx->args->options->key.vs.instance_rate_divisors[attrib_index];722723if (divisor) {724buffer_index = ctx->abi.instance_id;725726if (divisor != 1) {727buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index,728LLVMConstInt(ctx->ac.i32, divisor, 0), "");729}730} else {731buffer_index = ctx->ac.i32_0;732}733734buffer_index = LLVMBuildAdd(735ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.start_instance), buffer_index, "");736} else {737buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,738ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex), "");739}740741const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format);742743/* Adjust the number of channels to load based on the vertex744* attribute format.745*/746unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels);747unsigned attrib_binding = ctx->args->options->key.vs.vertex_attribute_bindings[attrib_index];748unsigned attrib_offset = ctx->args->options->key.vs.vertex_attribute_offsets[attrib_index];749unsigned attrib_stride = ctx->args->options->key.vs.vertex_attribute_strides[attrib_index];750unsigned alpha_adjust = ctx->args->options->key.vs.alpha_adjust[attrib_index];751752if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) {753/* Always load, at least, 3 channels for formats that754* need to be shuffled because X<->Z.755*/756num_channels = MAX2(num_channels, 3);757}758759unsigned desc_index =760ctx->args->shader_info->vs.use_per_attribute_vb_descs ? attrib_index : attrib_binding;761desc_index = util_bitcount(ctx->args->shader_info->vs.vb_desc_usage_mask &762u_bit_consecutive(0, desc_index));763t_offset = LLVMConstInt(ctx->ac.i32, desc_index, false);764t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);765766/* Always split typed vertex buffer loads on GFX6 and GFX10+767* to avoid any alignment issues that triggers memory768* violations and eventually a GPU hang. This can happen if769* the stride (static or dynamic) is unaligned and also if the770* VBO offset is aligned to a scalar (eg. stride is 8 and VBO771* offset is 2 for R16G16B16A16_SNORM).772*/773if (ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10) {774unsigned chan_format = vtx_info->chan_format;775LLVMValueRef values[4];776777assert(ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10);778779for (unsigned chan = 0; chan < num_channels; chan++) {780unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size;781LLVMValueRef chan_index = buffer_index;782783if (attrib_stride != 0 && chan_offset > attrib_stride) {784LLVMValueRef buffer_offset =785LLVMConstInt(ctx->ac.i32, chan_offset / attrib_stride, false);786787chan_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");788789chan_offset = chan_offset % attrib_stride;790}791792values[chan] = ac_build_struct_tbuffer_load(793&ctx->ac, t_list, chan_index, LLVMConstInt(ctx->ac.i32, chan_offset, false),794ctx->ac.i32_0, ctx->ac.i32_0, 1, chan_format, num_format, 0, true);795}796797input = ac_build_gather_values(&ctx->ac, values, num_channels);798} else {799if (attrib_stride != 0 && attrib_offset > attrib_stride) {800LLVMValueRef buffer_offset =801LLVMConstInt(ctx->ac.i32, attrib_offset / attrib_stride, false);802803buffer_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");804805attrib_offset = attrib_offset % attrib_stride;806}807808input = ac_build_struct_tbuffer_load(809&ctx->ac, t_list, buffer_index, LLVMConstInt(ctx->ac.i32, attrib_offset, false),810ctx->ac.i32_0, ctx->ac.i32_0, num_channels, data_format, num_format, 0, true);811}812813if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) {814LLVMValueRef c[4];815c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2);816c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1);817c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0);818c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3);819820input = ac_build_gather_values(&ctx->ac, c, 4);821}822823input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float);824825for (unsigned chan = 0; chan < 4; chan++) {826LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);827output[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, "");828if (type == GLSL_TYPE_FLOAT16) {829output[chan] = LLVMBuildBitCast(ctx->ac.builder, output[chan], ctx->ac.f32, "");830output[chan] = LLVMBuildFPTrunc(ctx->ac.builder, output[chan], ctx->ac.f16, "");831}832}833834output[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, output[3]);835836for (unsigned chan = 0; chan < 4; chan++) {837output[chan] = ac_to_integer(&ctx->ac, output[chan]);838if (type == GLSL_TYPE_UINT16 || type == GLSL_TYPE_INT16)839output[chan] = LLVMBuildTrunc(ctx->ac.builder, output[chan], ctx->ac.i16, "");840841ctx->inputs[ac_llvm_reg_index_soa(variable->data.location + i, chan)] = output[chan];842}843}844}845846static void847handle_vs_inputs(struct radv_shader_context *ctx, struct nir_shader *nir)848{849nir_foreach_shader_in_variable (variable, nir)850handle_vs_input_decl(ctx, variable);851}852853static void854prepare_interp_optimize(struct radv_shader_context *ctx, struct nir_shader *nir)855{856bool uses_center = false;857bool uses_centroid = false;858nir_foreach_shader_in_variable (variable, nir) {859if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||860variable->data.sample)861continue;862863if (variable->data.centroid)864uses_centroid = true;865else866uses_center = true;867}868869ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.persp_centroid);870ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.linear_centroid);871872if (uses_center && uses_centroid) {873LLVMValueRef sel =874LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ac_get_arg(&ctx->ac, ctx->args->ac.prim_mask),875ctx->ac.i32_0, "");876ctx->abi.persp_centroid =877LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.persp_center),878ctx->abi.persp_centroid, "");879ctx->abi.linear_centroid =880LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.linear_center),881ctx->abi.linear_centroid, "");882}883}884885static void886scan_shader_output_decl(struct radv_shader_context *ctx, struct nir_variable *variable,887struct nir_shader *shader, gl_shader_stage stage)888{889int idx = variable->data.driver_location;890unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);891uint64_t mask_attribs;892893if (variable->data.compact) {894unsigned component_count = variable->data.location_frac + glsl_get_length(variable->type);895attrib_count = (component_count + 3) / 4;896}897898mask_attribs = ((1ull << attrib_count) - 1) << idx;899900ctx->output_mask |= mask_attribs;901}902903/* Initialize arguments for the shader export intrinsic */904static void905si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef *values,906unsigned enabled_channels, unsigned target, struct ac_export_args *args)907{908/* Specify the channels that are enabled. */909args->enabled_channels = enabled_channels;910911/* Specify whether the EXEC mask represents the valid mask */912args->valid_mask = 0;913914/* Specify whether this is the last export */915args->done = 0;916917/* Specify the target we are exporting */918args->target = target;919920args->compr = false;921args->out[0] = LLVMGetUndef(ctx->ac.f32);922args->out[1] = LLVMGetUndef(ctx->ac.f32);923args->out[2] = LLVMGetUndef(ctx->ac.f32);924args->out[3] = LLVMGetUndef(ctx->ac.f32);925926if (!values)927return;928929bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;930if (ctx->stage == MESA_SHADER_FRAGMENT) {931unsigned index = target - V_008DFC_SQ_EXP_MRT;932unsigned col_format = (ctx->args->options->key.fs.col_format >> (4 * index)) & 0xf;933bool is_int8 = (ctx->args->options->key.fs.is_int8 >> index) & 1;934bool is_int10 = (ctx->args->options->key.fs.is_int10 >> index) & 1;935936LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;937LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,938bool hi) = NULL;939940switch (col_format) {941case V_028714_SPI_SHADER_ZERO:942args->enabled_channels = 0; /* writemask */943args->target = V_008DFC_SQ_EXP_NULL;944break;945946case V_028714_SPI_SHADER_32_R:947args->enabled_channels = 1;948args->out[0] = values[0];949break;950951case V_028714_SPI_SHADER_32_GR:952args->enabled_channels = 0x3;953args->out[0] = values[0];954args->out[1] = values[1];955break;956957case V_028714_SPI_SHADER_32_AR:958if (ctx->ac.chip_class >= GFX10) {959args->enabled_channels = 0x3;960args->out[0] = values[0];961args->out[1] = values[3];962} else {963args->enabled_channels = 0x9;964args->out[0] = values[0];965args->out[3] = values[3];966}967break;968969case V_028714_SPI_SHADER_FP16_ABGR:970args->enabled_channels = 0xf;971packf = ac_build_cvt_pkrtz_f16;972if (is_16bit) {973for (unsigned chan = 0; chan < 4; chan++)974values[chan] = LLVMBuildFPExt(ctx->ac.builder, values[chan], ctx->ac.f32, "");975}976break;977978case V_028714_SPI_SHADER_UNORM16_ABGR:979args->enabled_channels = 0xf;980packf = ac_build_cvt_pknorm_u16;981break;982983case V_028714_SPI_SHADER_SNORM16_ABGR:984args->enabled_channels = 0xf;985packf = ac_build_cvt_pknorm_i16;986break;987988case V_028714_SPI_SHADER_UINT16_ABGR:989args->enabled_channels = 0xf;990packi = ac_build_cvt_pk_u16;991if (is_16bit) {992for (unsigned chan = 0; chan < 4; chan++)993values[chan] = LLVMBuildZExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),994ctx->ac.i32, "");995}996break;997998case V_028714_SPI_SHADER_SINT16_ABGR:999args->enabled_channels = 0xf;1000packi = ac_build_cvt_pk_i16;1001if (is_16bit) {1002for (unsigned chan = 0; chan < 4; chan++)1003values[chan] = LLVMBuildSExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),1004ctx->ac.i32, "");1005}1006break;10071008default:1009case V_028714_SPI_SHADER_32_ABGR:1010memcpy(&args->out[0], values, sizeof(values[0]) * 4);1011break;1012}10131014/* Replace NaN by zero (only 32-bit) to fix game bugs if1015* requested.1016*/1017if (ctx->args->options->enable_mrt_output_nan_fixup && !is_16bit &&1018(col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||1019col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||1020col_format == V_028714_SPI_SHADER_FP16_ABGR)) {1021for (unsigned i = 0; i < 4; i++) {1022LLVMValueRef class_args[2] = {values[i],1023LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN, false)};1024LLVMValueRef isnan = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1,1025class_args, 2, AC_FUNC_ATTR_READNONE);1026values[i] = LLVMBuildSelect(ctx->ac.builder, isnan, ctx->ac.f32_0, values[i], "");1027}1028}10291030/* Pack f16 or norm_i16/u16. */1031if (packf) {1032for (unsigned chan = 0; chan < 2; chan++) {1033LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};1034LLVMValueRef packed;10351036packed = packf(&ctx->ac, pack_args);1037args->out[chan] = ac_to_float(&ctx->ac, packed);1038}1039args->compr = 1; /* COMPR flag */1040}10411042/* Pack i16/u16. */1043if (packi) {1044for (unsigned chan = 0; chan < 2; chan++) {1045LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),1046ac_to_integer(&ctx->ac, values[2 * chan + 1])};1047LLVMValueRef packed;10481049packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);1050args->out[chan] = ac_to_float(&ctx->ac, packed);1051}1052args->compr = 1; /* COMPR flag */1053}1054return;1055}10561057if (is_16bit) {1058for (unsigned chan = 0; chan < 4; chan++) {1059values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, "");1060args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, "");1061}1062} else1063memcpy(&args->out[0], values, sizeof(values[0]) * 4);10641065for (unsigned i = 0; i < 4; ++i)1066args->out[i] = ac_to_float(&ctx->ac, args->out[i]);1067}10681069static void1070radv_export_param(struct radv_shader_context *ctx, unsigned index, LLVMValueRef *values,1071unsigned enabled_channels)1072{1073struct ac_export_args args;10741075si_llvm_init_export_args(ctx, values, enabled_channels, V_008DFC_SQ_EXP_PARAM + index, &args);1076ac_build_export(&ctx->ac, &args);1077}10781079static LLVMValueRef1080radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)1081{1082LLVMValueRef output = ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)];1083return LLVMBuildLoad(ctx->ac.builder, output, "");1084}10851086static void1087radv_emit_stream_output(struct radv_shader_context *ctx, LLVMValueRef const *so_buffers,1088LLVMValueRef const *so_write_offsets,1089const struct radv_stream_output *output,1090struct radv_shader_output_values *shader_out)1091{1092unsigned num_comps = util_bitcount(output->component_mask);1093unsigned buf = output->buffer;1094unsigned offset = output->offset;1095unsigned start;1096LLVMValueRef out[4];10971098assert(num_comps && num_comps <= 4);1099if (!num_comps || num_comps > 4)1100return;11011102/* Get the first component. */1103start = ffs(output->component_mask) - 1;11041105/* Load the output as int. */1106for (int i = 0; i < num_comps; i++) {1107out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]);1108}11091110/* Pack the output. */1111LLVMValueRef vdata = NULL;11121113switch (num_comps) {1114case 1: /* as i32 */1115vdata = out[0];1116break;1117case 2: /* as v2i32 */1118case 3: /* as v4i32 (aligned to 4) */1119out[3] = LLVMGetUndef(ctx->ac.i32);1120FALLTHROUGH;1121case 4: /* as v4i32 */1122vdata = ac_build_gather_values(&ctx->ac, out,1123!ac_has_vec3_support(ctx->ac.chip_class, false)1124? util_next_power_of_two(num_comps)1125: num_comps);1126break;1127}11281129ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, num_comps, so_write_offsets[buf],1130ctx->ac.i32_0, offset, ac_glc | ac_slc);1131}11321133static void1134radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)1135{1136int i;11371138/* Get bits [22:16], i.e. (so_param >> 16) & 127; */1139assert(ctx->args->ac.streamout_config.used);1140LLVMValueRef so_vtx_count = ac_build_bfe(1141&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config),1142LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false);11431144LLVMValueRef tid = ac_get_thread_id(&ctx->ac);11451146/* can_emit = tid < so_vtx_count; */1147LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, so_vtx_count, "");11481149/* Emit the streamout code conditionally. This actually avoids1150* out-of-bounds buffer access. The hw tells us via the SGPR1151* (so_vtx_count) which threads are allowed to emit streamout data.1152*/1153ac_build_ifcc(&ctx->ac, can_emit, 6501);1154{1155/* The buffer offset is computed as follows:1156* ByteOffset = streamout_offset[buffer_id]*4 +1157* (streamout_write_index + thread_id)*stride[buffer_id] +1158* attrib_offset1159*/1160LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_write_index);11611162/* Compute (streamout_write_index + thread_id). */1163so_write_index = LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, "");11641165/* Load the descriptor and compute the write offset for each1166* enabled buffer.1167*/1168LLVMValueRef so_write_offset[4] = {0};1169LLVMValueRef so_buffers[4] = {0};1170LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);11711172for (i = 0; i < 4; i++) {1173uint16_t stride = ctx->args->shader_info->so.strides[i];11741175if (!stride)1176continue;11771178LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i, false);11791180so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);11811182LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_offset[i]);11831184so_offset =1185LLVMBuildMul(ctx->ac.builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, false), "");11861187so_write_offset[i] = ac_build_imad(1188&ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, stride * 4, false), so_offset);1189}11901191/* Write streamout data. */1192for (i = 0; i < ctx->args->shader_info->so.num_outputs; i++) {1193struct radv_shader_output_values shader_out = {0};1194struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i];11951196if (stream != output->stream)1197continue;11981199for (int j = 0; j < 4; j++) {1200shader_out.values[j] = radv_load_output(ctx, output->location, j);1201}12021203radv_emit_stream_output(ctx, so_buffers, so_write_offset, output, &shader_out);1204}1205}1206ac_build_endif(&ctx->ac, 6501);1207}12081209static void1210radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,1211unsigned noutput, struct radv_vs_output_info *outinfo,1212bool export_clip_dists)1213{1214unsigned param_count = 0;12151216for (unsigned i = 0; i < noutput; i++) {1217unsigned slot_name = outputs[i].slot_name;1218unsigned usage_mask = outputs[i].usage_mask;12191220if (slot_name != VARYING_SLOT_LAYER && slot_name != VARYING_SLOT_PRIMITIVE_ID &&1221slot_name != VARYING_SLOT_VIEWPORT && slot_name != VARYING_SLOT_CLIP_DIST0 &&1222slot_name != VARYING_SLOT_CLIP_DIST1 && slot_name < VARYING_SLOT_VAR0)1223continue;12241225if ((slot_name == VARYING_SLOT_CLIP_DIST0 || slot_name == VARYING_SLOT_CLIP_DIST1) &&1226!export_clip_dists)1227continue;12281229radv_export_param(ctx, param_count, outputs[i].values, usage_mask);12301231assert(i < ARRAY_SIZE(outinfo->vs_output_param_offset));1232outinfo->vs_output_param_offset[slot_name] = param_count++;1233}12341235outinfo->param_exports = param_count;1236}12371238/* Generate export instructions for hardware VS shader stage or NGG GS stage1239* (position and parameter data only).1240*/1241static void1242radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,1243unsigned noutput, struct radv_vs_output_info *outinfo, bool export_clip_dists)1244{1245LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_value = NULL;1246LLVMValueRef primitive_shading_rate = NULL;1247struct ac_export_args pos_args[4] = {0};1248unsigned pos_idx, index;1249int i;12501251/* Build position exports */1252for (i = 0; i < noutput; i++) {1253switch (outputs[i].slot_name) {1254case VARYING_SLOT_POS:1255si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS, &pos_args[0]);1256break;1257case VARYING_SLOT_PSIZ:1258psize_value = outputs[i].values[0];1259break;1260case VARYING_SLOT_LAYER:1261layer_value = outputs[i].values[0];1262break;1263case VARYING_SLOT_VIEWPORT:1264viewport_value = outputs[i].values[0];1265break;1266case VARYING_SLOT_PRIMITIVE_SHADING_RATE:1267primitive_shading_rate = outputs[i].values[0];1268break;1269case VARYING_SLOT_CLIP_DIST0:1270case VARYING_SLOT_CLIP_DIST1:1271index = 2 + outputs[i].slot_index;1272si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS + index,1273&pos_args[index]);1274break;1275default:1276break;1277}1278}12791280/* We need to add the position output manually if it's missing. */1281if (!pos_args[0].out[0]) {1282pos_args[0].enabled_channels = 0xf; /* writemask */1283pos_args[0].valid_mask = 0; /* EXEC mask */1284pos_args[0].done = 0; /* last export? */1285pos_args[0].target = V_008DFC_SQ_EXP_POS;1286pos_args[0].compr = 0; /* COMPR flag */1287pos_args[0].out[0] = ctx->ac.f32_0; /* X */1288pos_args[0].out[1] = ctx->ac.f32_0; /* Y */1289pos_args[0].out[2] = ctx->ac.f32_0; /* Z */1290pos_args[0].out[3] = ctx->ac.f32_1; /* W */1291}12921293bool writes_primitive_shading_rate = outinfo->writes_primitive_shading_rate ||1294ctx->args->options->force_vrs_rates;12951296if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_layer ||1297outinfo->writes_viewport_index || writes_primitive_shading_rate) {1298pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) |1299(writes_primitive_shading_rate == true ? 2 : 0) |1300(outinfo->writes_layer == true ? 4 : 0));1301pos_args[1].valid_mask = 0;1302pos_args[1].done = 0;1303pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;1304pos_args[1].compr = 0;1305pos_args[1].out[0] = ctx->ac.f32_0; /* X */1306pos_args[1].out[1] = ctx->ac.f32_0; /* Y */1307pos_args[1].out[2] = ctx->ac.f32_0; /* Z */1308pos_args[1].out[3] = ctx->ac.f32_0; /* W */13091310if (outinfo->writes_pointsize == true)1311pos_args[1].out[0] = psize_value;1312if (outinfo->writes_layer == true)1313pos_args[1].out[2] = layer_value;1314if (outinfo->writes_viewport_index == true) {1315if (ctx->args->options->chip_class >= GFX9) {1316/* GFX9 has the layer in out.z[10:0] and the viewport1317* index in out.z[19:16].1318*/1319LLVMValueRef v = viewport_value;1320v = ac_to_integer(&ctx->ac, v);1321v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, false), "");1322v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");13231324pos_args[1].out[2] = ac_to_float(&ctx->ac, v);1325pos_args[1].enabled_channels |= 1 << 2;1326} else {1327pos_args[1].out[3] = viewport_value;1328pos_args[1].enabled_channels |= 1 << 3;1329}1330}13311332if (outinfo->writes_primitive_shading_rate) {1333pos_args[1].out[1] = primitive_shading_rate;1334} else if (ctx->args->options->force_vrs_rates) {1335/* Bits [2:3] = VRS rate X1336* Bits [4:5] = VRS rate Y1337*1338* The range is [-2, 1]. Values:1339* 1: 2x coarser shading rate in that direction.1340* 0: normal shading rate1341* -1: 2x finer shading rate (sample shading, not directional)1342* -2: 4x finer shading rate (sample shading, not directional)1343*1344* Sample shading can't go above 8 samples, so both numbers can't be -2 at the same time.1345*/1346LLVMValueRef rates = LLVMConstInt(ctx->ac.i32, ctx->args->options->force_vrs_rates, false);1347LLVMValueRef cond;1348LLVMValueRef v;13491350/* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */1351cond = LLVMBuildFCmp(ctx->ac.builder, LLVMRealUNE, pos_args[0].out[3], ctx->ac.f32_1, "");1352v = LLVMBuildSelect(ctx->ac.builder, cond, rates, ctx->ac.i32_0, "");13531354pos_args[1].out[1] = ac_to_float(&ctx->ac, v);1355}1356}13571358for (i = 0; i < 4; i++) {1359if (pos_args[i].out[0])1360outinfo->pos_exports++;1361}13621363/* GFX10 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.1364* Setting valid_mask=1 prevents it and has no other effect.1365*/1366if (ctx->ac.chip_class == GFX10)1367pos_args[0].valid_mask = 1;13681369pos_idx = 0;1370for (i = 0; i < 4; i++) {1371if (!pos_args[i].out[0])1372continue;13731374/* Specify the target we are exporting */1375pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;13761377if (pos_idx == outinfo->pos_exports)1378/* Specify that this is the last export */1379pos_args[i].done = 1;13801381ac_build_export(&ctx->ac, &pos_args[i]);1382}13831384/* Build parameter exports */1385radv_build_param_exports(ctx, outputs, noutput, outinfo, export_clip_dists);1386}13871388static void1389handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, bool export_clip_dists,1390struct radv_vs_output_info *outinfo)1391{1392struct radv_shader_output_values *outputs;1393unsigned noutput = 0;13941395if (ctx->args->options->key.has_multiview_view_index) {1396LLVMValueRef *tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)];1397if (!*tmp_out) {1398for (unsigned i = 0; i < 4; ++i)1399ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, i)] =1400ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");1401}14021403LLVMValueRef view_index = ac_get_arg(&ctx->ac, ctx->args->ac.view_index);1404LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, view_index), *tmp_out);1405ctx->output_mask |= 1ull << VARYING_SLOT_LAYER;1406}14071408memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,1409sizeof(outinfo->vs_output_param_offset));1410outinfo->pos_exports = 0;14111412if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs &&1413!ctx->args->is_gs_copy_shader) {1414/* The GS copy shader emission already emits streamout. */1415radv_emit_streamout(ctx, 0);1416}14171418/* Allocate a temporary array for the output values. */1419unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_prim_id;1420outputs = malloc(num_outputs * sizeof(outputs[0]));14211422for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {1423if (!(ctx->output_mask & (1ull << i)))1424continue;14251426outputs[noutput].slot_name = i;1427outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;14281429if (ctx->stage == MESA_SHADER_VERTEX && !ctx->args->is_gs_copy_shader) {1430outputs[noutput].usage_mask = ctx->args->shader_info->vs.output_usage_mask[i];1431} else if (ctx->stage == MESA_SHADER_TESS_EVAL) {1432outputs[noutput].usage_mask = ctx->args->shader_info->tes.output_usage_mask[i];1433} else {1434assert(ctx->args->is_gs_copy_shader);1435outputs[noutput].usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];1436}14371438for (unsigned j = 0; j < 4; j++) {1439outputs[noutput].values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));1440}14411442noutput++;1443}14441445/* Export PrimitiveID. */1446if (export_prim_id) {1447outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID;1448outputs[noutput].slot_index = 0;1449outputs[noutput].usage_mask = 0x1;1450if (ctx->stage == MESA_SHADER_TESS_EVAL)1451outputs[noutput].values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id);1452else1453outputs[noutput].values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.vs_prim_id);1454for (unsigned j = 1; j < 4; j++)1455outputs[noutput].values[j] = ctx->ac.f32_0;1456noutput++;1457}14581459radv_llvm_export_vs(ctx, outputs, noutput, outinfo, export_clip_dists);14601461free(outputs);1462}14631464static LLVMValueRef1465get_wave_id_in_tg(struct radv_shader_context *ctx)1466{1467return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 24, 4);1468}14691470static LLVMValueRef1471get_tgsize(struct radv_shader_context *ctx)1472{1473return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 28, 4);1474}14751476static LLVMValueRef1477get_thread_id_in_tg(struct radv_shader_context *ctx)1478{1479LLVMBuilderRef builder = ctx->ac.builder;1480LLVMValueRef tmp;1481tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),1482LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");1483return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");1484}14851486static LLVMValueRef1487ngg_get_vtx_cnt(struct radv_shader_context *ctx)1488{1489return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info),1490LLVMConstInt(ctx->ac.i32, 12, false), LLVMConstInt(ctx->ac.i32, 9, false),1491false);1492}14931494static LLVMValueRef1495ngg_get_prim_cnt(struct radv_shader_context *ctx)1496{1497return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info),1498LLVMConstInt(ctx->ac.i32, 22, false), LLVMConstInt(ctx->ac.i32, 9, false),1499false);1500}15011502static LLVMValueRef1503ngg_get_ordered_id(struct radv_shader_context *ctx)1504{1505return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info), ctx->ac.i32_0,1506LLVMConstInt(ctx->ac.i32, 12, false), false);1507}15081509static LLVMValueRef1510ngg_gs_get_vertex_storage(struct radv_shader_context *ctx)1511{1512unsigned num_outputs = util_bitcount64(ctx->output_mask);15131514if (ctx->args->options->key.has_multiview_view_index)1515num_outputs++;15161517LLVMTypeRef elements[2] = {1518LLVMArrayType(ctx->ac.i32, 4 * num_outputs),1519LLVMArrayType(ctx->ac.i8, 4),1520};1521LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);1522type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);1523return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");1524}15251526/**1527* Return a pointer to the LDS storage reserved for the N'th vertex, where N1528* is in emit order; that is:1529* - during the epilogue, N is the threadidx (relative to the entire threadgroup)1530* - during vertex emit, i.e. while the API GS shader invocation is running,1531* N = threadidx * gs_max_out_vertices + emitidx1532*1533* Goals of the LDS memory layout:1534* 1. Eliminate bank conflicts on write for geometry shaders that have all emits1535* in uniform control flow1536* 2. Eliminate bank conflicts on read for export if, additionally, there is no1537* culling1538* 3. Agnostic to the number of waves (since we don't know it before compiling)1539* 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)1540* 5. Avoid wasting memory.1541*1542* We use an AoS layout due to point 4 (this also helps point 3). In an AoS1543* layout, elimination of bank conflicts requires that each vertex occupy an1544* odd number of dwords. We use the additional dword to store the output stream1545* index as well as a flag to indicate whether this vertex ends a primitive1546* for rasterization.1547*1548* Swizzling is required to satisfy points 1 and 2 simultaneously.1549*1550* Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).1551* Indices are swizzled in groups of 32, which ensures point 1 without1552* disturbing point 2.1553*1554* \return an LDS pointer to type {[N x i32], [4 x i8]}1555*/1556static LLVMValueRef1557ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx)1558{1559LLVMBuilderRef builder = ctx->ac.builder;1560LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);15611562/* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */1563unsigned write_stride_2exp = ffs(MAX2(ctx->shader->info.gs.vertices_out, 1)) - 1;1564if (write_stride_2exp) {1565LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), "");1566LLVMValueRef swizzle = LLVMBuildAnd(1567builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), "");1568vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");1569}15701571return ac_build_gep0(&ctx->ac, storage, vertexidx);1572}15731574static LLVMValueRef1575ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread, LLVMValueRef emitidx)1576{1577LLVMBuilderRef builder = ctx->ac.builder;1578LLVMValueRef tmp;15791580tmp = LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false);1581tmp = LLVMBuildMul(builder, tmp, gsthread, "");1582const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");1583return ngg_gs_vertex_ptr(ctx, vertexidx);1584}15851586static LLVMValueRef1587ngg_gs_get_emit_output_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,1588unsigned out_idx)1589{1590LLVMValueRef gep_idx[3] = {1591ctx->ac.i32_0, /* implied C-style array */1592ctx->ac.i32_0, /* first struct entry */1593LLVMConstInt(ctx->ac.i32, out_idx, false),1594};1595return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");1596}15971598static LLVMValueRef1599ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,1600unsigned stream)1601{1602LLVMValueRef gep_idx[3] = {1603ctx->ac.i32_0, /* implied C-style array */1604ctx->ac.i32_1, /* second struct entry */1605LLVMConstInt(ctx->ac.i32, stream, false),1606};1607return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");1608}16091610static struct radv_stream_output *1611radv_get_stream_output_by_loc(struct radv_streamout_info *so, unsigned location)1612{1613for (unsigned i = 0; i < so->num_outputs; ++i) {1614if (so->outputs[i].location == location)1615return &so->outputs[i];1616}16171618return NULL;1619}16201621static void1622build_streamout_vertex(struct radv_shader_context *ctx, LLVMValueRef *so_buffer,1623LLVMValueRef *wg_offset_dw, unsigned stream, LLVMValueRef offset_vtx,1624LLVMValueRef vertexptr)1625{1626struct radv_streamout_info *so = &ctx->args->shader_info->so;1627LLVMBuilderRef builder = ctx->ac.builder;1628LLVMValueRef offset[4] = {0};1629LLVMValueRef tmp;16301631for (unsigned buffer = 0; buffer < 4; ++buffer) {1632if (!wg_offset_dw[buffer])1633continue;16341635tmp = LLVMBuildMul(builder, offset_vtx, LLVMConstInt(ctx->ac.i32, so->strides[buffer], false),1636"");1637tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");1638offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");1639}16401641if (ctx->stage == MESA_SHADER_GEOMETRY) {1642struct radv_shader_output_values outputs[AC_LLVM_MAX_OUTPUTS];1643unsigned noutput = 0;1644unsigned out_idx = 0;16451646for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {1647unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];1648uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];16491650if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)1651continue;16521653outputs[noutput].slot_name = i;1654outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;1655outputs[noutput].usage_mask = output_usage_mask;16561657int length = util_last_bit(output_usage_mask);16581659for (unsigned j = 0; j < length; j++, out_idx++) {1660if (!(output_usage_mask & (1 << j)))1661continue;16621663tmp = ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, out_idx, false));1664outputs[noutput].values[j] = LLVMBuildLoad(builder, tmp, "");1665}16661667for (unsigned j = length; j < 4; j++)1668outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);16691670noutput++;1671}16721673for (unsigned i = 0; i < noutput; i++) {1674struct radv_stream_output *output =1675radv_get_stream_output_by_loc(so, outputs[i].slot_name);16761677if (!output || output->stream != stream)1678continue;16791680struct radv_shader_output_values out = {0};16811682for (unsigned j = 0; j < 4; j++) {1683out.values[j] = outputs[i].values[j];1684}16851686radv_emit_stream_output(ctx, so_buffer, offset, output, &out);1687}1688} else {1689for (unsigned i = 0; i < so->num_outputs; ++i) {1690struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i];16911692if (stream != output->stream)1693continue;16941695struct radv_shader_output_values out = {0};16961697for (unsigned comp = 0; comp < 4; comp++) {1698if (!(output->component_mask & (1 << comp)))1699continue;17001701tmp =1702ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));1703out.values[comp] = LLVMBuildLoad(builder, tmp, "");1704}17051706radv_emit_stream_output(ctx, so_buffer, offset, output, &out);1707}1708}1709}17101711struct ngg_streamout {1712LLVMValueRef num_vertices;17131714/* per-thread data */1715LLVMValueRef prim_enable[4]; /* i1 per stream */1716LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */17171718/* Output */1719LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */1720};17211722/**1723* Build streamout logic.1724*1725* Implies a barrier.1726*1727* Writes number of emitted primitives to gs_ngg_scratch[4:7].1728*1729* Clobbers gs_ngg_scratch[8:].1730*/1731static void1732build_streamout(struct radv_shader_context *ctx, struct ngg_streamout *nggso)1733{1734struct radv_streamout_info *so = &ctx->args->shader_info->so;1735LLVMBuilderRef builder = ctx->ac.builder;1736LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);1737LLVMValueRef tid = get_thread_id_in_tg(ctx);1738LLVMValueRef cond, tmp, tmp2;1739LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);1740LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);1741LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);1742LLVMValueRef so_buffer[4] = {0};1743unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0);1744LLVMValueRef prim_stride_dw[4] = {0};1745LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);1746int stream_for_buffer[4] = {-1, -1, -1, -1};1747unsigned bufmask_for_stream[4] = {0};1748bool isgs = ctx->stage == MESA_SHADER_GEOMETRY;1749unsigned scratch_emit_base = isgs ? 4 : 0;1750LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;1751unsigned scratch_offset_base = isgs ? 8 : 4;1752LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;17531754ac_llvm_add_target_dep_function_attr(ctx->main_function, "amdgpu-gds-size", 256);17551756/* Determine the mapping of streamout buffers to vertex streams. */1757for (unsigned i = 0; i < so->num_outputs; ++i) {1758unsigned buf = so->outputs[i].buffer;1759unsigned stream = so->outputs[i].stream;1760assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);1761stream_for_buffer[buf] = stream;1762bufmask_for_stream[stream] |= 1 << buf;1763}17641765for (unsigned buffer = 0; buffer < 4; ++buffer) {1766if (stream_for_buffer[buffer] == -1)1767continue;17681769assert(so->strides[buffer]);17701771LLVMValueRef stride_for_buffer = LLVMConstInt(ctx->ac.i32, so->strides[buffer], false);1772prim_stride_dw[buffer] = LLVMBuildMul(builder, stride_for_buffer, nggso->num_vertices, "");1773prim_stride_dw_vgpr =1774ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],1775LLVMConstInt(ctx->ac.i32, buffer, false));17761777LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, buffer, false);1778so_buffer[buffer] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);1779}17801781cond = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");1782ac_build_ifcc(&ctx->ac, cond, 5200);1783{1784LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);1785LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");17861787/* Advance the streamout offsets in GDS. */1788LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");1789LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");17901791cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");1792ac_build_ifcc(&ctx->ac, cond, 5210);1793{1794/* Fetch the number of generated primitives and store1795* it in GDS for later use.1796*/1797if (isgs) {1798tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);1799tmp = LLVMBuildLoad(builder, tmp, "");1800} else {1801tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0);1802}1803LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);18041805unsigned swizzle[4];1806int unused_stream = -1;1807for (unsigned stream = 0; stream < 4; ++stream) {1808if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) {1809unused_stream = stream;1810break;1811}1812}1813for (unsigned buffer = 0; buffer < 4; ++buffer) {1814if (stream_for_buffer[buffer] >= 0) {1815swizzle[buffer] = stream_for_buffer[buffer];1816} else {1817assert(unused_stream >= 0);1818swizzle[buffer] = unused_stream;1819}1820}18211822tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);1823tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");18241825LLVMValueRef args[] = {1826LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),1827tmp,1828ctx->ac.i32_0, // ordering1829ctx->ac.i32_0, // scope1830ctx->ac.i1false, // isVolatile1831LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index1832ctx->ac.i1true, // wave release1833ctx->ac.i1true, // wave done1834};18351836tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args,1837ARRAY_SIZE(args), 0);18381839/* Keep offsets in a VGPR for quick retrieval via readlane by1840* the first wave for bounds checking, and also store in LDS1841* for retrieval by all waves later. */1842LLVMBuildStore(builder, tmp, offsets_vgpr);18431844tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, "");1845tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);1846LLVMBuildStore(builder, tmp, tmp2);1847}1848ac_build_endif(&ctx->ac, 5210);18491850/* Determine the max emit per buffer. This is done via the SALU, in part1851* because LLVM can't generate divide-by-multiply if we try to do this1852* via VALU with one lane per buffer.1853*/1854LLVMValueRef max_emit[4] = {0};1855for (unsigned buffer = 0; buffer < 4; ++buffer) {1856if (stream_for_buffer[buffer] == -1)1857continue;18581859/* Compute the streamout buffer size in DWORD. */1860LLVMValueRef bufsize_dw = LLVMBuildLShr(1861builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, "");18621863/* Load the streamout buffer offset from GDS. */1864tmp = LLVMBuildLoad(builder, offsets_vgpr, "");1865LLVMValueRef offset_dw =1866ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false));18671868/* Compute the remaining size to emit. */1869LLVMValueRef remaining_dw = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");1870tmp = LLVMBuildUDiv(builder, remaining_dw, prim_stride_dw[buffer], "");18711872cond = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");1873max_emit[buffer] = LLVMBuildSelect(builder, cond, ctx->ac.i32_0, tmp, "");1874}18751876/* Determine the number of emitted primitives per stream and fixup the1877* GDS counter if necessary.1878*1879* This is complicated by the fact that a single stream can emit to1880* multiple buffers (but luckily not vice versa).1881*/1882LLVMValueRef emit_vgpr = ctx->ac.i32_0;18831884for (unsigned stream = 0; stream < 4; ++stream) {1885if (!ctx->args->shader_info->gs.num_stream_output_components[stream])1886continue;18871888/* Load the number of generated primitives from GDS and1889* determine that number for the given stream.1890*/1891tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");1892LLVMValueRef generated =1893ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false));18941895/* Compute the number of emitted primitives. */1896LLVMValueRef emit = generated;1897for (unsigned buffer = 0; buffer < 4; ++buffer) {1898if (stream_for_buffer[buffer] == stream)1899emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);1900}19011902/* Store the number of emitted primitives for that1903* stream.1904*/1905emit_vgpr =1906ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false));19071908/* Fixup the offset using a plain GDS atomic if we overflowed. */1909cond = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");1910ac_build_ifcc(&ctx->ac, cond, 5221); /* scalar branch */1911tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),1912ac_get_thread_id(&ctx->ac), "");1913tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");1914ac_build_ifcc(&ctx->ac, tmp, 5222);1915{1916tmp = LLVMBuildSub(builder, generated, emit, "");1917tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");1918tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");1919LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,1920LLVMAtomicOrderingMonotonic, false);1921}1922ac_build_endif(&ctx->ac, 5222);1923ac_build_endif(&ctx->ac, 5221);1924}19251926/* Store the number of emitted primitives to LDS for later use. */1927cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");1928ac_build_ifcc(&ctx->ac, cond, 5225);1929{1930tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, "");1931tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);1932LLVMBuildStore(builder, emit_vgpr, tmp);1933}1934ac_build_endif(&ctx->ac, 5225);1935}1936ac_build_endif(&ctx->ac, 5200);19371938/* Determine the workgroup-relative per-thread / primitive offset into1939* the streamout buffers */1940struct ac_wg_scan primemit_scan[4] = {0};19411942if (isgs) {1943for (unsigned stream = 0; stream < 4; ++stream) {1944if (!ctx->args->shader_info->gs.num_stream_output_components[stream])1945continue;19461947primemit_scan[stream].enable_exclusive = true;1948primemit_scan[stream].op = nir_op_iadd;1949primemit_scan[stream].src = nggso->prim_enable[stream];1950primemit_scan[stream].scratch = ac_build_gep0(1951&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));1952primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);1953primemit_scan[stream].numwaves = get_tgsize(ctx);1954primemit_scan[stream].maxwaves = 8;1955ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);1956}1957}19581959ac_build_s_barrier(&ctx->ac);19601961/* Fetch the per-buffer offsets and per-stream emit counts in all waves. */1962LLVMValueRef wgoffset_dw[4] = {0};19631964{1965LLVMValueRef scratch_vgpr;19661967tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));1968scratch_vgpr = LLVMBuildLoad(builder, tmp, "");19691970for (unsigned buffer = 0; buffer < 4; ++buffer) {1971if (stream_for_buffer[buffer] >= 0) {1972wgoffset_dw[buffer] =1973ac_build_readlane(&ctx->ac, scratch_vgpr,1974LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));1975}1976}19771978for (unsigned stream = 0; stream < 4; ++stream) {1979if (ctx->args->shader_info->gs.num_stream_output_components[stream]) {1980nggso->emit[stream] =1981ac_build_readlane(&ctx->ac, scratch_vgpr,1982LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));1983}1984}1985}19861987/* Write out primitive data */1988for (unsigned stream = 0; stream < 4; ++stream) {1989if (!ctx->args->shader_info->gs.num_stream_output_components[stream])1990continue;19911992if (isgs) {1993ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);1994} else {1995primemit_scan[stream].result_exclusive = tid;1996}19971998cond = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive,1999nggso->emit[stream], "");2000cond = LLVMBuildAnd(builder, cond, nggso->prim_enable[stream], "");2001ac_build_ifcc(&ctx->ac, cond, 5240);2002{2003LLVMValueRef offset_vtx =2004LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, "");20052006for (unsigned i = 0; i < max_num_vertices; ++i) {2007cond = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false),2008nggso->num_vertices, "");2009ac_build_ifcc(&ctx->ac, cond, 5241);2010build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx,2011nggso->vertices[i]);2012ac_build_endif(&ctx->ac, 5241);2013offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");2014}2015}2016ac_build_endif(&ctx->ac, 5240);2017}2018}20192020static unsigned2021ngg_nogs_vertex_size(struct radv_shader_context *ctx)2022{2023unsigned lds_vertex_size = 0;20242025if (ctx->args->shader_info->so.num_outputs)2026lds_vertex_size = 4 * ctx->args->shader_info->so.num_outputs + 1;20272028return lds_vertex_size;2029}20302031/**2032* Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage2033* for the vertex outputs.2034*/2035static LLVMValueRef2036ngg_nogs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vtxid)2037{2038/* The extra dword is used to avoid LDS bank conflicts. */2039unsigned vertex_size = ngg_nogs_vertex_size(ctx);2040LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);2041LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);2042LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");2043return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");2044}20452046static void2047handle_ngg_outputs_post_1(struct radv_shader_context *ctx)2048{2049struct radv_streamout_info *so = &ctx->args->shader_info->so;2050LLVMBuilderRef builder = ctx->ac.builder;2051LLVMValueRef vertex_ptr = NULL;2052LLVMValueRef tmp, tmp2;20532054assert((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&2055!ctx->args->is_gs_copy_shader);20562057if (!ctx->args->shader_info->so.num_outputs)2058return;20592060vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));20612062for (unsigned i = 0; i < so->num_outputs; ++i) {2063struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i];20642065unsigned loc = output->location;20662067for (unsigned comp = 0; comp < 4; comp++) {2068if (!(output->component_mask & (1 << comp)))2069continue;20702071tmp = ac_build_gep0(&ctx->ac, vertex_ptr, LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));2072tmp2 = LLVMBuildLoad(builder, ctx->abi.outputs[4 * loc + comp], "");2073tmp2 = ac_to_integer(&ctx->ac, tmp2);2074LLVMBuildStore(builder, tmp2, tmp);2075}2076}2077}20782079static void2080handle_ngg_outputs_post_2(struct radv_shader_context *ctx)2081{2082LLVMBuilderRef builder = ctx->ac.builder;2083LLVMValueRef tmp;20842085assert((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&2086!ctx->args->is_gs_copy_shader);20872088LLVMValueRef prims_in_wave =2089ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8);2090LLVMValueRef vtx_in_wave =2091ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 0, 8);2092LLVMValueRef is_gs_thread =2093LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), prims_in_wave, "");2094LLVMValueRef is_es_thread =2095LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), vtx_in_wave, "");2096LLVMValueRef vtxindex[] = {2097ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 0, 16),2098ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 16, 16),2099ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[2]), 0, 16),2100};21012102/* Determine the number of vertices per primitive. */2103unsigned num_vertices;2104LLVMValueRef num_vertices_val;21052106if (ctx->stage == MESA_SHADER_VERTEX) {2107LLVMValueRef outprim_val =2108LLVMConstInt(ctx->ac.i32, ctx->args->options->key.vs.outprim, false);2109num_vertices_val = LLVMBuildAdd(builder, outprim_val, ctx->ac.i32_1, "");2110num_vertices = 3; /* TODO: optimize for points & lines */2111} else {2112assert(ctx->stage == MESA_SHADER_TESS_EVAL);21132114if (ctx->shader->info.tess.point_mode)2115num_vertices = 1;2116else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)2117num_vertices = 2;2118else2119num_vertices = 3;21202121num_vertices_val = LLVMConstInt(ctx->ac.i32, num_vertices, false);2122}21232124/* Streamout */2125if (ctx->args->shader_info->so.num_outputs) {2126struct ngg_streamout nggso = {0};21272128nggso.num_vertices = num_vertices_val;2129nggso.prim_enable[0] = is_gs_thread;21302131for (unsigned i = 0; i < num_vertices; ++i)2132nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);21332134build_streamout(ctx, &nggso);2135}21362137/* Copy Primitive IDs from GS threads to the LDS address corresponding2138* to the ES thread of the provoking vertex.2139*/2140if (ctx->stage == MESA_SHADER_VERTEX && ctx->args->options->key.vs_common_out.export_prim_id) {2141if (ctx->args->shader_info->so.num_outputs)2142ac_build_s_barrier(&ctx->ac);21432144ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);21452146LLVMValueRef provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, 0, false);21472148/* For provoking vertex last mode, use num_vtx_in_prim - 1. */2149if (ctx->args->options->key.vs.provoking_vtx_last)2150provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, ctx->args->options->key.vs.outprim, false);21512152/* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */2153LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);2154LLVMValueRef provoking_vtx_index =2155LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");21562157LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_prim_id),2158ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index));2159ac_build_endif(&ctx->ac, 5400);2160}21612162/* TODO: primitive culling */21632164ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx),2165ngg_get_prim_cnt(ctx));21662167/* TODO: streamout queries */2168/* Export primitive data to the index buffer.2169*2170* For the first version, we will always build up all three indices2171* independent of the primitive type. The additional garbage data2172* shouldn't hurt.2173*2174* TODO: culling depends on the primitive type, so can have some2175* interaction here.2176*/2177ac_build_ifcc(&ctx->ac, is_gs_thread, 6001);2178{2179struct ac_ngg_prim prim = {0};21802181if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) {2182prim.passthrough = ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]);2183} else {2184prim.num_vertices = num_vertices;2185prim.isnull = ctx->ac.i1false;2186memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);21872188for (unsigned i = 0; i < num_vertices; ++i) {2189tmp = LLVMBuildLShr(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_invocation_id),2190LLVMConstInt(ctx->ac.i32, 8 + i, false), "");2191prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");2192}2193}21942195ac_build_export_prim(&ctx->ac, &prim);2196}2197ac_build_endif(&ctx->ac, 6001);21982199/* Export per-vertex data (positions and parameters). */2200ac_build_ifcc(&ctx->ac, is_es_thread, 6002);2201{2202struct radv_vs_output_info *outinfo = ctx->stage == MESA_SHADER_TESS_EVAL2203? &ctx->args->shader_info->tes.outinfo2204: &ctx->args->shader_info->vs.outinfo;22052206/* Exporting the primitive ID is handled below. */2207/* TODO: use the new VS export path */2208handle_vs_outputs_post(ctx, false, ctx->args->options->key.vs_common_out.export_clip_dists,2209outinfo);22102211if (ctx->args->options->key.vs_common_out.export_prim_id) {2212unsigned param_count = outinfo->param_exports;2213LLVMValueRef values[4];22142215if (ctx->stage == MESA_SHADER_VERTEX) {2216/* Wait for GS stores to finish. */2217ac_build_s_barrier(&ctx->ac);22182219tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, get_thread_id_in_tg(ctx));2220values[0] = LLVMBuildLoad(builder, tmp, "");2221} else {2222assert(ctx->stage == MESA_SHADER_TESS_EVAL);2223values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id);2224}22252226values[0] = ac_to_float(&ctx->ac, values[0]);2227for (unsigned j = 1; j < 4; j++)2228values[j] = ctx->ac.f32_0;22292230radv_export_param(ctx, param_count, values, 0x1);22312232outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count++;2233outinfo->param_exports = param_count;2234}2235}2236ac_build_endif(&ctx->ac, 6002);2237}22382239static void2240gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx)2241{2242/* Zero out the part of LDS scratch that is used to accumulate the2243* per-stream generated primitive count.2244*/2245LLVMBuilderRef builder = ctx->ac.builder;2246LLVMValueRef scratchptr = ctx->gs_ngg_scratch;2247LLVMValueRef tid = get_thread_id_in_tg(ctx);2248LLVMBasicBlockRef merge_block;2249LLVMValueRef cond;22502251LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->ac.builder));2252LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");2253merge_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");22542255cond = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");2256LLVMBuildCondBr(ctx->ac.builder, cond, then_block, merge_block);2257LLVMPositionBuilderAtEnd(ctx->ac.builder, then_block);22582259LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);2260LLVMBuildStore(builder, ctx->ac.i32_0, ptr);22612262LLVMBuildBr(ctx->ac.builder, merge_block);2263LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);22642265ac_build_s_barrier(&ctx->ac);2266}22672268static void2269gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx)2270{2271LLVMBuilderRef builder = ctx->ac.builder;2272LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);2273LLVMValueRef tmp;22742275/* Zero out remaining (non-emitted) primitive flags.2276*2277* Note: Alternatively, we could pass the relevant gs_next_vertex to2278* the emit threads via LDS. This is likely worse in the expected2279* typical case where each GS thread emits the full set of2280* vertices.2281*/2282for (unsigned stream = 0; stream < 4; ++stream) {2283unsigned num_components;22842285num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];2286if (!num_components)2287continue;22882289const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);22902291ac_build_bgnloop(&ctx->ac, 5100);22922293const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");2294tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,2295LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");2296ac_build_ifcc(&ctx->ac, tmp, 5101);2297ac_build_break(&ctx->ac);2298ac_build_endif(&ctx->ac, 5101);22992300tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");2301LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);23022303tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);2304LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));23052306ac_build_endloop(&ctx->ac, 5100);2307}23082309/* Accumulate generated primitives counts across the entire threadgroup. */2310for (unsigned stream = 0; stream < 4; ++stream) {2311unsigned num_components;23122313num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];2314if (!num_components)2315continue;23162317LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");2318numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);23192320tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");2321ac_build_ifcc(&ctx->ac, tmp, 5105);2322{2323LLVMBuildAtomicRMW(2324builder, LLVMAtomicRMWBinOpAdd,2325ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)),2326numprims, LLVMAtomicOrderingMonotonic, false);2327}2328ac_build_endif(&ctx->ac, 5105);2329}2330}23312332static void2333gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)2334{2335const unsigned verts_per_prim =2336si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive);2337LLVMBuilderRef builder = ctx->ac.builder;2338LLVMValueRef tmp, tmp2;23392340ac_build_s_barrier(&ctx->ac);23412342const LLVMValueRef tid = get_thread_id_in_tg(ctx);2343LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);23442345/* Streamout */2346if (ctx->args->shader_info->so.num_outputs) {2347struct ngg_streamout nggso = {0};23482349nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);23502351LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);2352for (unsigned stream = 0; stream < 4; ++stream) {2353if (!ctx->args->shader_info->gs.num_stream_output_components[stream])2354continue;23552356tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");2357tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");2358tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");2359nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");2360}23612362for (unsigned i = 0; i < verts_per_prim; ++i) {2363tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false),2364"");2365tmp = ngg_gs_vertex_ptr(ctx, tmp);2366nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);2367}23682369build_streamout(ctx, &nggso);2370}23712372/* Write shader query data. */2373tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state);2374tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");2375ac_build_ifcc(&ctx->ac, tmp, 5109);2376tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");2377ac_build_ifcc(&ctx->ac, tmp, 5110);2378{2379tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");23802381ac_llvm_add_target_dep_function_attr(ctx->main_function, "amdgpu-gds-size", 256);23822383LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);2384LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");23852386const char *sync_scope = "workgroup-one-as";23872388/* Use a plain GDS atomic to accumulate the number of generated2389* primitives.2390*/2391ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase, tmp, sync_scope);2392}2393ac_build_endif(&ctx->ac, 5110);2394ac_build_endif(&ctx->ac, 5109);23952396/* TODO: culling */23972398/* Determine vertex liveness. */2399LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");24002401tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");2402ac_build_ifcc(&ctx->ac, tmp, 5120);2403{2404for (unsigned i = 0; i < verts_per_prim; ++i) {2405const LLVMValueRef primidx =2406LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), "");24072408if (i > 0) {2409tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");2410ac_build_ifcc(&ctx->ac, tmp, 5121 + i);2411}24122413/* Load primitive liveness */2414tmp = ngg_gs_vertex_ptr(ctx, primidx);2415tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");2416const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");24172418tmp = LLVMBuildLoad(builder, vertliveptr, "");2419tmp = LLVMBuildOr(builder, tmp, primlive, ""), LLVMBuildStore(builder, tmp, vertliveptr);24202421if (i > 0)2422ac_build_endif(&ctx->ac, 5121 + i);2423}2424}2425ac_build_endif(&ctx->ac, 5120);24262427/* Inclusive scan addition across the current wave. */2428LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");2429struct ac_wg_scan vertlive_scan = {0};2430vertlive_scan.op = nir_op_iadd;2431vertlive_scan.enable_reduce = true;2432vertlive_scan.enable_exclusive = true;2433vertlive_scan.src = vertlive;2434vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);2435vertlive_scan.waveidx = get_wave_id_in_tg(ctx);2436vertlive_scan.numwaves = get_tgsize(ctx);2437vertlive_scan.maxwaves = 8;24382439ac_build_wg_scan(&ctx->ac, &vertlive_scan);24402441/* Skip all exports (including index exports) when possible. At least on2442* early gfx10 revisions this is also to avoid hangs.2443*/2444LLVMValueRef have_exports =2445LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");2446num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");24472448/* Allocate export space. Send this message as early as possible, to2449* hide the latency of the SQ <-> SPI roundtrip.2450*2451* Note: We could consider compacting primitives for export as well.2452* PA processes 1 non-null prim / clock, but it fetches 4 DW of2453* prim data per clock and skips null primitives at no additional2454* cost. So compacting primitives can only be beneficial when2455* there are 4 or more contiguous null primitives in the export2456* (in the common case of single-dword prim exports).2457*/2458ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce,2459num_emit_threads);24602461/* Setup the reverse vertex compaction permutation. We re-use stream 12462* of the primitive liveness flags, relying on the fact that each2463* threadgroup can have at most 256 threads. */2464ac_build_ifcc(&ctx->ac, vertlive, 5130);2465{2466tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);2467tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");2468LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));2469}2470ac_build_endif(&ctx->ac, 5130);24712472ac_build_s_barrier(&ctx->ac);24732474/* Export primitive data */2475tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");2476ac_build_ifcc(&ctx->ac, tmp, 5140);2477{2478LLVMValueRef flags;2479struct ac_ngg_prim prim = {0};2480prim.num_vertices = verts_per_prim;24812482tmp = ngg_gs_vertex_ptr(ctx, tid);2483flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");2484prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");24852486for (unsigned i = 0; i < verts_per_prim; ++i) {2487prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,2488LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");2489prim.edgeflag[i] = ctx->ac.i1false;2490}24912492/* Geometry shaders output triangle strips, but NGG expects triangles. */2493if (verts_per_prim == 3) {2494LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");2495is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");24962497LLVMValueRef flatshade_first =2498LLVMConstInt(ctx->ac.i32, !ctx->args->options->key.vs.provoking_vtx_last, false);24992500ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index);2501}25022503ac_build_export_prim(&ctx->ac, &prim);2504}2505ac_build_endif(&ctx->ac, 5140);25062507/* Export position and parameter data */2508tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");2509ac_build_ifcc(&ctx->ac, tmp, 5145);2510{2511struct radv_vs_output_info *outinfo = &ctx->args->shader_info->vs.outinfo;2512bool export_view_index = ctx->args->options->key.has_multiview_view_index;2513struct radv_shader_output_values *outputs;2514unsigned noutput = 0;25152516/* Allocate a temporary array for the output values. */2517unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_view_index;2518outputs = calloc(num_outputs, sizeof(outputs[0]));25192520memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,2521sizeof(outinfo->vs_output_param_offset));2522outinfo->pos_exports = 0;25232524tmp = ngg_gs_vertex_ptr(ctx, tid);2525tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");2526tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");2527const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);25282529unsigned out_idx = 0;2530for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {2531unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];2532int length = util_last_bit(output_usage_mask);25332534if (!(ctx->output_mask & (1ull << i)))2535continue;25362537outputs[noutput].slot_name = i;2538outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;2539outputs[noutput].usage_mask = output_usage_mask;25402541for (unsigned j = 0; j < length; j++, out_idx++) {2542if (!(output_usage_mask & (1 << j)))2543continue;25442545tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);2546tmp = LLVMBuildLoad(builder, tmp, "");25472548LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);2549if (ac_get_type_size(type) == 2) {2550tmp = ac_to_integer(&ctx->ac, tmp);2551tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i16, "");2552}25532554outputs[noutput].values[j] = ac_to_float(&ctx->ac, tmp);2555}25562557for (unsigned j = length; j < 4; j++)2558outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);25592560noutput++;2561}25622563/* Export ViewIndex. */2564if (export_view_index) {2565outputs[noutput].slot_name = VARYING_SLOT_LAYER;2566outputs[noutput].slot_index = 0;2567outputs[noutput].usage_mask = 0x1;2568outputs[noutput].values[0] =2569ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.view_index));2570for (unsigned j = 1; j < 4; j++)2571outputs[noutput].values[j] = ctx->ac.f32_0;2572noutput++;2573}25742575radv_llvm_export_vs(ctx, outputs, noutput, outinfo,2576ctx->args->options->key.vs_common_out.export_clip_dists);2577FREE(outputs);2578}2579ac_build_endif(&ctx->ac, 5145);2580}25812582static void2583gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, LLVMValueRef vertexidx,2584LLVMValueRef *addrs)2585{2586LLVMBuilderRef builder = ctx->ac.builder;2587LLVMValueRef tmp;25882589const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);2590unsigned out_idx = 0;2591for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {2592unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];2593uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];2594LLVMValueRef *out_ptr = &addrs[i * 4];2595int length = util_last_bit(output_usage_mask);25962597if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)2598continue;25992600for (unsigned j = 0; j < length; j++, out_idx++) {2601if (!(output_usage_mask & (1 << j)))2602continue;26032604LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");2605out_val = ac_to_integer(&ctx->ac, out_val);2606out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");26072608LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));2609}2610}2611assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size);26122613/* Store the current number of emitted vertices to zero out remaining2614* primitive flags in case the geometry shader doesn't emit the maximum2615* number of vertices.2616*/2617tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");2618LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);26192620/* Determine and store whether this vertex completed a primitive. */2621const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");26222623tmp = LLVMConstInt(2624ctx->ac.i32, si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) - 1, false);2625const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");26262627/* Since the geometry shader emits triangle strips, we need to2628* track which primitive is odd and swap vertex indices to get2629* the correct vertex order.2630*/2631LLVMValueRef is_odd = ctx->ac.i1false;2632if (stream == 0 && si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) == 3) {2633tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");2634is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");2635}26362637tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");2638LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);26392640/* The per-vertex primitive flag encoding:2641* bit 0: whether this vertex finishes a primitive2642* bit 1: whether the primitive is odd (if we are emitting triangle strips)2643*/2644tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");2645tmp = LLVMBuildOr(2646builder, tmp,2647LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), "");2648LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));26492650tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");2651tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");2652LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);2653}26542655static bool2656si_export_mrt_color(struct radv_shader_context *ctx, LLVMValueRef *color, unsigned index,2657struct ac_export_args *args)2658{2659/* Export */2660si_llvm_init_export_args(ctx, color, 0xf, V_008DFC_SQ_EXP_MRT + index, args);2661if (!args->enabled_channels)2662return false; /* unnecessary NULL export */26632664return true;2665}26662667static void2668radv_export_mrt_z(struct radv_shader_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,2669LLVMValueRef samplemask)2670{2671struct ac_export_args args;26722673ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);26742675ac_build_export(&ctx->ac, &args);2676}26772678static void2679handle_fs_outputs_post(struct radv_shader_context *ctx)2680{2681unsigned index = 0;2682LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;2683struct ac_export_args color_args[8];26842685for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {2686LLVMValueRef values[4];26872688if (!(ctx->output_mask & (1ull << i)))2689continue;26902691if (i < FRAG_RESULT_DATA0)2692continue;26932694for (unsigned j = 0; j < 4; j++)2695values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));26962697bool ret = si_export_mrt_color(ctx, values, i - FRAG_RESULT_DATA0, &color_args[index]);2698if (ret)2699index++;2700}27012702/* Process depth, stencil, samplemask. */2703if (ctx->args->shader_info->ps.writes_z) {2704depth = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_DEPTH, 0));2705}2706if (ctx->args->shader_info->ps.writes_stencil) {2707stencil = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_STENCIL, 0));2708}2709if (ctx->args->shader_info->ps.writes_sample_mask) {2710samplemask = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0));2711}27122713/* Set the DONE bit on last non-null color export only if Z isn't2714* exported.2715*/2716if (index > 0 && !ctx->args->shader_info->ps.writes_z &&2717!ctx->args->shader_info->ps.writes_stencil &&2718!ctx->args->shader_info->ps.writes_sample_mask) {2719unsigned last = index - 1;27202721color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */2722color_args[last].done = 1; /* DONE bit */2723}27242725/* Export PS outputs. */2726for (unsigned i = 0; i < index; i++)2727ac_build_export(&ctx->ac, &color_args[i]);27282729if (depth || stencil || samplemask)2730radv_export_mrt_z(ctx, depth, stencil, samplemask);2731else if (!index)2732ac_build_export_null(&ctx->ac);2733}27342735static void2736emit_gs_epilogue(struct radv_shader_context *ctx)2737{2738if (ctx->args->options->key.vs_common_out.as_ngg) {2739gfx10_ngg_gs_emit_epilogue_1(ctx);2740return;2741}27422743if (ctx->ac.chip_class >= GFX10)2744LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");27452746ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);2747}27482749static void2750handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)2751{2752struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);27532754switch (ctx->stage) {2755case MESA_SHADER_VERTEX:2756if (ctx->args->options->key.vs_common_out.as_ls)2757break; /* Lowered in NIR */2758else if (ctx->args->options->key.vs_common_out.as_es)2759break; /* Lowered in NIR */2760else if (ctx->args->options->key.vs_common_out.as_ngg)2761handle_ngg_outputs_post_1(ctx);2762else2763handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id,2764ctx->args->options->key.vs_common_out.export_clip_dists,2765&ctx->args->shader_info->vs.outinfo);2766break;2767case MESA_SHADER_FRAGMENT:2768handle_fs_outputs_post(ctx);2769break;2770case MESA_SHADER_GEOMETRY:2771emit_gs_epilogue(ctx);2772break;2773case MESA_SHADER_TESS_CTRL:2774break; /* Lowered in NIR */2775case MESA_SHADER_TESS_EVAL:2776if (ctx->args->options->key.vs_common_out.as_es)2777break; /* Lowered in NIR */2778else if (ctx->args->options->key.vs_common_out.as_ngg)2779handle_ngg_outputs_post_1(ctx);2780else2781handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id,2782ctx->args->options->key.vs_common_out.export_clip_dists,2783&ctx->args->shader_info->tes.outinfo);2784break;2785default:2786break;2787}2788}27892790static void2791ac_llvm_finalize_module(struct radv_shader_context *ctx, LLVMPassManagerRef passmgr,2792const struct radv_nir_compiler_options *options)2793{2794LLVMRunPassManager(passmgr, ctx->ac.module);2795LLVMDisposeBuilder(ctx->ac.builder);27962797ac_llvm_context_dispose(&ctx->ac);2798}27992800static void2801ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx)2802{2803struct radv_vs_output_info *outinfo;28042805switch (ctx->stage) {2806case MESA_SHADER_FRAGMENT:2807case MESA_SHADER_COMPUTE:2808case MESA_SHADER_TESS_CTRL:2809case MESA_SHADER_GEOMETRY:2810return;2811case MESA_SHADER_VERTEX:2812if (ctx->args->options->key.vs_common_out.as_ls ||2813ctx->args->options->key.vs_common_out.as_es)2814return;2815outinfo = &ctx->args->shader_info->vs.outinfo;2816break;2817case MESA_SHADER_TESS_EVAL:2818if (ctx->args->options->key.vs_common_out.as_es)2819return;2820outinfo = &ctx->args->shader_info->tes.outinfo;2821break;2822default:2823unreachable("Unhandled shader type");2824}28252826ac_optimize_vs_outputs(&ctx->ac, ctx->main_function, outinfo->vs_output_param_offset,2827VARYING_SLOT_MAX, 0, &outinfo->param_exports);2828}28292830static void2831ac_setup_rings(struct radv_shader_context *ctx)2832{2833if (ctx->args->options->chip_class <= GFX8 &&2834(ctx->stage == MESA_SHADER_GEOMETRY || ctx->args->options->key.vs_common_out.as_es)) {2835unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS;2836LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);28372838ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, offset);2839}28402841if (ctx->args->is_gs_copy_shader) {2842ctx->gsvs_ring[0] = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,2843LLVMConstInt(ctx->ac.i32, RING_GSVS_VS, false));2844}28452846if (ctx->stage == MESA_SHADER_GEOMETRY) {2847/* The conceptual layout of the GSVS ring is2848* v0c0 .. vLv0 v0c1 .. vLc1 ..2849* but the real memory layout is swizzled across2850* threads:2851* t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL2852* t16v0c0 ..2853* Override the buffer descriptor accordingly.2854*/2855LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);2856uint64_t stream_offset = 0;2857unsigned num_records = ctx->ac.wave_size;2858LLVMValueRef base_ring;28592860base_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,2861LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false));28622863for (unsigned stream = 0; stream < 4; stream++) {2864unsigned num_components, stride;2865LLVMValueRef ring, tmp;28662867num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];28682869if (!num_components)2870continue;28712872stride = 4 * num_components * ctx->shader->info.gs.vertices_out;28732874/* Limit on the stride field for <= GFX7. */2875assert(stride < (1 << 14));28762877ring = LLVMBuildBitCast(ctx->ac.builder, base_ring, v2i64, "");2878tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_0, "");2879tmp = LLVMBuildAdd(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");2880ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_0, "");28812882stream_offset += stride * ctx->ac.wave_size;28832884ring = LLVMBuildBitCast(ctx->ac.builder, ring, ctx->ac.v4i32, "");28852886tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_1, "");2887tmp = LLVMBuildOr(ctx->ac.builder, tmp,2888LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride), false), "");2889ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_1, "");28902891ring = LLVMBuildInsertElement(ctx->ac.builder, ring,2892LLVMConstInt(ctx->ac.i32, num_records, false),2893LLVMConstInt(ctx->ac.i32, 2, false), "");28942895ctx->gsvs_ring[stream] = ring;2896}2897}28982899if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL) {2900ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr(2901&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_OFFCHIP, false));2902ctx->hs_ring_tess_factor = ac_build_load_to_sgpr(2903&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_FACTOR, false));2904}2905}29062907unsigned2908radv_nir_get_max_workgroup_size(enum chip_class chip_class, gl_shader_stage stage,2909const struct nir_shader *nir)2910{2911const unsigned backup_sizes[] = {chip_class >= GFX9 ? 128 : 64, 1, 1};2912unsigned sizes[3];2913for (unsigned i = 0; i < 3; i++)2914sizes[i] = nir ? nir->info.workgroup_size[i] : backup_sizes[i];2915return radv_get_max_workgroup_size(chip_class, stage, sizes);2916}29172918/* Fixup the HW not emitting the TCS regs if there are no HS threads. */2919static void2920ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)2921{2922LLVMValueRef count =2923ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8);2924LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count, ctx->ac.i32_0, "");2925ctx->abi.instance_id =2926LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),2927ctx->abi.instance_id, "");2928ctx->vs_rel_patch_id =2929LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids),2930ctx->vs_rel_patch_id, "");2931ctx->abi.vertex_id =2932LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_patch_id),2933ctx->abi.vertex_id, "");2934}29352936static void2937prepare_gs_input_vgprs(struct radv_shader_context *ctx, bool merged)2938{2939if (merged) {2940for (int i = 5; i >= 0; --i) {2941ctx->gs_vtx_offset[i] = ac_unpack_param(2942&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i & ~1]), (i & 1) * 16, 16);2943}29442945ctx->gs_wave_id =2946ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 16, 8);2947} else {2948for (int i = 0; i < 6; i++)2949ctx->gs_vtx_offset[i] = ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i]);2950ctx->gs_wave_id = ac_get_arg(&ctx->ac, ctx->args->ac.gs_wave_id);2951}2952}29532954/* Ensure that the esgs ring is declared.2955*2956* We declare it with 64KB alignment as a hint that the2957* pointer value will always be 0.2958*/2959static void2960declare_esgs_ring(struct radv_shader_context *ctx)2961{2962if (ctx->esgs_ring)2963return;29642965assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));29662967ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),2968"esgs_ring", AC_ADDR_SPACE_LDS);2969LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);2970LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);2971}29722973static LLVMModuleRef2974ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *const *shaders,2975int shader_count, const struct radv_shader_args *args)2976{2977struct radv_shader_context ctx = {0};2978ctx.args = args;29792980enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT;29812982if (args->shader_info->float_controls_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) {2983float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO;2984}29852986ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family,2987args->options->info, float_mode, args->shader_info->wave_size,2988args->shader_info->ballot_bit_size);2989ctx.context = ctx.ac.context;29902991ctx.max_workgroup_size = 0;2992for (int i = 0; i < shader_count; ++i) {2993ctx.max_workgroup_size = MAX2(2994ctx.max_workgroup_size, radv_nir_get_max_workgroup_size(2995args->options->chip_class, shaders[i]->info.stage, shaders[i]));2996}29972998if (ctx.ac.chip_class >= GFX10) {2999if (is_pre_gs_stage(shaders[0]->info.stage) && args->options->key.vs_common_out.as_ngg) {3000ctx.max_workgroup_size = 128;3001}3002}30033004create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2);30053006ctx.abi.inputs = &ctx.inputs[0];3007ctx.abi.emit_outputs = handle_shader_outputs_post;3008ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter;3009ctx.abi.load_ubo = radv_load_ubo;3010ctx.abi.load_ssbo = radv_load_ssbo;3011ctx.abi.load_sampler_desc = radv_get_sampler_desc;3012ctx.abi.load_resource = radv_load_resource;3013ctx.abi.load_ring_tess_factors = load_ring_tess_factors;3014ctx.abi.load_ring_tess_offchip = load_ring_tess_offchip;3015ctx.abi.load_ring_esgs = load_ring_esgs;3016ctx.abi.clamp_shadow_reference = false;3017ctx.abi.adjust_frag_coord_z = args->options->adjust_frag_coord_z;3018ctx.abi.robust_buffer_access = args->options->robust_buffer_access;30193020bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && args->options->key.vs_common_out.as_ngg;3021if (shader_count >= 2 || is_ngg)3022ac_init_exec_full_mask(&ctx.ac);30233024if (args->ac.vertex_id.used)3025ctx.abi.vertex_id = ac_get_arg(&ctx.ac, args->ac.vertex_id);3026if (args->ac.vs_rel_patch_id.used)3027ctx.vs_rel_patch_id = ac_get_arg(&ctx.ac, args->ac.vs_rel_patch_id);3028if (args->ac.instance_id.used)3029ctx.abi.instance_id = ac_get_arg(&ctx.ac, args->ac.instance_id);30303031if (args->options->has_ls_vgpr_init_bug &&3032shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)3033ac_nir_fixup_ls_hs_input_vgprs(&ctx);30343035if (is_ngg) {3036/* Declare scratch space base for streamout and vertex3037* compaction. Whether space is actually allocated is3038* determined during linking / PM4 creation.3039*3040* Add an extra dword per vertex to ensure an odd stride, which3041* avoids bank conflicts for SoA accesses.3042*/3043if (!args->options->key.vs_common_out.as_ngg_passthrough)3044declare_esgs_ring(&ctx);30453046/* This is really only needed when streamout and / or vertex3047* compaction is enabled.3048*/3049if (args->shader_info->so.num_outputs) {3050LLVMTypeRef asi32 = LLVMArrayType(ctx.ac.i32, 8);3051ctx.gs_ngg_scratch =3052LLVMAddGlobalInAddressSpace(ctx.ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);3053LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(asi32));3054LLVMSetAlignment(ctx.gs_ngg_scratch, 4);3055}30563057/* GFX10 hang workaround - there needs to be an s_barrier before gs_alloc_req always */3058if (ctx.ac.chip_class == GFX10 && shader_count == 1)3059ac_build_s_barrier(&ctx.ac);3060}30613062for (int shader_idx = 0; shader_idx < shader_count; ++shader_idx) {3063ctx.stage = shaders[shader_idx]->info.stage;3064ctx.shader = shaders[shader_idx];3065ctx.output_mask = 0;30663067if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY) {3068for (int i = 0; i < 4; i++) {3069ctx.gs_next_vertex[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");3070}3071if (args->options->key.vs_common_out.as_ngg) {3072for (unsigned i = 0; i < 4; ++i) {3073ctx.gs_curprim_verts[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");3074ctx.gs_generated_prims[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");3075}30763077unsigned scratch_size = 8;3078if (args->shader_info->so.num_outputs)3079scratch_size = 44;30803081LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, scratch_size);3082ctx.gs_ngg_scratch =3083LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);3084LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32));3085LLVMSetAlignment(ctx.gs_ngg_scratch, 4);30863087ctx.gs_ngg_emit = LLVMAddGlobalInAddressSpace(3088ctx.ac.module, LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);3089LLVMSetLinkage(ctx.gs_ngg_emit, LLVMExternalLinkage);3090LLVMSetAlignment(ctx.gs_ngg_emit, 4);3091}30923093ctx.abi.emit_primitive = visit_end_primitive;3094} else if (shaders[shader_idx]->info.stage == MESA_SHADER_TESS_EVAL) {3095ctx.abi.load_tess_coord = load_tess_coord;3096} else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX) {3097ctx.abi.load_base_vertex = radv_load_base_vertex;3098} else if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT) {3099ctx.abi.load_sample_position = load_sample_position;3100ctx.abi.load_sample_mask_in = load_sample_mask_in;3101}31023103if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX &&3104args->options->key.vs_common_out.as_ngg &&3105args->options->key.vs_common_out.export_prim_id) {3106declare_esgs_ring(&ctx);3107}31083109bool nested_barrier = false;31103111if (shader_idx) {3112if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY &&3113args->options->key.vs_common_out.as_ngg) {3114gfx10_ngg_gs_emit_prologue(&ctx);3115nested_barrier = false;3116} else {3117nested_barrier = true;3118}3119}31203121if (nested_barrier) {3122/* Execute a barrier before the second shader in3123* a merged shader.3124*3125* Execute the barrier inside the conditional block,3126* so that empty waves can jump directly to s_endpgm,3127* which will also signal the barrier.3128*3129* This is possible in gfx9, because an empty wave3130* for the second shader does not participate in3131* the epilogue. With NGG, empty waves may still3132* be required to export data (e.g. GS output vertices),3133* so we cannot let them exit early.3134*3135* If the shader is TCS and the TCS epilog is present3136* and contains a barrier, it will wait there and then3137* reach s_endpgm.3138*/3139ac_emit_barrier(&ctx.ac, ctx.stage);3140}31413142nir_foreach_shader_out_variable(variable, shaders[shader_idx]) scan_shader_output_decl(3143&ctx, variable, shaders[shader_idx], shaders[shader_idx]->info.stage);31443145ac_setup_rings(&ctx);31463147LLVMBasicBlockRef merge_block = NULL;3148if (shader_count >= 2 || is_ngg) {3149LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));3150LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");3151merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");31523153LLVMValueRef count = ac_unpack_param(3154&ctx.ac, ac_get_arg(&ctx.ac, args->ac.merged_wave_info), 8 * shader_idx, 8);3155LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac);3156LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT, thread_id, count, "");3157LLVMBuildCondBr(ctx.ac.builder, cond, then_block, merge_block);31583159LLVMPositionBuilderAtEnd(ctx.ac.builder, then_block);3160}31613162if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT)3163prepare_interp_optimize(&ctx, shaders[shader_idx]);3164else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX)3165handle_vs_inputs(&ctx, shaders[shader_idx]);3166else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY)3167prepare_gs_input_vgprs(&ctx, shader_count >= 2);31683169ac_nir_translate(&ctx.ac, &ctx.abi, &args->ac, shaders[shader_idx]);31703171if (shader_count >= 2 || is_ngg) {3172LLVMBuildBr(ctx.ac.builder, merge_block);3173LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);3174}31753176/* This needs to be outside the if wrapping the shader body, as sometimes3177* the HW generates waves with 0 es/vs threads. */3178if (is_pre_gs_stage(shaders[shader_idx]->info.stage) &&3179args->options->key.vs_common_out.as_ngg && shader_idx == shader_count - 1) {3180handle_ngg_outputs_post_2(&ctx);3181} else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY &&3182args->options->key.vs_common_out.as_ngg) {3183gfx10_ngg_gs_emit_epilogue_2(&ctx);3184}3185}31863187LLVMBuildRetVoid(ctx.ac.builder);31883189if (args->options->dump_preoptir) {3190fprintf(stderr, "%s LLVM IR:\n\n",3191radv_get_shader_name(args->shader_info, shaders[shader_count - 1]->info.stage));3192ac_dump_module(ctx.ac.module);3193fprintf(stderr, "\n");3194}31953196ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options);31973198if (shader_count == 1)3199ac_nir_eliminate_const_vs_outputs(&ctx);32003201if (args->options->dump_shader) {3202args->shader_info->private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_function);3203}32043205return ctx.ac.module;3206}32073208static void3209ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)3210{3211unsigned *retval = (unsigned *)context;3212LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);3213char *description = LLVMGetDiagInfoDescription(di);32143215if (severity == LLVMDSError) {3216*retval = 1;3217fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description);3218}32193220LLVMDisposeMessage(description);3221}32223223static unsigned3224radv_llvm_compile(LLVMModuleRef M, char **pelf_buffer, size_t *pelf_size,3225struct ac_llvm_compiler *ac_llvm)3226{3227unsigned retval = 0;3228LLVMContextRef llvm_ctx;32293230/* Setup Diagnostic Handler*/3231llvm_ctx = LLVMGetModuleContext(M);32323233LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler, &retval);32343235/* Compile IR*/3236if (!radv_compile_to_elf(ac_llvm, M, pelf_buffer, pelf_size))3237retval = 1;3238return retval;3239}32403241static void3242ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, LLVMModuleRef llvm_module,3243struct radv_shader_binary **rbinary, gl_shader_stage stage, const char *name,3244const struct radv_nir_compiler_options *options)3245{3246char *elf_buffer = NULL;3247size_t elf_size = 0;3248char *llvm_ir_string = NULL;32493250if (options->dump_shader) {3251fprintf(stderr, "%s LLVM IR:\n\n", name);3252ac_dump_module(llvm_module);3253fprintf(stderr, "\n");3254}32553256if (options->record_ir) {3257char *llvm_ir = LLVMPrintModuleToString(llvm_module);3258llvm_ir_string = strdup(llvm_ir);3259LLVMDisposeMessage(llvm_ir);3260}32613262int v = radv_llvm_compile(llvm_module, &elf_buffer, &elf_size, ac_llvm);3263if (v) {3264fprintf(stderr, "compile failed\n");3265}32663267LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);3268LLVMDisposeModule(llvm_module);3269LLVMContextDispose(ctx);32703271size_t llvm_ir_size = llvm_ir_string ? strlen(llvm_ir_string) : 0;3272size_t alloc_size = sizeof(struct radv_shader_binary_rtld) + elf_size + llvm_ir_size + 1;3273struct radv_shader_binary_rtld *rbin = calloc(1, alloc_size);3274memcpy(rbin->data, elf_buffer, elf_size);3275if (llvm_ir_string)3276memcpy(rbin->data + elf_size, llvm_ir_string, llvm_ir_size + 1);32773278rbin->base.type = RADV_BINARY_TYPE_RTLD;3279rbin->base.stage = stage;3280rbin->base.total_size = alloc_size;3281rbin->elf_size = elf_size;3282rbin->llvm_ir_size = llvm_ir_size;3283*rbinary = &rbin->base;32843285free(llvm_ir_string);3286free(elf_buffer);3287}32883289static void3290radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, struct radv_shader_binary **rbinary,3291const struct radv_shader_args *args, struct nir_shader *const *nir,3292int nir_count)3293{32943295LLVMModuleRef llvm_module;32963297llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, args);32983299ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, nir[nir_count - 1]->info.stage,3300radv_get_shader_name(args->shader_info, nir[nir_count - 1]->info.stage),3301args->options);33023303/* Determine the ES type (VS or TES) for the GS on GFX9. */3304if (args->options->chip_class >= GFX9) {3305if (nir_count == 2 && nir[1]->info.stage == MESA_SHADER_GEOMETRY) {3306args->shader_info->gs.es_type = nir[0]->info.stage;3307}3308}3309}33103311static void3312ac_gs_copy_shader_emit(struct radv_shader_context *ctx)3313{3314LLVMValueRef vtx_offset =3315LLVMBuildMul(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),3316LLVMConstInt(ctx->ac.i32, 4, false), "");3317LLVMValueRef stream_id;33183319/* Fetch the vertex stream ID. */3320if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs) {3321stream_id =3322ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config), 24, 2);3323} else {3324stream_id = ctx->ac.i32_0;3325}33263327LLVMBasicBlockRef end_bb;3328LLVMValueRef switch_inst;33293330end_bb = LLVMAppendBasicBlockInContext(ctx->ac.context, ctx->main_function, "end");3331switch_inst = LLVMBuildSwitch(ctx->ac.builder, stream_id, end_bb, 4);33323333for (unsigned stream = 0; stream < 4; stream++) {3334unsigned num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];3335LLVMBasicBlockRef bb;3336unsigned offset;33373338if (stream > 0 && !num_components)3339continue;33403341if (stream > 0 && !ctx->args->shader_info->so.num_outputs)3342continue;33433344bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out");3345LLVMAddCase(switch_inst, LLVMConstInt(ctx->ac.i32, stream, 0), bb);3346LLVMPositionBuilderAtEnd(ctx->ac.builder, bb);33473348offset = 0;3349for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {3350unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];3351unsigned output_stream = ctx->args->shader_info->gs.output_streams[i];3352int length = util_last_bit(output_usage_mask);33533354if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)3355continue;33563357for (unsigned j = 0; j < length; j++) {3358LLVMValueRef value, soffset;33593360if (!(output_usage_mask & (1 << j)))3361continue;33623363soffset = LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out * 16 * 4,3364false);33653366offset++;33673368value = ac_build_buffer_load(&ctx->ac, ctx->gsvs_ring[0], 1, ctx->ac.i32_0, vtx_offset,3369soffset, 0, ctx->ac.f32, ac_glc | ac_slc, true, false);33703371LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);3372if (ac_get_type_size(type) == 2) {3373value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->ac.i32, "");3374value = LLVMBuildTrunc(ctx->ac.builder, value, ctx->ac.i16, "");3375}33763377LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, value),3378ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);3379}3380}33813382if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs)3383radv_emit_streamout(ctx, stream);33843385if (stream == 0) {3386handle_vs_outputs_post(ctx, false, true, &ctx->args->shader_info->vs.outinfo);3387}33883389LLVMBuildBr(ctx->ac.builder, end_bb);3390}33913392LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb);3393}33943395static void3396radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader *geom_shader,3397struct radv_shader_binary **rbinary,3398const struct radv_shader_args *args)3399{3400struct radv_shader_context ctx = {0};3401ctx.args = args;34023403assert(args->is_gs_copy_shader);34043405ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family,3406args->options->info, AC_FLOAT_MODE_DEFAULT, 64, 64);3407ctx.context = ctx.ac.context;34083409ctx.stage = MESA_SHADER_VERTEX;3410ctx.shader = geom_shader;34113412create_function(&ctx, MESA_SHADER_VERTEX, false);34133414ac_setup_rings(&ctx);34153416nir_foreach_shader_out_variable(variable, geom_shader)3417{3418scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);3419ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader, variable, MESA_SHADER_VERTEX);3420}34213422ac_gs_copy_shader_emit(&ctx);34233424LLVMBuildRetVoid(ctx.ac.builder);34253426ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options);34273428ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, MESA_SHADER_VERTEX, "GS Copy Shader",3429args->options);3430(*rbinary)->is_gs_copy_shader = true;3431}34323433void3434llvm_compile_shader(struct radv_device *device, unsigned shader_count,3435struct nir_shader *const *shaders, struct radv_shader_binary **binary,3436struct radv_shader_args *args)3437{3438enum ac_target_machine_options tm_options = 0;3439struct ac_llvm_compiler ac_llvm;34403441tm_options |= AC_TM_SUPPORTS_SPILL;3442if (args->options->check_ir)3443tm_options |= AC_TM_CHECK_IR;34443445radv_init_llvm_compiler(&ac_llvm, args->options->family, tm_options,3446args->shader_info->wave_size);34473448if (args->is_gs_copy_shader) {3449radv_compile_gs_copy_shader(&ac_llvm, *shaders, binary, args);3450} else {3451radv_compile_nir_shader(&ac_llvm, binary, args, shaders, shader_count);3452}3453}345434553456