Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "util/u_memory.h"

static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index)
{
   assert(index <= 1);

   if (index == 1)
      return LLVMBuildAShr(ctx->ac.builder, i32, LLVMConstInt(ctx->ac.i32, 16, 0), "");

   return LLVMBuildSExt(ctx->ac.builder, LLVMBuildTrunc(ctx->ac.builder, i32, ctx->ac.i16, ""),
                        ctx->ac.i32, "");
}

static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4])
{
   const struct si_shader_info *info = &ctx->shader->selector->info;
   unsigned vs_blit_property = info->base.vs.blit_sgprs_amd;

   if (vs_blit_property) {
      LLVMValueRef vertex_id = ctx->abi.vertex_id;
      LLVMValueRef sel_x1 =
         LLVMBuildICmp(ctx->ac.builder, LLVMIntULE, vertex_id, ctx->ac.i32_1, "");
      /* Use LLVMIntNE, because we have 3 vertices and only
       * the middle one should use y2.
       */
      LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, vertex_id, ctx->ac.i32_1, "");

      unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
      if (input_index == 0) {
         /* Position: */
         LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs);
         LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 1);

         LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
         LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
         LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
         LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);

         LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
         LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");

         out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
         out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
         out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 2);
         out[3] = ctx->ac.f32_1;
         return;
      }

      /* Color or texture coordinates: */
      assert(input_index == 1);

      if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
         for (int i = 0; i < 4; i++) {
            out[i] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3 + i);
         }
      } else {
         assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
         LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3);
         LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 4);
         LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 5);
         LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 6);

         out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
         out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");
         out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 7);
         out[3] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 8);
      }
      return;
   }
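
   /* Regular (non-blit) vertex fetch: load the attribute from its vertex
    * buffer descriptor (taken from user SGPRs when available), applying any
    * format fixups requested through the shader key (fix_fetch / opencode).
    */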
   /* Set can_speculate=false to help keep all loads grouped together
    * for better latency hiding. If it were true, LLVM could move the loads
    * forward and accidentally double memory latency by doing:
    *
    *    buffer_load_dword_xyzw
    *    s_waitcnt vmcnt(0)
    *    buffer_load_dword_xyzw
    *    s_waitcnt vmcnt(0)
    *
    * ... which is what we must prevent at all costs.
    */
   const bool can_speculate = false;
   unsigned bit_size = info->input_fp16_lo_hi_valid[input_index] & 0x1 ? 16 : 32;
   LLVMTypeRef int_type = bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32;
   LLVMTypeRef float_type = bit_size == 16 ? ctx->ac.f16 : ctx->ac.f32;
   unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
   union si_vs_fix_fetch fix_fetch;
   LLVMValueRef vb_desc;
   LLVMValueRef vertex_index;
   LLVMValueRef tmp;

   if (input_index < num_vbos_in_user_sgprs) {
      vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
   } else {
      unsigned index = input_index - num_vbos_in_user_sgprs;
      vb_desc = ac_build_load_to_sgpr(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.vertex_buffers),
                                      LLVMConstInt(ctx->ac.i32, index, 0));
   }

   vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index);

   /* Use the open-coded implementation for all loads of doubles and
    * of dword-sized data that needs fixups. We need to insert conversion
    * code anyway, and the amd/common code does it for us.
    */
   bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
   fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
   if (opencode || (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
       (fix_fetch.u.log_size == 2)) {
      tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size,
                                           fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format,
                                           fix_fetch.u.reverse, !opencode, vb_desc, vertex_index,
                                           ctx->ac.i32_0, ctx->ac.i32_0, 0, can_speculate);
      for (unsigned i = 0; i < 4; ++i)
         out[i] =
            LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");

      if (bit_size == 16) {
         if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT ||
             fix_fetch.u.format == AC_FETCH_FORMAT_SINT) {
            for (unsigned i = 0; i < 4; i++)
               out[i] = LLVMBuildTrunc(ctx->ac.builder, out[i], ctx->ac.i16, "");
         } else {
            for (unsigned i = 0; i < 4; i++) {
               out[i] = ac_to_float(&ctx->ac, out[i]);
               out[i] = LLVMBuildFPTrunc(ctx->ac.builder, out[i], ctx->ac.f16, "");
            }
         }
      }
      return;
   }
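
   /* Only the components the shader actually reads need to be fetched:
    * input_usage_mask has one bit per component, so util_last_bit() yields
    * the number of channels counted from X up to the highest used one.
    */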
   unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
   if (required_channels == 0) {
      for (unsigned i = 0; i < 4; ++i)
         out[i] = LLVMGetUndef(ctx->ac.f32);
      return;
   }

   /* Do multiple loads for special formats. */
   LLVMValueRef fetches[4];
   unsigned num_fetches;
   unsigned fetch_stride;
   unsigned channels_per_fetch;

   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
      num_fetches = MIN2(required_channels, 3);
      fetch_stride = 1 << fix_fetch.u.log_size;
      channels_per_fetch = 1;
   } else {
      num_fetches = 1;
      fetch_stride = 0;
      channels_per_fetch = required_channels;
   }

   for (unsigned i = 0; i < num_fetches; ++i) {
      LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
      fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
                                               channels_per_fetch, 0, can_speculate,
                                               bit_size == 16, false);
   }

   if (num_fetches == 1 && channels_per_fetch > 1) {
      LLVMValueRef fetch = fetches[0];
      for (unsigned i = 0; i < channels_per_fetch; ++i) {
         tmp = LLVMConstInt(ctx->ac.i32, i, false);
         fetches[i] = LLVMBuildExtractElement(ctx->ac.builder, fetch, tmp, "");
      }
      num_fetches = channels_per_fetch;
      channels_per_fetch = 1;
   }

   for (unsigned i = num_fetches; i < 4; ++i)
      fetches[i] = LLVMGetUndef(float_type);

   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && required_channels == 4) {
      if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
         fetches[3] = LLVMConstInt(int_type, 1, 0);
      else
         fetches[3] = LLVMConstReal(float_type, 1);
   } else if (fix_fetch.u.log_size == 3 &&
              (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
              required_channels == 4) {

      /* For 2_10_10_10, the hardware returns an unsigned value;
       * convert it to a signed one.
       */
      LLVMValueRef tmp = fetches[3];
      LLVMValueRef c30 = LLVMConstInt(int_type, 30, 0);

      /* First, recover the sign-extended signed integer value. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
         tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, int_type, "");
      else
         tmp = ac_to_integer(&ctx->ac, tmp);

      /* For the integer-like cases, do a natural sign extension.
       *
       * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
       * and happen to contain 0, 1, 2, 3 as the two LSBs of the
       * exponent.
       */
      tmp = LLVMBuildShl(
         ctx->ac.builder, tmp,
         fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(int_type, 7, 0) : c30, "");
      tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
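      /* tmp now holds the sign-extended 2-bit W value. E.g. for the
       * integer-like formats with 32-bit lanes, a raw W of 3 becomes
       * 3 << 30 = 0xC0000000, and the arithmetic shift right by 30
       * yields -1 (a raw 2 yields -2).
       */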

      /* Convert back to the right type. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
         LLVMValueRef clamp;
         LLVMValueRef neg_one = LLVMConstReal(float_type, -1.0);
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, float_type, "");
         clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
         tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
      } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, float_type, "");
      }

      fetches[3] = tmp;
   }

   for (unsigned i = 0; i < 4; ++i)
      out[i] = ac_to_float(&ctx->ac, fetches[i]);
}

void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir)
{
   const struct si_shader_info *info = &ctx->shader->selector->info;

   for (unsigned i = 0; i < info->num_inputs; i++) {
      LLVMValueRef values[4];

      load_input_vs(ctx, i, values);

      for (unsigned chan = 0; chan < 4; chan++)
         ctx->inputs[i * 4 + chan] = ac_to_integer(&ctx->ac, values[chan]);
   }
}
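
/* Store one shader output (num_components consecutive components, starting at
 * start_component) to its streamout buffer as dwords at the precomputed
 * per-buffer write offset.
 */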
void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
                                    LLVMValueRef const *so_write_offsets,
                                    struct pipe_stream_output *stream_out,
                                    struct si_shader_output_values *shader_out)
{
   unsigned buf_idx = stream_out->output_buffer;
   unsigned start = stream_out->start_component;
   unsigned num_comps = stream_out->num_components;
   LLVMValueRef out[4];

   assert(num_comps && num_comps <= 4);
   if (!num_comps || num_comps > 4)
      return;

   /* Load the output as int. */
   for (int j = 0; j < num_comps; j++) {
      assert(stream_out->stream == shader_out->vertex_stream[start + j]);

      out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
   }

   /* Pack the output. */
   LLVMValueRef vdata = NULL;

   switch (num_comps) {
   case 1: /* as i32 */
      vdata = out[0];
      break;
   case 2: /* as v2i32 */
   case 3: /* as v3i32 */
      if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
         vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
         break;
      }
      /* as v4i32 (aligned to 4) */
      out[3] = LLVMGetUndef(ctx->ac.i32);
      FALLTHROUGH;
   case 4: /* as v4i32 */
      vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
      break;
   }

   ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], vdata, num_comps,
                               so_write_offsets[buf_idx], ctx->ac.i32_0, stream_out->dst_offset * 4,
                               ac_glc | ac_slc);
}

/**
 * Write streamout data to buffers for vertex stream @p stream (different
 * vertex streams can occur for GS copy shaders).
 */
void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
                            unsigned noutput, unsigned stream)
{
   struct si_shader_selector *sel = ctx->shader->selector;
   struct pipe_stream_output_info *so = &sel->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   int i;

   /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
   LLVMValueRef so_vtx_count = si_unpack_param(ctx, ctx->args.streamout_config, 16, 7);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

   /* can_emit = tid < so_vtx_count; */
   LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

   /* Emit the streamout code conditionally. This actually avoids
    * out-of-bounds buffer access. The hw tells us via the SGPR
    * (so_vtx_count) which threads are allowed to emit streamout data.
    */
   ac_build_ifcc(&ctx->ac, can_emit, 6501);
   {
      /* The buffer offset is computed as follows:
       *   ByteOffset = streamout_offset[buffer_id]*4 +
       *                (streamout_write_index + thread_id)*stride[buffer_id] +
       *                attrib_offset
       */

      LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args.streamout_write_index);

      /* Compute (streamout_write_index + thread_id). */
      so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

      /* Load the descriptor and compute the write offset for each
       * enabled buffer. */
      LLVMValueRef so_write_offset[4] = {};
      LLVMValueRef so_buffers[4];
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);

      for (i = 0; i < 4; i++) {
         if (!so->stride[i])
            continue;

         LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + i, 0);

         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

         LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args.streamout_offset[i]);
         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

         so_write_offset[i] = ac_build_imad(
            &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, so->stride[i] * 4, 0), so_offset);
      }

      /* Write streamout data. */
      for (i = 0; i < so->num_outputs; i++) {
         unsigned reg = so->output[i].register_index;

         if (reg >= noutput)
            continue;

         if (stream != so->output[i].stream)
            continue;

         si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, &so->output[i],
                                        &outputs[reg]);
      }
   }
   ac_build_endif(&ctx->ac, 6501);
}
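
/* Compute the user clip distances from gl_ClipVertex: dot the clip-vertex
 * value (out_elts) with each enabled user clip plane loaded from the
 * SI_VS_CONST_CLIP_PLANES constant buffer, and fill the two clip-distance
 * position exports (POS+2 and POS+3).
 */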
static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, struct ac_export_args *pos,
                                    LLVMValueRef *out_elts)
{
   unsigned reg_index;
   unsigned chan;
   unsigned const_chan;
   LLVMValueRef base_elt;
   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
   LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0);
   LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
   unsigned clipdist_mask = ctx->shader->selector->clipdist_mask &
                            ~ctx->shader->key.opt.kill_clip_distances;

   for (reg_index = 0; reg_index < 2; reg_index++) {
      struct ac_export_args *args = &pos[2 + reg_index];

      if (!(clipdist_mask & BITFIELD_RANGE(reg_index * 4, 4)))
         continue;

      args->out[0] = args->out[1] = args->out[2] = args->out[3] = LLVMGetUndef(ctx->ac.f32);

      /* Compute dot products of position and user clip plane vectors */
      for (chan = 0; chan < 4; chan++) {
         if (!(clipdist_mask & BITFIELD_BIT(reg_index * 4 + chan)))
            continue;

         for (const_chan = 0; const_chan < 4; const_chan++) {
            LLVMValueRef addr =
               LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + const_chan) * 4, 0);
            base_elt = si_buffer_load_const(ctx, const_resource, addr);
            args->out[chan] =
               ac_build_fmad(&ctx->ac, base_elt, out_elts[const_chan],
                             const_chan == 0 ? ctx->ac.f32_0 : args->out[chan]);
         }
      }

      args->enabled_channels = 0xf;
      args->valid_mask = 0;
      args->done = 0;
      args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
      args->compr = 0;
   }
}

/* Initialize arguments for the shader export intrinsic */
static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, const LLVMValueRef *values,
                                        unsigned target, struct ac_export_args *args)
{
   args->enabled_channels = 0xf; /* writemask - default is 0xf */
   args->valid_mask = 0;         /* Specify whether the EXEC mask represents the valid mask */
   args->done = 0;               /* Specify whether this is the last export */
   args->target = target;        /* Specify the target we are exporting */
   args->compr = false;

   memcpy(&args->out[0], values, sizeof(values[0]) * 4);
}
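
/* Assign consecutive PARAM export slots to the outputs that can be consumed
 * as pixel shader inputs, skipping outputs that belong to no component of
 * vertex stream 0 and outputs killed via the shader key, and record the slot
 * of each output in vs_output_param_offset[].
 */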
static void si_prepare_param_exports(struct si_shader_context *ctx,
                                     const struct si_shader_output_values *outputs, unsigned noutput,
                                     struct ac_export_args exports[32])
{
   struct si_shader *shader = ctx->shader;
   unsigned param_count = 0;

   for (unsigned i = 0; i < noutput; i++) {
      unsigned semantic = outputs[i].semantic;

      if (outputs[i].vertex_stream[0] != 0 && outputs[i].vertex_stream[1] != 0 &&
          outputs[i].vertex_stream[2] != 0 && outputs[i].vertex_stream[3] != 0)
         continue;

      switch (semantic) {
      case VARYING_SLOT_LAYER:
      case VARYING_SLOT_VIEWPORT:
      case VARYING_SLOT_CLIP_DIST0:
      case VARYING_SLOT_CLIP_DIST1:
      case VARYING_SLOT_COL0:
      case VARYING_SLOT_COL1:
      case VARYING_SLOT_BFC0:
      case VARYING_SLOT_BFC1:
      case VARYING_SLOT_PRIMITIVE_ID:
      case VARYING_SLOT_FOGC:
         break;
      default:
         if ((semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7) ||
             semantic >= VARYING_SLOT_VAR0)
            break;
         else
            continue;
      }

      if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) &&
          shader->key.opt.kill_outputs &
          (1ull << si_shader_io_get_unique_index(semantic, true)))
         continue;

      si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_PARAM + param_count,
                                  &exports[param_count]);

      assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
      shader->info.vs_output_param_offset[i] = param_count++;
   }

   shader->info.nr_param_exports = param_count;
}

/**
 * Vertex color clamping.
 *
 * This uses a state constant loaded from a user data SGPR; if the constant
 * is true, an IF statement is added that clamps all colors.
 */
static void si_vertex_color_clamping(struct si_shader_context *ctx,
                                     struct si_shader_output_values *outputs, unsigned noutput)
{
   LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
   bool has_colors = false;

   /* Store original colors to alloca variables. */
   for (unsigned i = 0; i < noutput; i++) {
      if (outputs[i].semantic != VARYING_SLOT_COL0 &&
          outputs[i].semantic != VARYING_SLOT_COL1 &&
          outputs[i].semantic != VARYING_SLOT_BFC0 &&
          outputs[i].semantic != VARYING_SLOT_BFC1)
         continue;

      for (unsigned j = 0; j < 4; j++)
         addr[i][j] = ac_build_alloca_init(&ctx->ac, outputs[i].values[j], "");

      has_colors = true;
   }

   if (!has_colors)
      return;

   /* The state is in the first bit of the user SGPR. */
   LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
   cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");

   ac_build_ifcc(&ctx->ac, cond, 6502);

   /* Store clamped colors to alloca variables within the conditional block. */
   for (unsigned i = 0; i < noutput; i++) {
      if (outputs[i].semantic != VARYING_SLOT_COL0 &&
          outputs[i].semantic != VARYING_SLOT_COL1 &&
          outputs[i].semantic != VARYING_SLOT_BFC0 &&
          outputs[i].semantic != VARYING_SLOT_BFC1)
         continue;

      for (unsigned j = 0; j < 4; j++) {
         LLVMBuildStore(ctx->ac.builder, ac_build_clamp(&ctx->ac, outputs[i].values[j]),
                        addr[i][j]);
      }
   }
   ac_build_endif(&ctx->ac, 6502);

   /* Load clamped colors */
   for (unsigned i = 0; i < noutput; i++) {
      if (outputs[i].semantic != VARYING_SLOT_COL0 &&
          outputs[i].semantic != VARYING_SLOT_COL1 &&
          outputs[i].semantic != VARYING_SLOT_BFC0 &&
          outputs[i].semantic != VARYING_SLOT_BFC1)
         continue;

      for (unsigned j = 0; j < 4; j++) {
         outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
      }
   }
}

/* Generate export instructions for hardware VS shader stage or NGG GS stage
 * (position and parameter data only).
 */
void si_llvm_build_vs_exports(struct si_shader_context *ctx,
                              struct si_shader_output_values *outputs, unsigned noutput)
{
   struct si_shader *shader = ctx->shader;
   struct ac_export_args pos_args[4] = {};
   LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL,
                viewport_index_value = NULL;
   unsigned pos_idx, index;
   unsigned clipdist_mask = (shader->selector->clipdist_mask &
                             ~shader->key.opt.kill_clip_distances) |
                            shader->selector->culldist_mask;
   int i;

   si_vertex_color_clamping(ctx, outputs, noutput);

   struct ac_export_args param_exports[32];
   si_prepare_param_exports(ctx, outputs, noutput, param_exports);

   /* Build position exports. */
   for (i = 0; i < noutput; i++) {
      switch (outputs[i].semantic) {
      case VARYING_SLOT_POS:
         si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS, &pos_args[0]);
         break;
      case VARYING_SLOT_PSIZ:
         psize_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_LAYER:
         layer_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_VIEWPORT:
         viewport_index_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_EDGE:
         edgeflag_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_CLIP_DIST0:
      case VARYING_SLOT_CLIP_DIST1:
         index = outputs[i].semantic - VARYING_SLOT_CLIP_DIST0;
         if (clipdist_mask & BITFIELD_RANGE(index * 4, 4)) {
            si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS + 2 + index,
                                        &pos_args[2 + index]);
         }
         break;
      case VARYING_SLOT_CLIP_VERTEX:
         si_llvm_emit_clipvertex(ctx, pos_args, outputs[i].values);
         break;
      }
   }

   /* We need to add the position output manually if it's missing. */
   if (!pos_args[0].out[0]) {
      pos_args[0].enabled_channels = 0xf; /* writemask */
      pos_args[0].valid_mask = 0;         /* EXEC mask */
      pos_args[0].done = 0;               /* last export? */
      pos_args[0].target = V_008DFC_SQ_EXP_POS;
      pos_args[0].compr = 0;              /* COMPR flag */
      pos_args[0].out[0] = ctx->ac.f32_0; /* X */
      pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[0].out[3] = ctx->ac.f32_1; /* W */
   }

   bool writes_psize = shader->selector->info.writes_psize && !shader->key.opt.kill_pointsize;
   bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && !shader->key.as_ngg;
   bool writes_vrs = ctx->screen->options.vrs2x2;

   /* Write the misc vector (point size, edgeflag, layer, viewport). */
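   /* Channel mapping for the POS+1 export: X = point size, Y = edge flag and
    * VRS rates, Z = layer (with the viewport index packed into bits 19:16 on
    * GFX9 and newer), W = viewport index on GFX8 and older.
    */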
   if (writes_psize || pos_writes_edgeflag || writes_vrs ||
       shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) {
      pos_args[1].enabled_channels = writes_psize |
                                     ((pos_writes_edgeflag | writes_vrs) << 1) |
                                     (shader->selector->info.writes_layer << 2);

      pos_args[1].valid_mask = 0;         /* EXEC mask */
      pos_args[1].done = 0;               /* last export? */
      pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
      pos_args[1].compr = 0;              /* COMPR flag */
      pos_args[1].out[0] = ctx->ac.f32_0; /* X */
      pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[1].out[3] = ctx->ac.f32_0; /* W */

      if (writes_psize)
         pos_args[1].out[0] = psize_value;

      if (pos_writes_edgeflag) {
         /* The output is a float, but the hw expects an integer
          * with the first bit containing the edge flag. */
         edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, edgeflag_value, ctx->ac.i32, "");
         edgeflag_value = ac_build_umin(&ctx->ac, edgeflag_value, ctx->ac.i32_1);

         /* The LLVM intrinsic expects a float. */
         pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
      }

      if (writes_vrs) {
         /* Bits [2:3] = VRS rate X
          * Bits [4:5] = VRS rate Y
          *
          * The range is [-2, 1]. Values:
          *    1: 2x coarser shading rate in that direction.
          *    0: normal shading rate
          *   -1: 2x finer shading rate (sample shading, not directional)
          *   -2: 4x finer shading rate (sample shading, not directional)
          *
          * Sample shading can't go above 8 samples, so both numbers can't be -2
          * at the same time.
          */
         LLVMValueRef rates = LLVMConstInt(ctx->ac.i32, (1 << 2) | (1 << 4), 0);

         /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
         rates = LLVMBuildSelect(ctx->ac.builder,
                                 LLVMBuildFCmp(ctx->ac.builder, LLVMRealUNE,
                                               pos_args[0].out[3], ctx->ac.f32_1, ""),
                                 rates, ctx->ac.i32_0, "");

         LLVMValueRef v = ac_to_integer(&ctx->ac, pos_args[1].out[1]);
         v = LLVMBuildOr(ctx->ac.builder, v, rates, "");
         pos_args[1].out[1] = ac_to_float(&ctx->ac, v);
      }

      if (ctx->screen->info.chip_class >= GFX9) {
         /* GFX9 has the layer in out.z[10:0] and the viewport
          * index in out.z[19:16].
          */
         if (shader->selector->info.writes_layer)
            pos_args[1].out[2] = layer_value;

         if (shader->selector->info.writes_viewport_index) {
            LLVMValueRef v = viewport_index_value;

            v = ac_to_integer(&ctx->ac, v);
            v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, 0), "");
            v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
            pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
            pos_args[1].enabled_channels |= 1 << 2;
         }
      } else {
         if (shader->selector->info.writes_layer)
            pos_args[1].out[2] = layer_value;

         if (shader->selector->info.writes_viewport_index) {
            pos_args[1].out[3] = viewport_index_value;
            pos_args[1].enabled_channels |= 1 << 3;
         }
      }
   }

   for (i = 0; i < 4; i++)
      if (pos_args[i].out[0])
         shader->info.nr_pos_exports++;

   /* GFX10 (Navi1x) skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
    * Setting valid_mask=1 prevents it and has no other effect.
    */
   if (ctx->screen->info.chip_class == GFX10)
      pos_args[0].valid_mask = 1;
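
   /* Assign consecutive POS0..POSn export targets to the position exports
    * that are present, and mark the last one as DONE.
    */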
   pos_idx = 0;
   for (i = 0; i < 4; i++) {
      if (!pos_args[i].out[0])
         continue;

      /* Specify the target we are exporting */
      pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

      if (pos_idx == shader->info.nr_pos_exports) {
         /* Specify that this is the last export */
         pos_args[i].done = 1;

         /* If a shader has no param exports, rasterization can start before
          * the shader finishes and thus memory stores might not finish before
          * the pixel shader starts.
          *
          * VLOAD is for atomics with return.
          */
         if (ctx->screen->info.chip_class >= GFX10 &&
             !shader->info.nr_param_exports &&
             shader->selector->info.base.writes_memory)
            ac_build_waitcnt(&ctx->ac, AC_WAIT_VLOAD | AC_WAIT_VSTORE);
      }

      ac_build_export(&ctx->ac, &pos_args[i]);
   }

   /* Build parameter exports. */
   for (unsigned i = 0; i < shader->info.nr_param_exports; i++)
      ac_build_export(&ctx->ac, &param_exports[i]);
}

void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct si_shader_output_values *outputs = NULL;
   int i, j;

   assert(!ctx->shader->is_gs_copy_shader);
   assert(info->num_outputs <= max_outputs);

   outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

   for (i = 0; i < info->num_outputs; i++) {
      outputs[i].semantic = info->output_semantic[i];

      for (j = 0; j < 4; j++) {
         outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
         outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
      }
   }

   if (!ctx->screen->use_ngg_streamout && ctx->shader->selector->so.num_outputs)
      si_llvm_emit_streamout(ctx, outputs, i, 0);

   /* Export PrimitiveID. */
   if (ctx->shader->key.mono.u.vs_export_prim_id) {
      outputs[i].semantic = VARYING_SLOT_PRIMITIVE_ID;
      outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
      for (j = 1; j < 4; j++)
         outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);

      memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream));
      i++;
   }

   si_llvm_build_vs_exports(ctx, outputs, i);
   FREE(outputs);
}
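
/* Epilogue used when the VS is compiled as a primitive discard compute
 * shader: instead of exporting, the position output is passed back through
 * the function return value so that the culling code appended after it can
 * consume it.
 */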
static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
                                                  LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   LLVMValueRef pos[4] = {};

   assert(info->num_outputs <= max_outputs);

   for (unsigned i = 0; i < info->num_outputs; i++) {
      if (info->output_semantic[i] != VARYING_SLOT_POS)
         continue;

      for (unsigned chan = 0; chan < 4; chan++)
         pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
      break;
   }
   assert(pos[0] != NULL);

   /* Return the position output. */
   LLVMValueRef ret = ctx->return_value;
   for (unsigned chan = 0; chan < 4; chan++)
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
   ctx->return_value = ret;
}

/**
 * Build the vertex shader prolog function.
 *
 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
 * All inputs are returned unmodified. The vertex load indices are
 * stored after them, which will be used by the API VS for fetching inputs.
 *
 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
 *   input_v0,
 *   input_v1,
 *   input_v2,
 *   input_v3,
 *   (VertexID + BaseVertex),
 *   (InstanceID + StartInstance),
 *   (InstanceID / 2 + StartInstance)
 */
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
   LLVMTypeRef *returns;
   LLVMValueRef ret, func;
   int num_returns, i;
   unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
   unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
   struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
   struct ac_arg input_vgpr_param[10];
   LLVMValueRef input_vgprs[10];
   unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + num_input_vgprs;
   unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;

   memset(&ctx->args, 0, sizeof(ctx->args));

   /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
   returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * sizeof(LLVMTypeRef));
   num_returns = 0;

   /* Declare input and output SGPRs. */
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &input_sgpr_param[i]);
      returns[num_returns++] = ctx->ac.i32;
   }
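
   /* Input SGPR 3 is merged_wave_info: judging by the si_unpack_param calls
    * below, bits [7:0] hold the ES/LS thread count, bits [15:8] the GS/HS
    * thread count, and bits [27:24] the wave index within the threadgroup.
    */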
   struct ac_arg merged_wave_info = input_sgpr_param[3];

   /* Preloaded VGPRs (outputs must be floats) */
   for (i = 0; i < num_input_vgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
      returns[num_returns++] = ctx->ac.f32;
   }

   /* Vertex load indices. */
   for (i = 0; i < key->vs_prolog.num_inputs; i++)
      returns[num_returns++] = ctx->ac.f32;

   /* Create the function. */
   si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
   func = ctx->main_fn;

   for (i = 0; i < num_input_vgprs; i++) {
      input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
   }

   if (key->vs_prolog.num_merged_next_stage_vgprs) {
      if (!key->vs_prolog.is_monolithic)
         ac_init_exec_full_mask(&ctx->ac);

      if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) {
         /* If there are no HS threads, SPI loads the LS VGPRs
          * starting at VGPR 0. Shift them back to where they
          * belong.
          */
         LLVMValueRef has_hs_threads =
            LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
                          si_unpack_param(ctx, input_sgpr_param[3], 8, 8), ctx->ac.i32_0, "");

         for (i = 4; i > 0; --i) {
            input_vgprs[i + 1] = LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
                                                 input_vgprs[i + 1], input_vgprs[i - 1], "");
         }
      }
   }

   /* The culling code stored the LDS addresses of the VGPRs into those VGPRs. Load them. */
   if (key->vs_prolog.load_vgprs_after_culling) {
      for (i = 5; i <= 8; i++) {
         bool is_tes_rel_patch_id = i == 7;
         input_vgprs[i] = LLVMBuildIntToPtr(ctx->ac.builder, input_vgprs[i],
                                            LLVMPointerType(is_tes_rel_patch_id ? ctx->ac.i8 : ctx->ac.i32,
                                                            AC_ADDR_SPACE_LDS), "");
         input_vgprs[i] = LLVMBuildLoad(ctx->ac.builder, input_vgprs[i], "");
         if (is_tes_rel_patch_id)
            input_vgprs[i] = LLVMBuildZExt(ctx->ac.builder, input_vgprs[i], ctx->ac.i32, "");
      }
   }

   if (key->vs_prolog.gs_fast_launch_tri_list || key->vs_prolog.gs_fast_launch_tri_strip) {
      LLVMValueRef wave_id, thread_id_in_tg;

      wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
      thread_id_in_tg =
         ac_build_imad(&ctx->ac, wave_id, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
                       ac_get_thread_id(&ctx->ac));

      /* The GS fast launch initializes all VGPRs to the value of
       * the first thread, so we have to add the thread ID.
       *
       * Only these are initialized by the hw:
       *   VGPR2: Base Primitive ID
       *   VGPR5: Base Vertex ID
       *   VGPR6: Instance ID
       */

      /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
       * The NGG cull shader will read them from there.
       */
      if (key->vs_prolog.gs_fast_launch_tri_list) {
         input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
                                        LLVMConstInt(ctx->ac.i32, 0, 0));
         input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
                                        LLVMConstInt(ctx->ac.i32, 1, 0));
         input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
                                        LLVMConstInt(ctx->ac.i32, 2, 0));
      } else {
         assert(key->vs_prolog.gs_fast_launch_tri_strip);
         LLVMBuilderRef builder = ctx->ac.builder;
         /* Triangle indices: */
         LLVMValueRef index[3] = {
            thread_id_in_tg,
            LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 1, 0), ""),
            LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 2, 0), ""),
         };
         LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, thread_id_in_tg, ctx->ac.i1, "");
         LLVMValueRef flatshade_first = LLVMBuildICmp(
            builder, LLVMIntEQ,
            si_unpack_param(ctx, input_sgpr_param[8 + SI_SGPR_VS_STATE_BITS], 4, 2),
            ctx->ac.i32_0, "");

         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, index);
         input_vgprs[0] = index[0];
         input_vgprs[1] = index[1];
         input_vgprs[4] = index[2];
      }

      /* Triangles always have all edge flags set initially. */
      input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);

      input_vgprs[2] =
         LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], thread_id_in_tg, ""); /* PrimID */
      input_vgprs[5] =
         LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */
      input_vgprs[8] = input_vgprs[6]; /* InstanceID */
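
      /* If the fast launch draw uses an index buffer, the real vertex index
       * still has to be fetched: the 64-bit index buffer address is packed
       * into the first two input SGPRs, and gs_fast_launch_index_size_packed
       * selects the element type (1 = 8-bit, 2 = 16-bit, 3 = 32-bit).
       */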
      if (key->vs_prolog.gs_fast_launch_index_size_packed) {
         LLVMTypeRef index_type = ctx->ac.voidt;

         switch (key->vs_prolog.gs_fast_launch_index_size_packed) {
         case 1:
            index_type = ctx->ac.i8;
            break;
         case 2:
            index_type = ctx->ac.i16;
            break;
         case 3:
            index_type = ctx->ac.i32;
            break;
         default:
            unreachable("invalid gs_fast_launch_index_size_packed");
         }

         LLVMValueRef sgprs[2] = {
            ac_get_arg(&ctx->ac, input_sgpr_param[0]),
            ac_get_arg(&ctx->ac, input_sgpr_param[1]),
         };
         LLVMValueRef indices = ac_build_gather_values(&ctx->ac, sgprs, 2);
         indices = LLVMBuildBitCast(ctx->ac.builder, indices, ctx->ac.i64, "");
         indices = LLVMBuildIntToPtr(ctx->ac.builder, indices,
                                     LLVMPointerType(index_type, AC_ADDR_SPACE_CONST), "");

         LLVMValueRef vertex_id = ac_build_alloca_init(&ctx->ac, input_vgprs[5], "");

         /* if (is ES thread...) */
         ac_build_ifcc(&ctx->ac,
                       LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
                                     si_unpack_param(ctx, merged_wave_info, 0, 8), ""), 0);
         /* VertexID = indexBufferLoad(VertexID); */
         LLVMValueRef index = LLVMBuildGEP(ctx->ac.builder, indices, &input_vgprs[5], 1, "");
         index = LLVMBuildLoad(ctx->ac.builder, index, "");
         index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i32, "");
         LLVMBuildStore(ctx->ac.builder, index, vertex_id);
         ac_build_endif(&ctx->ac, 0);

         input_vgprs[5] = LLVMBuildLoad(ctx->ac.builder, vertex_id, "");
      }
   }

   unsigned vertex_id_vgpr = first_vs_vgpr;
   unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10
                                  ? first_vs_vgpr + 3
                                  : first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);

   ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
   ctx->abi.instance_id = input_vgprs[instance_id_vgpr];

   /* InstanceID = VertexID >> 16;
    * VertexID = VertexID & 0xffff;
    */
   if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
      ctx->abi.instance_id =
         LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");
      ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
                                        LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
   }

   /* Copy inputs to outputs. This should be a no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
   }
   for (i = 0; i < num_input_vgprs; i++) {
      LLVMValueRef p = input_vgprs[i];

      if (i == vertex_id_vgpr)
         p = ctx->abi.vertex_id;
      else if (i == instance_id_vgpr)
         p = ctx->abi.instance_id;

      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, key->vs_prolog.num_input_sgprs + i, "");
   }

   /* Compute vertex load indices from instance divisors. */
   LLVMValueRef instance_divisor_constbuf = NULL;

   if (key->vs_prolog.states.instance_divisor_is_fetched) {
      LLVMValueRef list = si_prolog_get_internal_bindings(ctx);
      LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
      instance_divisor_constbuf = ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
   }
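
   /* Each input's load index is either:
    *   InstanceID / divisor + StartInstance   (instanced attributes), or
    *   VertexID + BaseVertex                  (per-vertex attributes).
    * A divisor of 1 skips the division; other divisors use the precomputed
    * factors fetched from the instance-divisor constant buffer above.
    */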
   for (i = 0; i < key->vs_prolog.num_inputs; i++) {
      bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i);
      bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
      LLVMValueRef index = NULL;

      if (divisor_is_one) {
         index = ctx->abi.instance_id;
      } else if (divisor_is_fetched) {
         LLVMValueRef udiv_factors[4];

         for (unsigned j = 0; j < 4; j++) {
            udiv_factors[j] = si_buffer_load_const(ctx, instance_divisor_constbuf,
                                                   LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0));
            udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
         }
         /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
          * Such an InstanceID might not be achievable in a reasonable time though.
          */
         index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, udiv_factors[0],
                                        udiv_factors[1], udiv_factors[2], udiv_factors[3]);
      }

      if (divisor_is_one || divisor_is_fetched) {
         /* Add StartInstance. */
         index =
            LLVMBuildAdd(ctx->ac.builder, index,
                         LLVMGetParam(ctx->main_fn, user_sgpr_base + SI_SGPR_START_INSTANCE), "");
      } else {
         /* VertexID + BaseVertex */
         index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
                              LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), "");
      }

      index = ac_to_float(&ctx->ac, index);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, ctx->args.arg_count + i, "");
   }

   si_llvm_build_ret(ctx, ret);
}

static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi, bool non_indexed_is_zero)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   /* This doesn't happen with GL: */
   if (!non_indexed_is_zero)
      return ac_get_arg(&ctx->ac, ctx->args.base_vertex);

   /* For non-indexed draws, the base vertex set by the driver
    * (for direct draws) or the CP (for indirect draws) is the
    * first vertex ID, but GLSL expects 0 to be returned.
    */
   LLVMValueRef indexed = si_unpack_param(ctx, ctx->vs_state_bits, 1, 1);
   indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");

   return LLVMBuildSelect(ctx->ac.builder, indexed, ac_get_arg(&ctx->ac, ctx->args.base_vertex),
                          ctx->ac.i32_0, "");
}

void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
{
   struct si_shader *shader = ctx->shader;

   if (shader->key.as_ls)
      ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
   else if (shader->key.as_es)
      ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
   else if (shader->key.opt.vs_as_prim_discard_cs)
      ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
   else if (ngg_cull_shader)
      ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue;
   else if (shader->key.as_ngg)
      ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
   else
      ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;

   ctx->abi.load_base_vertex = get_base_vertex;
}