/* Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c
 * 4570 views */
/*1* Copyright 2020 Advanced Micro Devices, Inc.2* All Rights Reserved.3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* on the rights to use, copy, modify, merge, publish, distribute, sub8* license, and/or sell copies of the Software, and to permit persons to whom9* the Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL18* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,19* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR20* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE21* USE OR OTHER DEALINGS IN THE SOFTWARE.22*/2324#include "si_pipe.h"25#include "si_shader_internal.h"26#include "sid.h"2728/**29* Return a value that is equal to the given i32 \p index if it lies in [0,num)30* or an undefined value in the same interval otherwise.31*/32static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, LLVMValueRef index,33unsigned num)34{35LLVMBuilderRef builder = ctx->ac.builder;36LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0);37LLVMValueRef cc;3839if (util_is_power_of_two_or_zero(num)) {40index = LLVMBuildAnd(builder, index, c_max, "");41} else {42/* In theory, this MAX pattern should result in code that is43* as good as the bit-wise AND above.44*45* In practice, LLVM generates worse code (at the time of46* writing), because its value tracking is not strong enough.47*/48cc = 
LLVMBuildICmp(builder, LLVMIntULE, index, c_max, "");49index = LLVMBuildSelect(builder, cc, index, c_max, "");50}5152return index;53}5455static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx)56{57LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);58struct si_shader_selector *sel = ctx->shader->selector;5960/* Do the bounds checking with a descriptor, because61* doing computation and manual bounds checking of 64-bit62* addresses generates horrible VALU code with very high63* VGPR usage and very low SIMD occupancy.64*/65ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");6667LLVMValueRef desc0, desc1;68desc0 = ptr;69desc1 = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);7071uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |72S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);7374if (ctx->screen->info.chip_class >= GFX10)75rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |76S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);77else78rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |79S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);8081LLVMValueRef desc_elems[] = {desc0, desc1,82LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0),83LLVMConstInt(ctx->ac.i32, rsrc3, false)};8485return ac_build_gather_values(&ctx->ac, desc_elems, 4);86}8788static LLVMValueRef load_ubo(struct ac_shader_abi *abi,89unsigned desc_set, unsigned binding,90bool valid_binding, LLVMValueRef index)91{92struct si_shader_context *ctx = si_shader_context_from_abi(abi);93struct si_shader_selector *sel = ctx->shader->selector;9495LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);9697if (sel->info.base.num_ubos == 1 && sel->info.base.num_ssbos == 0) {98return load_const_buffer_desc_fast_path(ctx);99}100101index = si_llvm_bound_index(ctx, index, 
ctx->num_const_buffers);102index =103LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), "");104105return ac_build_load_to_sgpr(&ctx->ac, ptr, index);106}107108static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write, bool non_uniform)109{110struct si_shader_context *ctx = si_shader_context_from_abi(abi);111112/* Fast path if the shader buffer is in user SGPRs. */113if (LLVMIsConstant(index) &&114LLVMConstIntGetZExtValue(index) < ctx->shader->selector->cs_num_shaderbufs_in_user_sgprs)115return ac_get_arg(&ctx->ac, ctx->cs_shaderbuf[LLVMConstIntGetZExtValue(index)]);116117LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);118119index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);120index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0),121index, "");122123return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);124}125126/**127* Given a 256-bit resource descriptor, force the DCC enable bit to off.128*129* At least on Tonga, executing image stores on images with DCC enabled and130* non-trivial can eventually lead to lockups. This can occur when an131* application binds an image as read-only but then uses a shader that writes132* to it. 
The OpenGL spec allows almost arbitrarily bad behavior (including133* program termination) in this case, but it doesn't cost much to be a bit134* nicer: disabling DCC in the shader still leads to undefined results but135* avoids the lockup.136*/137static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rsrc)138{139if (ctx->screen->info.chip_class <= GFX7) {140return rsrc;141} else {142LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0);143LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0);144LLVMValueRef tmp;145146tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");147tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");148return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");149}150}151152/* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should153* adjust "index" to point to FMASK. */154static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list,155LLVMValueRef index, enum ac_descriptor_type desc_type,156bool uses_store, bool bindless)157{158LLVMBuilderRef builder = ctx->ac.builder;159LLVMValueRef rsrc;160161if (desc_type == AC_DESC_BUFFER) {162index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);163list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");164} else {165assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK);166}167168if (bindless)169rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);170else171rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);172173if (desc_type == AC_DESC_IMAGE && uses_store && ctx->ac.chip_class <= GFX9)174rsrc = force_dcc_off(ctx, rsrc);175return rsrc;176}177178/**179* Load an image view, fmask view. 
or sampler state descriptor.180*/181static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, LLVMValueRef list,182LLVMValueRef index, enum ac_descriptor_type type)183{184LLVMBuilderRef builder = ctx->ac.builder;185186switch (type) {187case AC_DESC_IMAGE:188/* The image is at [0:7]. */189index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), "");190break;191case AC_DESC_BUFFER:192/* The buffer is in [4:7]. */193index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1);194list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");195break;196case AC_DESC_FMASK:197/* The FMASK is at [8:15]. */198index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);199break;200case AC_DESC_SAMPLER:201/* The sampler state is at [12:15]. */202index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),203LLVMConstInt(ctx->ac.i32, 3, 0));204list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");205break;206case AC_DESC_PLANE_0:207case AC_DESC_PLANE_1:208case AC_DESC_PLANE_2:209/* Only used for the multiplane image support for Vulkan. 
Should210* never be reached in radeonsi.211*/212unreachable("Plane descriptor requested in radeonsi.");213}214215return ac_build_load_to_sgpr(&ctx->ac, list, index);216}217218static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set,219unsigned base_index, unsigned constant_index,220LLVMValueRef dynamic_index,221enum ac_descriptor_type desc_type, bool image,222bool write, bool bindless)223{224struct si_shader_context *ctx = si_shader_context_from_abi(abi);225LLVMBuilderRef builder = ctx->ac.builder;226unsigned const_index = base_index + constant_index;227228assert(!descriptor_set);229assert(desc_type <= AC_DESC_BUFFER);230231if (bindless) {232LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images);233234/* dynamic_index is the bindless handle */235if (image) {236/* Bindless image descriptors use 16-dword slots. */237dynamic_index =238LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");239/* FMASK is right after the image. */240if (desc_type == AC_DESC_FMASK) {241dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, ctx->ac.i32_1, "");242}243244return si_load_image_desc(ctx, list, dynamic_index, desc_type, write, true);245}246247/* Since bindless handle arithmetic can contain an unsigned integer248* wraparound and si_load_sampler_desc assumes there isn't any,249* use GEP without "inbounds" (inside ac_build_pointer_add)250* to prevent incorrect code generation and hangs.251*/252dynamic_index =253LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");254list = ac_build_pointer_add(&ctx->ac, list, dynamic_index);255return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type);256}257258unsigned num_slots = image ? 
ctx->num_images : ctx->num_samplers;259assert(const_index < num_slots || dynamic_index);260261LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images);262LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false);263264if (dynamic_index) {265index = LLVMBuildAdd(builder, index, dynamic_index, "");266267/* From the GL_ARB_shader_image_load_store extension spec:268*269* If a shader performs an image load, store, or atomic270* operation using an image variable declared as an array,271* and if the index used to select an individual element is272* negative or greater than or equal to the size of the273* array, the results of the operation are undefined but may274* not lead to termination.275*/276index = si_llvm_bound_index(ctx, index, num_slots);277}278279if (image) {280/* Fast path if the image is in user SGPRs. */281if (!dynamic_index &&282const_index < ctx->shader->selector->cs_num_images_in_user_sgprs &&283(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER))284return ac_get_arg(&ctx->ac, ctx->cs_image[const_index]);285286/* FMASKs are separate from images. */287if (desc_type == AC_DESC_FMASK) {288index =289LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), "");290}291index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0),292index, "");293return si_load_image_desc(ctx, list, index, desc_type, write, false);294}295296index = LLVMBuildAdd(ctx->ac.builder, index,297LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), "");298return si_load_sampler_desc(ctx, list, index, desc_type);299}300301void si_llvm_init_resource_callbacks(struct si_shader_context *ctx)302{303ctx->abi.load_ubo = load_ubo;304ctx->abi.load_ssbo = load_ssbo;305ctx->abi.load_sampler_desc = si_nir_load_sampler_desc;306}307308309