Path: blob/21.2-virgl/src/amd/vulkan/radv_shader.c
/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "radv_shader.h"
#include "nir/nir.h"
#include "nir/nir_builder.h"
#include "spirv/nir_spirv.h"
#include "util/memstream.h"
#include "util/mesa-sha1.h"
#include "util/u_atomic.h"
#include "radv_debug.h"
#include "radv_private.h"
#include "radv_shader_args.h"
#include "radv_shader_helper.h"

#include "util/debug.h"
#include "ac_binary.h"
#include "ac_exp_param.h"
#include "ac_llvm_util.h"
#include "ac_nir.h"
#include "ac_rtld.h"
#include "aco_interface.h"
#include "sid.h"
#include "vk_format.h"

static const struct nir_shader_compiler_options nir_options = {
   .vertex_id_zero_based = true,
   .lower_scmp = true,
   .lower_flrp16 = true,
   .lower_flrp32 = true,
   .lower_flrp64 = true,
   .lower_device_index_to_zero = true,
   .lower_fdiv = true,
   .lower_fmod = true,
   .lower_ineg = true,
   .lower_bitfield_insert_to_bitfield_select = true,
   .lower_bitfield_extract = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_pack_unorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_64_2x32 = true,
   .lower_pack_64_4x16 = true,
   .lower_pack_32_2x16 = true,
   .lower_unpack_snorm_2x16 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_unpack_unorm_2x16 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_half_2x16 = true,
   .lower_ffma16 = true,
   .lower_ffma32 = true,
   .lower_ffma64 = true,
   .lower_fpow = true,
   .lower_mul_2x32_64 = true,
   .lower_rotate = true,
   .has_fsub = true,
   .has_isub = true,
   .use_scoped_barrier = true,
   .max_unroll_iterations = 32,
   .max_unroll_iterations_aggressive = 128,
   .use_interpolated_input_intrinsics = true,
   .vectorize_vec2_16bit = true,
   /* nir_lower_int64() isn't actually called for the LLVM backend, but
    * this helps the loop unrolling heuristics. */
   .lower_int64_options = nir_lower_imul64 | nir_lower_imul_high64 | nir_lower_imul_2x32_64 |
                          nir_lower_divmod64 | nir_lower_minmax64 | nir_lower_iabs64,
   .lower_doubles_options = nir_lower_drcp | nir_lower_dsqrt | nir_lower_drsq | nir_lower_ddiv,
   .divergence_analysis_options = nir_divergence_view_index_uniform,
};
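
/* These options are consumed both by spirv_to_nir() below and by the NIR
 * passes themselves; e.g. the lower_flrp* bits set here are what trigger the
 * nir_lower_flrp call in radv_optimize_nir(). */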

bool
radv_can_dump_shader(struct radv_device *device, struct vk_shader_module *module,
                     bool meta_shader)
{
   if (!(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS))
      return false;
   if (module)
      return !module->nir || (device->instance->debug_flags & RADV_DEBUG_DUMP_META_SHADERS);

   return meta_shader;
}

bool
radv_can_dump_shader_stats(struct radv_device *device, struct vk_shader_module *module)
{
   /* Only dump non-meta shader stats. */
   return device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS && module && !module->nir;
}

void
radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader,
                  bool optimize_conservatively, bool allow_copies)
{
   bool progress;
   unsigned lower_flrp = (shader->options->lower_flrp16 ? 16 : 0) |
                         (shader->options->lower_flrp32 ? 32 : 0) |
                         (shader->options->lower_flrp64 ? 64 : 0);

   do {
      progress = false;

      NIR_PASS(progress, shader, nir_split_array_vars, nir_var_function_temp);
      NIR_PASS(progress, shader, nir_shrink_vec_array_vars, nir_var_function_temp);

      NIR_PASS_V(shader, nir_lower_vars_to_ssa);

      if (allow_copies) {
         /* Only run this pass in the first call to
          * radv_optimize_nir. Later calls assume that we've
          * lowered away any copy_deref instructions and we
          * don't want to introduce any more.
          */
         NIR_PASS(progress, shader, nir_opt_find_array_copies);
      }

      NIR_PASS(progress, shader, nir_opt_copy_prop_vars);
      NIR_PASS(progress, shader, nir_opt_dead_write_vars);
      NIR_PASS(progress, shader, nir_remove_dead_variables,
               nir_var_function_temp | nir_var_shader_in | nir_var_shader_out, NULL);

      NIR_PASS_V(shader, nir_lower_alu_to_scalar, NULL, NULL);
      NIR_PASS_V(shader, nir_lower_phis_to_scalar, true);

      NIR_PASS(progress, shader, nir_copy_prop);
      NIR_PASS(progress, shader, nir_opt_remove_phis);
      NIR_PASS(progress, shader, nir_opt_dce);
      if (nir_opt_trivial_continues(shader)) {
         progress = true;
         NIR_PASS(progress, shader, nir_copy_prop);
         NIR_PASS(progress, shader, nir_opt_remove_phis);
         NIR_PASS(progress, shader, nir_opt_dce);
      }
      NIR_PASS(progress, shader, nir_opt_if, true);
      NIR_PASS(progress, shader, nir_opt_dead_cf);
      NIR_PASS(progress, shader, nir_opt_cse);
      NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true, true);
      NIR_PASS(progress, shader, nir_opt_constant_folding);
      NIR_PASS(progress, shader, nir_opt_algebraic);

      if (lower_flrp != 0) {
         bool lower_flrp_progress = false;
         NIR_PASS(lower_flrp_progress, shader, nir_lower_flrp, lower_flrp,
                  false /* always_precise */);
         if (lower_flrp_progress) {
            NIR_PASS(progress, shader, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         lower_flrp = 0;
      }

      NIR_PASS(progress, shader, nir_opt_undef);
      NIR_PASS(progress, shader, nir_opt_shrink_vectors,
               !device->instance->disable_shrink_image_store);
      if (shader->options->max_unroll_iterations) {
         NIR_PASS(progress, shader, nir_opt_loop_unroll, 0);
      }
   } while (progress && !optimize_conservatively);

   NIR_PASS(progress, shader, nir_opt_conditional_discard);
   NIR_PASS(progress, shader, nir_opt_move, nir_move_load_ubo);
}
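
/* Illustrative sketch (not driver code): NIR_PASS sets its first argument to
 * true whenever the pass reports progress, so a minimal fixpoint loop in the
 * style of the one above looks like:
 *
 *    bool progress;
 *    do {
 *       progress = false;
 *       NIR_PASS(progress, shader, nir_opt_dce);
 *    } while (progress);
 */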

void
radv_optimize_nir_algebraic(nir_shader *nir, bool opt_offsets)
{
   bool more_algebraic = true;
   while (more_algebraic) {
      more_algebraic = false;
      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_constant_folding);
      NIR_PASS_V(nir, nir_opt_cse);
      NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
   }

   if (opt_offsets)
      NIR_PASS_V(nir, nir_opt_offsets);

   /* Do late algebraic optimization to turn add(a,
    * neg(b)) back into subs, then the mandatory cleanup
    * after algebraic. Note that it may produce fnegs,
    * and if so then we need to keep running to squash
    * fneg(fneg(a)).
    */
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = false;
      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
      NIR_PASS_V(nir, nir_opt_constant_folding);
      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_cse);
   }
}

static void
shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t comp_size = glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
   unsigned length = glsl_get_vector_elements(type);
   *size = comp_size * length, *align = comp_size;
}

struct radv_shader_debug_data {
   struct radv_device *device;
   const struct vk_shader_module *module;
};

static void
radv_spirv_nir_debug(void *private_data, enum nir_spirv_debug_level level, size_t spirv_offset,
                     const char *message)
{
   struct radv_shader_debug_data *debug_data = private_data;
   struct radv_instance *instance = debug_data->device->instance;

   static const VkDebugReportFlagsEXT vk_flags[] = {
      [NIR_SPIRV_DEBUG_LEVEL_INFO] = VK_DEBUG_REPORT_INFORMATION_BIT_EXT,
      [NIR_SPIRV_DEBUG_LEVEL_WARNING] = VK_DEBUG_REPORT_WARNING_BIT_EXT,
      [NIR_SPIRV_DEBUG_LEVEL_ERROR] = VK_DEBUG_REPORT_ERROR_BIT_EXT,
   };
   char buffer[256];

   snprintf(buffer, sizeof(buffer), "SPIR-V offset %lu: %s", (unsigned long)spirv_offset, message);

   vk_debug_report(&instance->vk, vk_flags[level], &debug_data->module->base, 0, 0, "radv", buffer);
}

static void
radv_compiler_debug(void *private_data, enum radv_compiler_debug_level level, const char *message)
{
   struct radv_shader_debug_data *debug_data = private_data;
   struct radv_instance *instance = debug_data->device->instance;

   static const VkDebugReportFlagsEXT vk_flags[] = {
      [RADV_COMPILER_DEBUG_LEVEL_PERFWARN] = VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT,
      [RADV_COMPILER_DEBUG_LEVEL_ERROR] = VK_DEBUG_REPORT_ERROR_BIT_EXT,
   };

   /* VK_DEBUG_REPORT_DEBUG_BIT_EXT specifies diagnostic information
    * from the implementation and layers.
    */
   vk_debug_report(&instance->vk, vk_flags[level] | VK_DEBUG_REPORT_DEBUG_BIT_EXT,
                   &debug_data->module->base, 0, 0, "radv", message);
}

static nir_ssa_def *
convert_pointer_to_64(nir_builder *b, const struct radv_physical_device *pdev, nir_ssa_def *ptr)
{
   nir_ssa_def *comp[] = {ptr, nir_imm_int(b, pdev->rad_info.address32_hi)};
   return nir_pack_64_2x32(b, nir_vec(b, comp, 2));
}
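
/* Lowers the Vulkan descriptor intrinsics into RADV's representation:
 * resource indices become a (set_ptr, binding_ptr, stride) vec3, reindexing
 * advances binding_ptr by index * stride, and acceleration structure
 * descriptors are fetched through a 64-bit global address built with
 * convert_pointer_to_64() above. */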

static bool
lower_intrinsics(nir_shader *nir, const struct radv_pipeline_key *key,
                 const struct radv_pipeline_layout *layout, const struct radv_physical_device *pdev)
{
   nir_function_impl *entry = nir_shader_get_entrypoint(nir);
   bool progress = false;
   nir_builder b;

   nir_builder_init(&b, entry);

   nir_foreach_block (block, entry) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         b.cursor = nir_before_instr(&intrin->instr);

         nir_ssa_def *def = NULL;
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_vulkan_descriptor:
            if (nir_intrinsic_desc_type(intrin) == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR) {
               nir_ssa_def *addr =
                  convert_pointer_to_64(&b, pdev,
                                        nir_iadd(&b, nir_channels(&b, intrin->src[0].ssa, 1),
                                                 nir_channels(&b, intrin->src[0].ssa, 2)));

               def = nir_build_load_global(&b, 1, 64, addr, .access = ACCESS_NON_WRITEABLE,
                                           .align_mul = 8, .align_offset = 0);
            } else {
               def = nir_vec3(&b, nir_channel(&b, intrin->src[0].ssa, 0),
                              nir_channel(&b, intrin->src[0].ssa, 1), nir_imm_int(&b, 0));
            }
            break;
         case nir_intrinsic_vulkan_resource_index: {
            unsigned desc_set = nir_intrinsic_desc_set(intrin);
            unsigned binding = nir_intrinsic_binding(intrin);
            struct radv_descriptor_set_layout *desc_layout = layout->set[desc_set].layout;

            nir_ssa_def *new_res = nir_vulkan_resource_index(
               &b, 3, 32, intrin->src[0].ssa, .desc_set = desc_set, .binding = binding,
               .desc_type = nir_intrinsic_desc_type(intrin));
            nir_ssa_def *set_ptr = nir_channel(&b, new_res, 0);
            nir_ssa_def *binding_ptr = nir_channel(&b, new_res, 1);

            nir_ssa_def *stride;
            if (desc_layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
                desc_layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
               stride = nir_imm_int(&b, 16);
            } else {
               stride = nir_imm_int(&b, desc_layout->binding[binding].size);
            }
            def = nir_vec3(&b, set_ptr, binding_ptr, stride);
            break;
         }
         case nir_intrinsic_vulkan_resource_reindex: {
            nir_ssa_def *set_ptr = nir_channel(&b, intrin->src[0].ssa, 0);
            nir_ssa_def *binding_ptr = nir_channel(&b, intrin->src[0].ssa, 1);
            nir_ssa_def *stride = nir_channel(&b, intrin->src[0].ssa, 2);
            binding_ptr = nir_iadd(&b, binding_ptr, nir_imul(&b, intrin->src[1].ssa, stride));
            def = nir_vec3(&b, set_ptr, binding_ptr, stride);
            break;
         }
         case nir_intrinsic_is_sparse_texels_resident:
            def = nir_ieq_imm(&b, intrin->src[0].ssa, 0);
            break;
         case nir_intrinsic_sparse_residency_code_and:
            def = nir_ior(&b, intrin->src[0].ssa, intrin->src[1].ssa);
            break;
         case nir_intrinsic_load_view_index:
            if (key->has_multiview_view_index)
               continue;
            def = nir_imm_zero(&b, 1, 32);
            break;
         default:
            continue;
         }

         nir_ssa_def_rewrite_uses(&intrin->dest.ssa, def);

         nir_instr_remove(instr);
         progress = true;
      }
   }

   return progress;
}

static bool
radv_lower_primitive_shading_rate(nir_shader *nir)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   bool progress = false;

   nir_builder b;
   nir_builder_init(&b, impl);

   /* Iterate in reverse order since there should be only one deref store to PRIMITIVE_SHADING_RATE
    * after lower_io_to_temporaries for vertex shaders.
    */
   nir_foreach_block_reverse(block, impl) {
      nir_foreach_instr_reverse(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_deref)
            continue;

         nir_variable *var = nir_intrinsic_get_var(intr, 0);
         if (var->data.mode != nir_var_shader_out ||
             var->data.location != VARYING_SLOT_PRIMITIVE_SHADING_RATE)
            continue;

         b.cursor = nir_before_instr(instr);

         nir_ssa_def *val = nir_ssa_for_src(&b, intr->src[1], 1);

         /* x_rate = (shadingRate & (Horizontal2Pixels | Horizontal4Pixels)) ? 0x1 : 0x0; */
         nir_ssa_def *x_rate = nir_iand(&b, val, nir_imm_int(&b, 12));
         x_rate = nir_b2i32(&b, nir_ine(&b, x_rate, nir_imm_int(&b, 0)));

         /* y_rate = (shadingRate & (Vertical2Pixels | Vertical4Pixels)) ? 0x1 : 0x0; */
         nir_ssa_def *y_rate = nir_iand(&b, val, nir_imm_int(&b, 3));
         y_rate = nir_b2i32(&b, nir_ine(&b, y_rate, nir_imm_int(&b, 0)));

         /* Bits [2:3] = VRS rate X
          * Bits [4:5] = VRS rate Y
          * HW shading rate = (xRate << 2) | (yRate << 4)
          */
         nir_ssa_def *out = nir_ior(&b, nir_ishl(&b, x_rate, nir_imm_int(&b, 2)),
                                    nir_ishl(&b, y_rate, nir_imm_int(&b, 4)));

         nir_instr_rewrite_src(&intr->instr, &intr->src[1], nir_src_for_ssa(out));

         progress = true;
         if (nir->info.stage == MESA_SHADER_VERTEX)
            return progress;
      }
   }

   return progress;
}
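
/* Worked example for the packing above: a 2x2 rate arrives from the API as
 * Horizontal2Pixels | Vertical2Pixels = 0x4 | 0x1, so x_rate = y_rate = 1 and
 * the HW value is (1 << 2) | (1 << 4) = 0x14. */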

nir_shader *
radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module *module,
                           const char *entrypoint_name, gl_shader_stage stage,
                           const VkSpecializationInfo *spec_info, const VkPipelineCreateFlags flags,
                           const struct radv_pipeline_layout *layout,
                           const struct radv_pipeline_key *key)
{
   unsigned subgroup_size = 64, ballot_bit_size = 64;
   if (key->compute_subgroup_size) {
      /* Only compute shaders currently support requiring a
       * specific subgroup size.
       */
      assert(stage == MESA_SHADER_COMPUTE);
      subgroup_size = key->compute_subgroup_size;
      ballot_bit_size = key->compute_subgroup_size;
   }

   nir_shader *nir;

   if (module->nir) {
      /* Some things such as our meta clear/blit code will give us a NIR
       * shader directly. In that case, we ignore the SPIR-V entirely
       * and just use the NIR shader. */
      nir = module->nir;
      nir->options = &nir_options;
      nir_validate_shader(nir, "in internal shader");

      assert(exec_list_length(&nir->functions) == 1);
   } else {
      uint32_t *spirv = (uint32_t *)module->data;
      assert(module->size % 4 == 0);

      if (device->instance->debug_flags & RADV_DEBUG_DUMP_SPIRV)
         radv_print_spirv(module->data, module->size, stderr);

      uint32_t num_spec_entries = 0;
      struct nir_spirv_specialization *spec_entries = NULL;
      if (spec_info && spec_info->mapEntryCount > 0) {
         num_spec_entries = spec_info->mapEntryCount;
         spec_entries = calloc(num_spec_entries, sizeof(*spec_entries));
         for (uint32_t i = 0; i < num_spec_entries; i++) {
            VkSpecializationMapEntry entry = spec_info->pMapEntries[i];
            const void *data = (uint8_t *)spec_info->pData + entry.offset;
            assert((uint8_t *)data + entry.size <=
                   (uint8_t *)spec_info->pData + spec_info->dataSize);

            spec_entries[i].id = spec_info->pMapEntries[i].constantID;
            switch (entry.size) {
            case 8:
               memcpy(&spec_entries[i].value.u64, data, sizeof(uint64_t));
               break;
            case 4:
               memcpy(&spec_entries[i].value.u32, data, sizeof(uint32_t));
               break;
            case 2:
               memcpy(&spec_entries[i].value.u16, data, sizeof(uint16_t));
               break;
            case 1:
               memcpy(&spec_entries[i].value.u8, data, sizeof(uint8_t));
               break;
            case 0:
               /* The Vulkan spec says:
                *
                * "For a constantID specialization constant declared in a shader, size must match
                * the byte size of the constantID. If the specialization constant is of type
                * boolean, size must be the byte size of VkBool32."
                *
                * Therefore, since only scalars can be decorated as specialization constants, we can
                * assume that if it doesn't have a size of 1, 2, 4, or 8, any use in a shader would
                * be invalid usage. The spec further says:
                *
                * "If a constantID value is not a specialization constant ID used in the shader,
                * that map entry does not affect the behavior of the pipeline."
                *
                * so we should ignore any invalid specialization constants rather than crash or
                * error out when we see one.
                */
               break;
            default:
               assert(!"Invalid spec constant size");
               break;
            }
         }
      }
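
      /* For reference, a hypothetical API-side specialization that the loop
       * above would parse (names and values are illustrative only):
       *
       *    const uint32_t value = 64;
       *    const VkSpecializationMapEntry entry = {
       *       .constantID = 0, .offset = 0, .size = sizeof(value),
       *    };
       *    const VkSpecializationInfo spec_info = {
       *       .mapEntryCount = 1,
       *       .pMapEntries = &entry,
       *       .dataSize = sizeof(value),
       *       .pData = &value,
       *    };
       *
       * This would land in spec_entries[0].value.u32 via the size == 4 case. */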

      struct radv_shader_debug_data spirv_debug_data = {
         .device = device,
         .module = module,
      };
      const struct spirv_to_nir_options spirv_options = {
         .caps =
            {
               .amd_fragment_mask = true,
               .amd_gcn_shader = true,
               .amd_image_gather_bias_lod = true,
               .amd_image_read_write_lod = true,
               .amd_shader_ballot = true,
               .amd_shader_explicit_vertex_parameter = true,
               .amd_trinary_minmax = true,
               .demote_to_helper_invocation = true,
               .derivative_group = true,
               .descriptor_array_dynamic_indexing = true,
               .descriptor_array_non_uniform_indexing = true,
               .descriptor_indexing = true,
               .device_group = true,
               .draw_parameters = true,
               .float_controls = true,
               .float16 = device->physical_device->rad_info.has_packed_math_16bit,
               .float32_atomic_add = true,
               .float64 = true,
               .geometry_streams = true,
               .image_atomic_int64 = true,
               .image_ms_array = true,
               .image_read_without_format = true,
               .image_write_without_format = true,
               .int8 = true,
               .int16 = true,
               .int64 = true,
               .int64_atomics = true,
               .min_lod = true,
               .multiview = true,
               .physical_storage_buffer_address = true,
               .post_depth_coverage = true,
               .runtime_descriptor_array = true,
               .shader_clock = true,
               .shader_viewport_index_layer = true,
               .sparse_residency = true,
               .stencil_export = true,
               .storage_8bit = true,
               .storage_16bit = true,
               .storage_image_ms = true,
               .subgroup_arithmetic = true,
               .subgroup_ballot = true,
               .subgroup_basic = true,
               .subgroup_quad = true,
               .subgroup_shuffle = true,
               .subgroup_uniform_control_flow = true,
               .subgroup_vote = true,
               .tessellation = true,
               .transform_feedback = true,
               .variable_pointers = true,
               .vk_memory_model = true,
               .vk_memory_model_device_scope = true,
               .fragment_shading_rate = device->physical_device->rad_info.chip_class >= GFX10_3,
               .workgroup_memory_explicit_layout = true,
            },
         .ubo_addr_format = nir_address_format_vec2_index_32bit_offset,
         .ssbo_addr_format = nir_address_format_vec2_index_32bit_offset,
         .phys_ssbo_addr_format = nir_address_format_64bit_global,
         .push_const_addr_format = nir_address_format_logical,
         .shared_addr_format = nir_address_format_32bit_offset,
         .frag_coord_is_sysval = true,
         .use_deref_buffer_array_length = true,
         .debug =
            {
               .func = radv_spirv_nir_debug,
               .private_data = &spirv_debug_data,
            },
      };
      nir = spirv_to_nir(spirv, module->size / 4, spec_entries, num_spec_entries, stage,
                         entrypoint_name, &spirv_options, &nir_options);
      assert(nir->info.stage == stage);
      nir_validate_shader(nir, "after spirv_to_nir");

      free(spec_entries);
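
      /* Note that the vec2_index_32bit_offset formats requested above have to
       * match the nir_lower_explicit_io(..., nir_var_mem_ubo | nir_var_mem_ssbo, ...)
       * call later in this function, which consumes the same representation. */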

      /* We have to lower away local constant initializers right before we
       * inline functions. That way they get properly initialized at the top
       * of the function and not at the top of its caller.
       */
      NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
      NIR_PASS_V(nir, nir_lower_returns);
      NIR_PASS_V(nir, nir_inline_functions);
      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_deref);

      /* Pick off the single entrypoint that we want */
      foreach_list_typed_safe(nir_function, func, node, &nir->functions)
      {
         if (func->is_entrypoint)
            func->name = ralloc_strdup(func, "main");
         else
            exec_node_remove(&func->node);
      }
      assert(exec_list_length(&nir->functions) == 1);

      /* Make sure we lower constant initializers on output variables so that
       * nir_remove_dead_variables below sees the corresponding stores
       */
      NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_shader_out);

      /* Now that we've deleted all but the main function, we can go ahead and
       * lower the rest of the constant initializers.
       */
      NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);

      /* Split member structs. We do this before lower_io_to_temporaries so that
       * it doesn't lower system values to temporaries by accident.
       */
      NIR_PASS_V(nir, nir_split_var_copies);
      NIR_PASS_V(nir, nir_split_per_member_structs);

      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out);
      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_lower_input_attachments,
                    &(nir_input_attachment_options){
                       .use_fragcoord_sysval = true,
                       .use_layer_id_sysval = false,
                    });

      NIR_PASS_V(nir, nir_remove_dead_variables,
                 nir_var_shader_in | nir_var_shader_out | nir_var_system_value | nir_var_mem_shared,
                 NULL);

      /* Variables can make nir_propagate_invariant more conservative
       * than it needs to be.
       */
      NIR_PASS_V(nir, nir_lower_global_vars_to_local);
      NIR_PASS_V(nir, nir_lower_vars_to_ssa);

      NIR_PASS_V(nir, nir_propagate_invariant,
                 device->instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM);

      NIR_PASS_V(nir, nir_lower_system_values);
      NIR_PASS_V(nir, nir_lower_compute_system_values, NULL);

      NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);

      NIR_PASS_V(nir, nir_lower_discard_or_demote,
                 device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE);

      nir_lower_doubles_options lower_doubles = nir->options->lower_doubles_options;

      if (device->physical_device->rad_info.chip_class == GFX6) {
         /* GFX6 doesn't support v_floor_f64 and the precision
          * of v_fract_f64 which is used to implement 64-bit
          * floor is less than what Vulkan requires.
          */
         lower_doubles |= nir_lower_dfloor;
      }

      NIR_PASS_V(nir, nir_lower_doubles, NULL, lower_doubles);
   }

   /* Vulkan uses the separate-shader linking model */
   nir->info.separate_shader = true;
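
   /* From this point on, everything runs for both paths: shaders that came in
    * as SPIR-V above and internal meta shaders that were handed to us as NIR
    * directly. */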

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      unsigned nir_gs_flags = nir_lower_gs_intrinsics_per_stream;

      if (device->physical_device->use_ngg && !radv_use_llvm_for_stage(device, stage)) {
         /* ACO needs NIR to do some of the hard lifting */
         nir_gs_flags |= nir_lower_gs_intrinsics_count_primitives |
                         nir_lower_gs_intrinsics_count_vertices_per_primitive |
                         nir_lower_gs_intrinsics_overwrite_incomplete;
      }

      nir_lower_gs_intrinsics(nir, nir_gs_flags);
   }

   static const nir_lower_tex_options tex_options = {
      .lower_txp = ~0,
      .lower_tg4_offsets = true,
   };

   nir_lower_tex(nir, &tex_options);

   nir_lower_vars_to_ssa(nir);

   if (nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_GEOMETRY ||
       nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), true, true);
   } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), true, false);
   }

   nir_split_var_copies(nir);

   nir_lower_global_vars_to_local(nir);
   nir_remove_dead_variables(nir, nir_var_function_temp, NULL);
   bool gfx7minus = device->physical_device->rad_info.chip_class <= GFX7;
   nir_lower_subgroups(nir, &(struct nir_lower_subgroups_options){
                               .subgroup_size = subgroup_size,
                               .ballot_bit_size = ballot_bit_size,
                               .ballot_components = 1,
                               .lower_to_scalar = 1,
                               .lower_subgroup_masks = 1,
                               .lower_shuffle = 1,
                               .lower_shuffle_to_32bit = 1,
                               .lower_vote_eq = 1,
                               .lower_quad_broadcast_dynamic = 1,
                               .lower_quad_broadcast_dynamic_to_const = gfx7minus,
                               .lower_shuffle_to_swizzle_amd = 1,
                               .lower_elect = radv_use_llvm_for_stage(device, stage),
                            });

   nir_lower_load_const_to_scalar(nir);

   if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT))
      radv_optimize_nir(device, nir, false, true);

   /* call radv_nir_lower_ycbcr_textures() late as there might still be
    * tex with undef texture/sampler before first optimization */
   NIR_PASS_V(nir, radv_nir_lower_ycbcr_textures, layout);

   /* We call nir_lower_var_copies() after the first radv_optimize_nir()
    * to remove any copies introduced by nir_opt_find_array_copies().
    */
   nir_lower_var_copies(nir);

   const nir_opt_access_options opt_access_options = {
      .is_vulkan = true,
      .infer_non_readable = true,
   };
   NIR_PASS_V(nir, nir_opt_access, &opt_access_options);

   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const, nir_address_format_32bit_offset);

   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo | nir_var_mem_ssbo,
              nir_address_format_vec2_index_32bit_offset);

   NIR_PASS_V(nir, lower_intrinsics, key, layout, device->physical_device);

   /* Lower deref operations for compute shared memory. */
   if (nir->info.stage == MESA_SHADER_COMPUTE) {
      if (!nir->info.shared_memory_explicit_layout) {
         NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, shared_var_info);
      }
      NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset);

      if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
         const unsigned chunk_size = 16; /* max single store size */
         const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
         NIR_PASS_V(nir, nir_zero_initialize_shared_memory, shared_size, chunk_size);
      }
   }
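
   /* Example of the zero-init sizing above: shared_size = 36 rounds up to 48,
    * which the pass can then clear with three 16-byte stores. */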

   nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global);

   /* Lower large variables that are always constant with load_constant
    * intrinsics, which get turned into PC-relative loads from a data
    * section next to the shader.
    */
   NIR_PASS_V(nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);

   /* Lower primitive shading rate to match HW requirements. */
   if ((nir->info.stage == MESA_SHADER_VERTEX ||
        nir->info.stage == MESA_SHADER_GEOMETRY) &&
       nir->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
      NIR_PASS_V(nir, radv_lower_primitive_shading_rate);
   }

   /* Indirect lowering must be called after the radv_optimize_nir() loop
    * has been called at least once. Otherwise indirect lowering can
    * bloat the instruction count of the loop and cause it to be
    * considered too large for unrolling.
    */
   if (ac_nir_lower_indirect_derefs(nir, device->physical_device->rad_info.chip_class) &&
       !(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT) &&
       nir->info.stage != MESA_SHADER_COMPUTE) {
      /* Optimize the lowered code before the linking optimizations. */
      radv_optimize_nir(device, nir, false, false);
   }

   return nir;
}

static int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}
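
/* glsl_count_attribute_slots() counts in vec4-sized slots: a float or vec4
 * takes 1 slot, a mat4 takes 4, and so on. */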

static nir_variable *
find_layer_in_var(nir_shader *nir)
{
   nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_in, VARYING_SLOT_LAYER);
   if (var != NULL)
      return var;

   var = nir_variable_create(nir, nir_var_shader_in, glsl_int_type(), "layer id");
   var->data.location = VARYING_SLOT_LAYER;
   var->data.interpolation = INTERP_MODE_FLAT;
   return var;
}

/* We use layered rendering to implement multiview, which means we need to map
 * view_index to gl_Layer. The code generates a load from the layer_id sysval,
 * but since we don't have a way to get at this information from the fragment
 * shader, we also need to lower this to the gl_Layer varying. This pass
 * lowers both to a varying load from the LAYER slot, before lowering io, so
 * that nir_assign_var_locations() will give the LAYER varying the correct
 * driver_location.
 */

static bool
lower_view_index(nir_shader *nir)
{
   bool progress = false;
   nir_function_impl *entry = nir_shader_get_entrypoint(nir);
   nir_builder b;
   nir_builder_init(&b, entry);

   nir_variable *layer = NULL;
   nir_foreach_block (block, entry) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
         if (load->intrinsic != nir_intrinsic_load_view_index)
            continue;

         if (!layer)
            layer = find_layer_in_var(nir);

         b.cursor = nir_before_instr(instr);
         nir_ssa_def *def = nir_load_var(&b, layer);
         nir_ssa_def_rewrite_uses(&load->dest.ssa, def);

         nir_instr_remove(instr);
         progress = true;
      }
   }

   return progress;
}

void
radv_lower_io(struct radv_device *device, nir_shader *nir)
{
   if (nir->info.stage == MESA_SHADER_COMPUTE)
      return;

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(nir, lower_view_index);
      nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, MESA_SHADER_FRAGMENT);
   }

   /* The RADV/LLVM backend expects 64-bit IO to be lowered. */
   nir_lower_io_options options =
      radv_use_llvm_for_stage(device, nir->info.stage) ? nir_lower_io_lower_64bit_to_32 : 0;

   NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size_vec4, options);

   /* This pass needs actual constants */
   nir_opt_constant_folding(nir);

   NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);
}

bool
radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
                     struct radv_shader_info *info, const struct radv_pipeline_key *pl_key)
{
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      if (info->vs.as_ls) {
         ac_nir_lower_ls_outputs_to_mem(nir, info->vs.tcs_in_out_eq,
                                        info->vs.tcs_temp_only_input_mask,
                                        info->vs.num_linked_outputs);
         return true;
      } else if (info->vs.as_es) {
         ac_nir_lower_es_outputs_to_mem(nir, device->physical_device->rad_info.chip_class,
                                        info->vs.num_linked_outputs);
         return true;
      }
   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
      ac_nir_lower_hs_inputs_to_mem(nir, info->vs.tcs_in_out_eq, info->tcs.num_linked_inputs);
      ac_nir_lower_hs_outputs_to_mem(
         nir, device->physical_device->rad_info.chip_class, info->tcs.tes_reads_tess_factors,
         info->tcs.tes_inputs_read, info->tcs.tes_patch_inputs_read, info->tcs.num_linked_inputs,
         info->tcs.num_linked_outputs, info->tcs.num_linked_patch_outputs, true);
      ac_nir_lower_tess_to_const(nir, pl_key->tess_input_vertices, info->num_tess_patches,
                                 ac_nir_lower_patch_vtx_in | ac_nir_lower_num_patches);

      return true;
   } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      ac_nir_lower_tes_inputs_to_mem(nir, info->tes.num_linked_inputs,
                                     info->tes.num_linked_patch_inputs);
      ac_nir_lower_tess_to_const(nir, nir->info.tess.tcs_vertices_out, info->num_tess_patches,
                                 ac_nir_lower_patch_vtx_in | ac_nir_lower_num_patches);

      if (info->tes.as_es) {
         ac_nir_lower_es_outputs_to_mem(nir, device->physical_device->rad_info.chip_class,
                                        info->tes.num_linked_outputs);
      }

      return true;
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      ac_nir_lower_gs_inputs_to_mem(nir, device->physical_device->rad_info.chip_class,
                                    info->gs.num_linked_inputs);
      return true;
   }

   return false;
}
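
/* Roughly, the lowering above maps: VS outputs to LDS (as LS) or the ESGS
 * ring (as ES), TCS inputs/outputs to LDS plus the tess factor and offchip
 * buffers, TES inputs to the offchip buffer (plus ESGS ring outputs when it
 * acts as ES), and GS inputs to the ESGS ring. */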

bool
radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
                      uint64_t ps_inputs_read)
{
   /* Culling doesn't make sense for meta shaders. */
   if (!!nir->info.name)
      return false;

   /* We don't support culling with multiple viewports yet. */
   if (nir->info.outputs_written & (VARYING_BIT_VIEWPORT | VARYING_BIT_VIEWPORT_MASK))
      return false;

   /* TODO: enable by default on GFX10.3 when we're confident about performance. */
   bool culling_enabled = device->instance->perftest_flags & RADV_PERFTEST_NGGC;

   if (!culling_enabled)
      return false;

   /* Shader based culling efficiency can depend on PS throughput.
    * Estimate an upper limit for PS input param count based on GPU info.
    */
   unsigned max_ps_params;
   unsigned max_render_backends = device->physical_device->rad_info.max_render_backends;
   unsigned max_se = device->physical_device->rad_info.max_se;

   if (max_render_backends < 2)
      return false; /* Don't use NGG culling on 1 RB chips. */
   else if (max_render_backends / max_se == 4)
      max_ps_params = 6; /* Sienna Cichlid and other GFX10.3 dGPUs. */
   else
      max_ps_params = 4; /* Navi 1x. */

   /* TODO: consider other heuristics here, such as PS execution time */

   return util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) <= max_ps_params;
}
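
/* Example of the heuristic above: a GFX10.3 dGPU with max_render_backends /
 * max_se == 4 allows NGG culling for shaders whose PS reads at most 6 input
 * params (POS excluded); Navi 1x parts allow at most 4. */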

void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                    struct radv_shader_info *info,
                    const struct radv_pipeline_key *pl_key,
                    struct radv_shader_variant_key *key,
                    bool consider_culling)
{
   /* TODO: support the LLVM backend with the NIR lowering */
   assert(!radv_use_llvm_for_stage(device, nir->info.stage));

   assert(nir->info.stage == MESA_SHADER_VERTEX ||
          nir->info.stage == MESA_SHADER_TESS_EVAL ||
          nir->info.stage == MESA_SHADER_GEOMETRY);

   ac_nir_ngg_config out_conf = {0};
   const struct gfx10_ngg_info *ngg_info = &info->ngg_info;
   unsigned num_gs_invocations =
      (nir->info.stage != MESA_SHADER_GEOMETRY || ngg_info->max_vert_out_per_gs_instance)
         ? 1
         : info->gs.invocations;
   unsigned num_vertices_per_prim = 3;

   /* Get the number of vertices per input primitive */
   if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      if (nir->info.tess.point_mode)
         num_vertices_per_prim = 1;
      else if (nir->info.tess.primitive_mode == GL_ISOLINES)
         num_vertices_per_prim = 2;

      /* Manually mark the primitive ID used, so the shader can repack it. */
      if (key->vs_common_out.export_prim_id)
         BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);

   } else if (nir->info.stage == MESA_SHADER_VERTEX) {
      /* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */
      num_vertices_per_prim = key->vs.outprim + 1;

      /* Manually mark the instance ID used, so the shader can repack it. */
      if (key->vs.instance_rate_inputs)
         BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);

   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      num_vertices_per_prim = nir->info.gs.vertices_in;
   } else {
      unreachable("NGG needs to be VS, TES or GS.");
   }

   /* Invocations that process an input vertex */
   unsigned max_vtx_in = MIN2(256, ngg_info->enable_vertex_grouping
                                      ? ngg_info->hw_max_esverts
                                      : num_vertices_per_prim * ngg_info->max_gsprims);
   /* Invocations that export an output vertex */
   unsigned max_vtx_out = ngg_info->max_out_verts;
   /* Invocations that process an input primitive */
   unsigned max_prm_in = ngg_info->max_gsprims * num_gs_invocations;
   /* Invocations that produce an output primitive */
   unsigned max_prm_out = ngg_info->max_gsprims * num_gs_invocations * ngg_info->prim_amp_factor;

   unsigned max_workgroup_size = MAX4(max_vtx_in, max_vtx_out, max_prm_in, max_prm_out);

   /* Maximum HW limit for NGG workgroups */
   max_workgroup_size = MIN2(256, max_workgroup_size);

   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL) {
      assert(key->vs_common_out.as_ngg);

      if (consider_culling)
         radv_optimize_nir_algebraic(nir, false);

      out_conf =
         ac_nir_lower_ngg_nogs(
            nir,
            max_vtx_in,
            num_vertices_per_prim,
            max_workgroup_size,
            info->wave_size,
            consider_culling,
            key->vs_common_out.as_ngg_passthrough,
            key->vs_common_out.export_prim_id,
            key->vs.provoking_vtx_last);

      info->has_ngg_culling = out_conf.can_cull;
      info->has_ngg_early_prim_export = out_conf.early_prim_export;
      info->num_lds_blocks_when_not_culling =
         DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off,
                      device->physical_device->rad_info.lds_encode_granularity);
      info->is_ngg_passthrough = out_conf.passthrough;
      key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      assert(info->is_ngg);
      ac_nir_lower_ngg_gs(
         nir, info->wave_size, max_workgroup_size,
         info->ngg_info.esgs_ring_size,
         info->gs.gsvs_vertex_size,
         info->ngg_info.ngg_emit_size * 4u,
         key->vs.provoking_vtx_last);
   } else {
      unreachable("invalid SW stage passed to radv_lower_ngg");
   }
}
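
/* Shader code is sub-allocated from slabs of at least 256 KiB: the loop below
 * first-fit scans each slab's shader list (kept sorted by offset, with gaps
 * aligned to 256 bytes) and only allocates a fresh slab when no gap is large
 * enough, all under shader_slab_mutex. */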

static void *
radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader)
{
   mtx_lock(&device->shader_slab_mutex);
   list_for_each_entry(struct radv_shader_slab, slab, &device->shader_slabs, slabs)
   {
      uint64_t offset = 0;

#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"
#endif
      list_for_each_entry(struct radv_shader_variant, s, &slab->shaders, slab_list)
      {
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
         if (s->bo_offset - offset >= shader->code_size) {
            shader->bo = slab->bo;
            shader->bo_offset = offset;
            list_addtail(&shader->slab_list, &s->slab_list);
            mtx_unlock(&device->shader_slab_mutex);
            return slab->ptr + offset;
         }
         offset = align_u64(s->bo_offset + s->code_size, 256);
      }
      if (offset <= slab->size && slab->size - offset >= shader->code_size) {
         shader->bo = slab->bo;
         shader->bo_offset = offset;
         list_addtail(&shader->slab_list, &slab->shaders);
         mtx_unlock(&device->shader_slab_mutex);
         return slab->ptr + offset;
      }
   }

   mtx_unlock(&device->shader_slab_mutex);
   struct radv_shader_slab *slab = calloc(1, sizeof(struct radv_shader_slab));

   slab->size = MAX2(256 * 1024, shader->code_size);
   VkResult result = device->ws->buffer_create(
      device->ws, slab->size, 256, RADEON_DOMAIN_VRAM,
      RADEON_FLAG_NO_INTERPROCESS_SHARING |
         (device->physical_device->rad_info.cpdma_prefetch_writes_memory
             ? 0
             : RADEON_FLAG_READ_ONLY),
      RADV_BO_PRIORITY_SHADER, 0, &slab->bo);
   if (result != VK_SUCCESS) {
      free(slab);
      return NULL;
   }

   slab->ptr = (char *)device->ws->buffer_map(slab->bo);
   if (!slab->ptr) {
      device->ws->buffer_destroy(device->ws, slab->bo);
      free(slab);
      return NULL;
   }

   list_inithead(&slab->shaders);

   mtx_lock(&device->shader_slab_mutex);
   list_add(&slab->slabs, &device->shader_slabs);

   shader->bo = slab->bo;
   shader->bo_offset = 0;
   list_add(&shader->slab_list, &slab->shaders);
   mtx_unlock(&device->shader_slab_mutex);
   return slab->ptr;
}

void
radv_destroy_shader_slabs(struct radv_device *device)
{
   list_for_each_entry_safe(struct radv_shader_slab, slab, &device->shader_slabs, slabs)
   {
      device->ws->buffer_destroy(device->ws, slab->bo);
      free(slab);
   }
   mtx_destroy(&device->shader_slab_mutex);
}

/* For the UMR disassembler. */
#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
#define DEBUGGER_NUM_MARKERS        5

static unsigned
radv_get_shader_binary_size(size_t code_size)
{
   return code_size + DEBUGGER_NUM_MARKERS * 4;
}
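
/* E.g. a 1024-byte shader is sized as 1024 + 5 * 4 = 1044 bytes so that the
 * five end-of-code markers written at upload time always fit. */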

static bool
radv_should_use_wgp_mode(const struct radv_device *device, gl_shader_stage stage,
                         const struct radv_shader_info *info)
{
   enum chip_class chip = device->physical_device->rad_info.chip_class;
   switch (stage) {
   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_TESS_CTRL:
      return chip >= GFX10;
   case MESA_SHADER_GEOMETRY:
      return chip == GFX10 || (chip >= GFX10_3 && !info->is_ngg);
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      return chip == GFX10 && info->is_ngg;
   default:
      return false;
   }
}

static void
radv_postprocess_config(const struct radv_device *device, const struct ac_shader_config *config_in,
                        const struct radv_shader_info *info, gl_shader_stage stage,
                        struct ac_shader_config *config_out)
{
   const struct radv_physical_device *pdevice = device->physical_device;
   bool scratch_enabled = config_in->scratch_bytes_per_wave > 0;
   bool trap_enabled = !!device->trap_handler_shader;
   unsigned vgpr_comp_cnt = 0;
   unsigned num_input_vgprs = info->num_input_vgprs;

   if (stage == MESA_SHADER_FRAGMENT) {
      num_input_vgprs = ac_get_fs_input_vgpr_cnt(config_in, NULL, NULL);
   }

   unsigned num_vgprs = MAX2(config_in->num_vgprs, num_input_vgprs);
   /* +3 for scratch wave offset and VCC */
   unsigned num_sgprs = MAX2(config_in->num_sgprs, info->num_input_sgprs + 3);
   unsigned num_shared_vgprs = config_in->num_shared_vgprs;
   /* shared VGPRs are introduced in Navi and are allocated in blocks of 8 (RDNA ref 3.6.5) */
   assert((pdevice->rad_info.chip_class >= GFX10 && num_shared_vgprs % 8 == 0) ||
          (pdevice->rad_info.chip_class < GFX10 && num_shared_vgprs == 0));
   unsigned num_shared_vgpr_blocks = num_shared_vgprs / 8;
   unsigned excp_en = 0;

   *config_out = *config_in;
   config_out->num_vgprs = num_vgprs;
   config_out->num_sgprs = num_sgprs;
   config_out->num_shared_vgprs = num_shared_vgprs;

   config_out->rsrc2 = S_00B12C_USER_SGPR(info->num_user_sgprs) |
                       S_00B12C_SCRATCH_EN(scratch_enabled) | S_00B12C_TRAP_PRESENT(trap_enabled);

   if (trap_enabled) {
      /* Configure the shader exceptions like memory violation, etc.
       * TODO: Enable (and validate) more exceptions.
       */
      excp_en = 1 << 8; /* mem_viol */
   }

   if (!pdevice->use_ngg_streamout) {
      config_out->rsrc2 |=
         S_00B12C_SO_BASE0_EN(!!info->so.strides[0]) | S_00B12C_SO_BASE1_EN(!!info->so.strides[1]) |
         S_00B12C_SO_BASE2_EN(!!info->so.strides[2]) | S_00B12C_SO_BASE3_EN(!!info->so.strides[3]) |
         S_00B12C_SO_EN(!!info->so.num_outputs);
   }

   config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / (info->wave_size == 32 ? 8 : 4)) |
                       S_00B848_DX10_CLAMP(1) | S_00B848_FLOAT_MODE(config_out->float_mode);

   if (pdevice->rad_info.chip_class >= GFX10) {
      config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(info->num_user_sgprs >> 5);
   } else {
      config_out->rsrc1 |= S_00B228_SGPRS((num_sgprs - 1) / 8);
      config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(info->num_user_sgprs >> 5);
   }
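
   /* VGPRs are encoded in allocation granules: wave64 allocates in units of 4
    * and wave32 in units of 8, so e.g. 41 VGPRs in wave64 encode as
    * (41 - 1) / 4 = 10, i.e. 44 VGPRs get allocated. */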

   bool wgp_mode = radv_should_use_wgp_mode(device, stage, info);

   switch (stage) {
   case MESA_SHADER_TESS_EVAL:
      if (info->is_ngg) {
         config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
         config_out->rsrc2 |= S_00B22C_OC_LDS_EN(1) | S_00B22C_EXCP_EN(excp_en);
      } else if (info->tes.as_es) {
         assert(pdevice->rad_info.chip_class <= GFX8);
         vgpr_comp_cnt = info->uses_prim_id ? 3 : 2;

         config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1) | S_00B12C_EXCP_EN(excp_en);
      } else {
         bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id;
         vgpr_comp_cnt = enable_prim_id ? 3 : 2;

         config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
         config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1) | S_00B12C_EXCP_EN(excp_en);
      }
      config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
      break;
   case MESA_SHADER_TESS_CTRL:
      if (pdevice->rad_info.chip_class >= GFX9) {
         /* We need at least 2 components for LS.
          * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
          * StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
          */
         if (pdevice->rad_info.chip_class >= GFX10) {
            vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 1;
            config_out->rsrc2 |=
               S_00B42C_LDS_SIZE_GFX10(info->tcs.num_lds_blocks) | S_00B42C_EXCP_EN_GFX6(excp_en);
         } else {
            vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1;
            config_out->rsrc2 |=
               S_00B42C_LDS_SIZE_GFX9(info->tcs.num_lds_blocks) | S_00B42C_EXCP_EN_GFX9(excp_en);
         }
      } else {
         config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1) | S_00B12C_EXCP_EN(excp_en);
      }
      config_out->rsrc1 |=
         S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | S_00B428_WGP_MODE(wgp_mode);
      config_out->rsrc2 |= S_00B42C_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
      break;
   case MESA_SHADER_VERTEX:
      if (info->is_ngg) {
         config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
      } else if (info->vs.as_ls) {
         assert(pdevice->rad_info.chip_class <= GFX8);
         /* We need at least 2 components for LS.
          * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
          * StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
          */
         vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1;
      } else if (info->vs.as_es) {
         assert(pdevice->rad_info.chip_class <= GFX8);
         /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
         vgpr_comp_cnt = info->vs.needs_instance_id ? 1 : 0;
      } else {
         /* VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID, InstanceID)
          * If PrimID is disabled, InstanceID / StepRate1 is loaded instead.
          * StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
          */
         if (info->vs.needs_instance_id && pdevice->rad_info.chip_class >= GFX10) {
            vgpr_comp_cnt = 3;
         } else if (info->vs.export_prim_id) {
            vgpr_comp_cnt = 2;
         } else if (info->vs.needs_instance_id) {
            vgpr_comp_cnt = 1;
         } else {
            vgpr_comp_cnt = 0;
         }

         config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
      }
      config_out->rsrc2 |=
         S_00B12C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) | S_00B12C_EXCP_EN(excp_en);
      break;
   case MESA_SHADER_FRAGMENT:
      config_out->rsrc1 |= S_00B028_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
      config_out->rsrc2 |= S_00B02C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) |
                           S_00B02C_TRAP_PRESENT(1) | S_00B02C_EXCP_EN(excp_en);
      break;
   case MESA_SHADER_GEOMETRY:
      config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
      config_out->rsrc2 |=
         S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) | S_00B22C_EXCP_EN(excp_en);
      break;
   case MESA_SHADER_COMPUTE:
      config_out->rsrc1 |=
         S_00B848_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | S_00B848_WGP_MODE(wgp_mode);
      config_out->rsrc2 |= S_00B84C_TGID_X_EN(info->cs.uses_block_id[0]) |
                           S_00B84C_TGID_Y_EN(info->cs.uses_block_id[1]) |
                           S_00B84C_TGID_Z_EN(info->cs.uses_block_id[2]) |
                           S_00B84C_TIDIG_COMP_CNT(info->cs.uses_thread_id[2]   ? 2
                                                   : info->cs.uses_thread_id[1] ? 1
                                                                                : 0) |
                           S_00B84C_TG_SIZE_EN(info->cs.uses_local_invocation_idx) |
                           S_00B84C_LDS_SIZE(config_in->lds_size) | S_00B84C_EXCP_EN(excp_en);
      config_out->rsrc3 |= S_00B8A0_SHARED_VGPR_CNT(num_shared_vgpr_blocks);

      break;
   default:
      unreachable("unsupported shader type");
      break;
   }

   if (pdevice->rad_info.chip_class >= GFX10 && info->is_ngg &&
       (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL ||
        stage == MESA_SHADER_GEOMETRY)) {
      unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
      gl_shader_stage es_stage = stage;
      if (stage == MESA_SHADER_GEOMETRY)
         es_stage = info->gs.es_type;

      /* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */
      if (es_stage == MESA_SHADER_VERTEX) {
         es_vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 0;
      } else if (es_stage == MESA_SHADER_TESS_EVAL) {
         bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id;
         es_vgpr_comp_cnt = enable_prim_id ? 3 : 2;
      } else
         unreachable("Unexpected ES shader stage");

      bool tes_triangles =
         stage == MESA_SHADER_TESS_EVAL && info->tes.primitive_mode >= 4; /* GL_TRIANGLES */
      if (info->uses_invocation_id || stage == MESA_SHADER_VERTEX) {
         gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
      } else if (info->uses_prim_id) {
         gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
      } else if (info->gs.vertices_in >= 3 || tes_triangles) {
         gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
      } else {
         gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
      }

      /* Disable the WGP mode on gfx10.3 because it can hang (this happened
       * on VanGogh). Let's disable it on all chips that disable exactly
       * 1 CU per SA for GS.
       */
      config_out->rsrc1 |=
         S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt) | S_00B228_WGP_MODE(wgp_mode);
      config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
                           S_00B22C_LDS_SIZE(config_in->lds_size) |
                           S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL);
   } else if (pdevice->rad_info.chip_class >= GFX9 && stage == MESA_SHADER_GEOMETRY) {
      unsigned es_type = info->gs.es_type;
      unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;

      if (es_type == MESA_SHADER_VERTEX) {
         /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
         if (info->vs.needs_instance_id) {
            es_vgpr_comp_cnt = pdevice->rad_info.chip_class >= GFX10 ? 3 : 1;
         } else {
            es_vgpr_comp_cnt = 0;
         }
      } else if (es_type == MESA_SHADER_TESS_EVAL) {
         es_vgpr_comp_cnt = info->uses_prim_id ? 3 : 2;
      } else {
         unreachable("invalid shader ES type");
      }

      /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
       * VGPR[0:4] are always loaded.
       */
      if (info->uses_invocation_id) {
         gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
      } else if (info->uses_prim_id) {
         gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
      } else if (info->gs.vertices_in >= 3) {
         gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
      } else {
         gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
      }

      config_out->rsrc1 |=
         S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt) | S_00B228_WGP_MODE(wgp_mode);
      config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
                           S_00B22C_OC_LDS_EN(es_type == MESA_SHADER_TESS_EVAL);
   } else if (pdevice->rad_info.chip_class >= GFX9 && stage == MESA_SHADER_TESS_CTRL) {
      config_out->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt);
   } else {
      config_out->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt);
   }
}
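
/* Binaries arrive here in one of two forms: RADV_BINARY_TYPE_RTLD, an ELF
 * from the LLVM backend that still needs ac_rtld linking/relocation, or
 * RADV_BINARY_TYPE_LEGACY, raw machine code with optional stats/IR/disasm
 * blobs as produced by ACO. */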

struct radv_shader_variant *
radv_shader_variant_create(struct radv_device *device, const struct radv_shader_binary *binary,
                           bool keep_shader_info)
{
   struct ac_shader_config config = {0};
   struct ac_rtld_binary rtld_binary = {0};
   struct radv_shader_variant *variant = calloc(1, sizeof(struct radv_shader_variant));
   if (!variant)
      return NULL;

   variant->ref_count = 1;

   if (binary->type == RADV_BINARY_TYPE_RTLD) {
      struct ac_rtld_symbol lds_symbols[2];
      unsigned num_lds_symbols = 0;
      const char *elf_data = (const char *)((struct radv_shader_binary_rtld *)binary)->data;
      size_t elf_size = ((struct radv_shader_binary_rtld *)binary)->elf_size;

      if (device->physical_device->rad_info.chip_class >= GFX9 &&
          (binary->stage == MESA_SHADER_GEOMETRY || binary->info.is_ngg) &&
          !binary->is_gs_copy_shader) {
         struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
         sym->name = "esgs_ring";
         sym->size = binary->info.ngg_info.esgs_ring_size;
         sym->align = 64 * 1024;
      }

      if (binary->info.is_ngg && binary->stage == MESA_SHADER_GEOMETRY) {
         struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
         sym->name = "ngg_emit";
         sym->size = binary->info.ngg_info.ngg_emit_size * 4;
         sym->align = 4;
      }
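
      /* The LDS symbols set up above are resolved by ac_rtld at link time;
       * their sizes come from the NGG/GS info computed during compilation. */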

      struct ac_rtld_open_info open_info = {
         .info = &device->physical_device->rad_info,
         .shader_type = binary->stage,
         .wave_size = binary->info.wave_size,
         .num_parts = 1,
         .elf_ptrs = &elf_data,
         .elf_sizes = &elf_size,
         .num_shared_lds_symbols = num_lds_symbols,
         .shared_lds_symbols = lds_symbols,
      };

      if (!ac_rtld_open(&rtld_binary, open_info)) {
         free(variant);
         return NULL;
      }

      if (!ac_rtld_read_config(&device->physical_device->rad_info, &rtld_binary, &config)) {
         ac_rtld_close(&rtld_binary);
         free(variant);
         return NULL;
      }

      if (rtld_binary.lds_size > 0) {
         unsigned encode_granularity = device->physical_device->rad_info.lds_encode_granularity;
         config.lds_size = align(rtld_binary.lds_size, encode_granularity) / encode_granularity;
      }
      if (!config.lds_size && binary->stage == MESA_SHADER_TESS_CTRL) {
         /* This is used for reporting LDS statistics */
         config.lds_size = binary->info.tcs.num_lds_blocks;
      }

      variant->code_size = rtld_binary.rx_size;
      variant->exec_size = rtld_binary.exec_size;
   } else {
      assert(binary->type == RADV_BINARY_TYPE_LEGACY);
      config = ((struct radv_shader_binary_legacy *)binary)->config;
      variant->code_size =
         radv_get_shader_binary_size(((struct radv_shader_binary_legacy *)binary)->code_size);
      variant->exec_size = ((struct radv_shader_binary_legacy *)binary)->exec_size;
   }

   variant->info = binary->info;
   radv_postprocess_config(device, &config, &binary->info, binary->stage, &variant->config);

   void *dest_ptr = radv_alloc_shader_memory(device, variant);
   if (!dest_ptr) {
      if (binary->type == RADV_BINARY_TYPE_RTLD)
         ac_rtld_close(&rtld_binary);
      free(variant);
      return NULL;
   }

   if (binary->type == RADV_BINARY_TYPE_RTLD) {
      struct radv_shader_binary_rtld *bin = (struct radv_shader_binary_rtld *)binary;
      struct ac_rtld_upload_info info = {
         .binary = &rtld_binary,
         .rx_va = radv_buffer_get_va(variant->bo) + variant->bo_offset,
         .rx_ptr = dest_ptr,
      };

      if (!ac_rtld_upload(&info)) {
         radv_shader_variant_destroy(device, variant);
         ac_rtld_close(&rtld_binary);
         return NULL;
      }

      if (keep_shader_info || (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS)) {
         const char *disasm_data;
         size_t disasm_size;
         if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm_data,
                                          &disasm_size)) {
            radv_shader_variant_destroy(device, variant);
            ac_rtld_close(&rtld_binary);
            return NULL;
         }

         variant->ir_string =
            bin->llvm_ir_size ? strdup((const char *)(bin->data + bin->elf_size)) : NULL;
         variant->disasm_string = malloc(disasm_size + 1);
         memcpy(variant->disasm_string, disasm_data, disasm_size);
         variant->disasm_string[disasm_size] = 0;
      }

      variant->code_ptr = dest_ptr;
      ac_rtld_close(&rtld_binary);
   } else {
      struct radv_shader_binary_legacy *bin = (struct radv_shader_binary_legacy *)binary;
      memcpy(dest_ptr, bin->data + bin->stats_size, bin->code_size);

      /* Add end-of-code markers for the UMR disassembler. */
      uint32_t *ptr32 = (uint32_t *)dest_ptr + bin->code_size / 4;
      for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++)
         ptr32[i] = DEBUGGER_END_OF_CODE_MARKER;

      variant->code_ptr = dest_ptr;
      variant->ir_string =
         bin->ir_size ? strdup((const char *)(bin->data + bin->stats_size + bin->code_size)) : NULL;
      variant->disasm_string =
         bin->disasm_size
            ? strdup((const char *)(bin->data + bin->stats_size + bin->code_size + bin->ir_size))
            : NULL;

      if (bin->stats_size) {
         variant->statistics = calloc(bin->stats_size, 1);
         memcpy(variant->statistics, bin->data, bin->stats_size);
      }
   }
   return variant;
}

static char *
radv_dump_nir_shaders(struct nir_shader *const *shaders, int shader_count)
{
   char *data = NULL;
   char *ret = NULL;
   size_t size = 0;
   struct u_memstream mem;
   if (u_memstream_open(&mem, &data, &size)) {
      FILE *const memf = u_memstream_get(&mem);
      for (int i = 0; i < shader_count; ++i)
         nir_print_shader(shaders[i], memf);
      u_memstream_close(&mem);
   }

   ret = malloc(size + 1);
   if (ret) {
      memcpy(ret, data, size);
      ret[size] = 0;
   }
   free(data);
   return ret;
}

static struct radv_shader_variant *
shader_variant_compile(struct radv_device *device, struct vk_shader_module *module,
                       struct nir_shader *const *shaders, int shader_count, gl_shader_stage stage,
                       struct radv_shader_info *info, struct radv_nir_compiler_options *options,
                       bool gs_copy_shader, bool trap_handler_shader, bool keep_shader_info,
                       bool keep_statistic_info, struct radv_shader_binary **binary_out)
{
   enum radeon_family chip_family = device->physical_device->rad_info.family;
   struct radv_shader_binary *binary = NULL;

   struct radv_shader_debug_data debug_data = {
      .device = device,
      .module = module,
   };

   options->family = chip_family;
   options->chip_class = device->physical_device->rad_info.chip_class;
   options->info = &device->physical_device->rad_info;
   options->dump_shader =
      radv_can_dump_shader(device, module, gs_copy_shader || trap_handler_shader);
   options->dump_preoptir =
      options->dump_shader && device->instance->debug_flags & RADV_DEBUG_PREOPTIR;
   options->record_ir = keep_shader_info;
   options->record_stats = keep_statistic_info;
   options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR;
   options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size;
   options->address32_hi = device->physical_device->rad_info.address32_hi;
   options->has_ls_vgpr_init_bug = device->physical_device->rad_info.has_ls_vgpr_init_bug;
   options->use_ngg_streamout = device->physical_device->use_ngg_streamout;
   options->enable_mrt_output_nan_fixup =
      module && !module->nir && device->instance->enable_mrt_output_nan_fixup;
   options->adjust_frag_coord_z = device->adjust_frag_coord_z;
   options->has_image_load_dcc_bug = device->physical_device->rad_info.has_image_load_dcc_bug;
   options->debug.func = radv_compiler_debug;
   options->debug.private_data = &debug_data;

   switch (device->force_vrs) {
   case RADV_FORCE_VRS_2x2:
      options->force_vrs_rates = (1u << 2) | (1u << 4);
      break;
   case RADV_FORCE_VRS_2x1:
      options->force_vrs_rates = (1u << 2) | (0u << 4);
      break;
   case RADV_FORCE_VRS_1x2:
      options->force_vrs_rates = (0u << 2) | (1u << 4);
      break;
   default:
      break;
   }
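
   /* force_vrs_rates uses the same HW encoding that
    * radv_lower_primitive_shading_rate() emits: rate X in bits [2:3] and
    * rate Y in bits [4:5], so e.g. 2x2 is (1u << 2) | (1u << 4). */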
   radv_declare_shader_args(
      &args, gs_copy_shader ? MESA_SHADER_VERTEX : shaders[shader_count - 1]->info.stage,
      shader_count >= 2,
      shader_count >= 2 ? shaders[shader_count - 2]->info.stage : MESA_SHADER_VERTEX);

   if (radv_use_llvm_for_stage(device, stage) || options->dump_shader || options->record_ir)
      ac_init_llvm_once();

   if (radv_use_llvm_for_stage(device, stage)) {
      llvm_compile_shader(device, shader_count, shaders, &binary, &args);
   } else {
      aco_compile_shader(shader_count, shaders, &binary, &args);
   }

   binary->info = *info;

   struct radv_shader_variant *variant =
      radv_shader_variant_create(device, binary, keep_shader_info);
   if (!variant) {
      free(binary);
      return NULL;
   }

   if (options->dump_shader) {
      fprintf(stderr, "%s", radv_get_shader_name(info, shaders[0]->info.stage));
      for (int i = 1; i < shader_count; ++i)
         fprintf(stderr, " + %s", radv_get_shader_name(info, shaders[i]->info.stage));

      fprintf(stderr, "\ndisasm:\n%s\n", variant->disasm_string);
   }

   if (keep_shader_info) {
      variant->nir_string = radv_dump_nir_shaders(shaders, shader_count);
      if (!gs_copy_shader && !trap_handler_shader && !module->nir) {
         variant->spirv = malloc(module->size);
         if (!variant->spirv) {
            free(variant);
            free(binary);
            return NULL;
         }

         memcpy(variant->spirv, module->data, module->size);
         variant->spirv_size = module->size;
      }
   }

   if (binary_out)
      *binary_out = binary;
   else
      free(binary);

   return variant;
}

struct radv_shader_variant *
radv_shader_variant_compile(struct radv_device *device, struct vk_shader_module *module,
                            struct nir_shader *const *shaders, int shader_count,
                            struct radv_pipeline_layout *layout,
                            const struct radv_shader_variant_key *key,
                            struct radv_shader_info *info, bool keep_shader_info,
                            bool keep_statistic_info, bool disable_optimizations,
                            struct radv_shader_binary **binary_out)
{
   gl_shader_stage stage = shaders[shader_count - 1]->info.stage;
   struct radv_nir_compiler_options options = {0};

   options.layout = layout;
   if (key)
      options.key = *key;

   options.explicit_scratch_args = !radv_use_llvm_for_stage(device, stage);
   options.robust_buffer_access = device->robust_buffer_access;
   options.disable_optimizations = disable_optimizations;
   options.wgp_mode = radv_should_use_wgp_mode(device, stage, info);

   return shader_variant_compile(device, module, shaders, shader_count, stage, info, &options,
                                 false, false, keep_shader_info, keep_statistic_info, binary_out);
}

struct radv_shader_variant *
radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *shader,
                           struct radv_shader_info *info, struct radv_shader_binary **binary_out,
                           bool keep_shader_info, bool keep_statistic_info, bool multiview,
                           bool disable_optimizations)
{
   struct radv_nir_compiler_options options = {0};
   gl_shader_stage stage = MESA_SHADER_VERTEX;

   options.explicit_scratch_args = !radv_use_llvm_for_stage(device, stage);
   options.key.has_multiview_view_index = multiview;
   options.disable_optimizations = disable_optimizations;

   return shader_variant_compile(device, NULL, &shader, 1, stage, info, &options, true, false,
                                 keep_shader_info, keep_statistic_info, binary_out);
}

struct radv_shader_variant *
radv_create_trap_handler_shader(struct radv_device *device)
{
   struct radv_nir_compiler_options options = {0};
   struct radv_shader_variant *shader = NULL;
   struct radv_shader_binary *binary = NULL;
   struct radv_shader_info info = {0};

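   /* The trap handler is compiled like an ordinary wave64 compute shader,
    * but from an empty NIR body; the backend is expected to emit the actual
    * handler code when is_trap_handler_shader is set on the args (see
    * shader_variant_compile above). */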
   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "meta_trap_handler");

   options.explicit_scratch_args = true;
   options.wgp_mode = radv_should_use_wgp_mode(device, MESA_SHADER_COMPUTE, &info);
   info.wave_size = 64;

   shader = shader_variant_compile(device, NULL, &b.shader, 1, MESA_SHADER_COMPUTE, &info, &options,
                                   false, true, true, false, &binary);

   ralloc_free(b.shader);
   free(binary);

   return shader;
}

void
radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_variant *variant)
{
   if (!p_atomic_dec_zero(&variant->ref_count))
      return;

   mtx_lock(&device->shader_slab_mutex);
   list_del(&variant->slab_list);
   mtx_unlock(&device->shader_slab_mutex);

   free(variant->spirv);
   free(variant->nir_string);
   free(variant->disasm_string);
   free(variant->ir_string);
   free(variant->statistics);
   free(variant);
}

const char *
radv_get_shader_name(struct radv_shader_info *info, gl_shader_stage stage)
{
   switch (stage) {
   case MESA_SHADER_VERTEX:
      if (info->vs.as_ls)
         return "Vertex Shader as LS";
      else if (info->vs.as_es)
         return "Vertex Shader as ES";
      else if (info->is_ngg)
         return "Vertex Shader as ESGS";
      else
         return "Vertex Shader as VS";
   case MESA_SHADER_TESS_CTRL:
      return "Tessellation Control Shader";
   case MESA_SHADER_TESS_EVAL:
      if (info->tes.as_es)
         return "Tessellation Evaluation Shader as ES";
      else if (info->is_ngg)
         return "Tessellation Evaluation Shader as ESGS";
      else
         return "Tessellation Evaluation Shader as VS";
   case MESA_SHADER_GEOMETRY:
      return "Geometry Shader";
   case MESA_SHADER_FRAGMENT:
      return "Pixel Shader";
   case MESA_SHADER_COMPUTE:
      return "Compute Shader";
   default:
      return "Unknown shader";
   }
}

unsigned
radv_get_max_workgroup_size(enum chip_class chip_class, gl_shader_stage stage,
                            const unsigned *sizes)
{
   switch (stage) {
   case MESA_SHADER_TESS_CTRL:
      return chip_class >= GFX7 ? 128 : 64;
   case MESA_SHADER_GEOMETRY:
      return chip_class >= GFX9 ? 128 : 64;
   case MESA_SHADER_COMPUTE:
      break;
   default:
      return 0;
   }

   unsigned max_workgroup_size = sizes[0] * sizes[1] * sizes[2];
   return max_workgroup_size;
}

unsigned
radv_get_max_waves(struct radv_device *device, struct radv_shader_variant *variant,
                   gl_shader_stage stage)
{
   struct radeon_info *info = &device->physical_device->rad_info;
   enum chip_class chip_class = info->chip_class;
   uint8_t wave_size = variant->info.wave_size;
   struct ac_shader_config *conf = &variant->config;
   unsigned max_simd_waves;
   unsigned lds_per_wave = 0;

   max_simd_waves = info->max_wave64_per_simd * (64 / wave_size);

   if (stage == MESA_SHADER_FRAGMENT) {
      lds_per_wave =
         conf->lds_size * info->lds_encode_granularity + variant->info.ps.num_interp * 48;
      lds_per_wave = align(lds_per_wave, info->lds_alloc_granularity);
   } else if (stage == MESA_SHADER_COMPUTE) {
      unsigned max_workgroup_size =
         radv_get_max_workgroup_size(chip_class, stage, variant->info.cs.block_size);
      lds_per_wave =
         align(conf->lds_size * info->lds_encode_granularity, info->lds_alloc_granularity);
      lds_per_wave /= DIV_ROUND_UP(max_workgroup_size, wave_size);
   }

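   /* Occupancy is also bounded by register usage. Before GFX10, SGPRs come
    * from a pool shared by all waves on a SIMD and are allocated in blocks
    * of 16 (GFX8+) or 8 (older chips); from GFX10 on, SGPRs no longer limit
    * the wave count. */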
   if (conf->num_sgprs && chip_class < GFX10) {
      unsigned sgprs = align(conf->num_sgprs, chip_class >= GFX8 ? 16 : 8);
      max_simd_waves = MIN2(max_simd_waves, info->num_physical_sgprs_per_simd / sgprs);
   }

   if (conf->num_vgprs) {
      unsigned physical_vgprs = info->num_physical_wave64_vgprs_per_simd * (64 / wave_size);
      unsigned vgprs = align(conf->num_vgprs, wave_size == 32 ? 8 : 4);
      if (chip_class >= GFX10_3)
         vgprs = align(vgprs, wave_size == 32 ? 16 : 8);
      max_simd_waves = MIN2(max_simd_waves, physical_vgprs / vgprs);
   }

   unsigned simd_per_workgroup = info->num_simd_per_compute_unit;
   if (chip_class >= GFX10)
      simd_per_workgroup *= 2; /* like lds_size_per_workgroup, assume WGP on GFX10+ */

   unsigned max_lds_per_simd = info->lds_size_per_workgroup / simd_per_workgroup;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, DIV_ROUND_UP(max_lds_per_simd, lds_per_wave));

   return chip_class >= GFX10 ? max_simd_waves * (wave_size / 32) : max_simd_waves;
}

VkResult
radv_GetShaderInfoAMD(VkDevice _device, VkPipeline _pipeline, VkShaderStageFlagBits shaderStage,
                      VkShaderInfoTypeAMD infoType, size_t *pInfoSize, void *pInfo)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
   gl_shader_stage stage = vk_to_mesa_shader_stage(shaderStage);
   struct radv_shader_variant *variant = pipeline->shaders[stage];
   VkResult result = VK_SUCCESS;

   /* Spec doesn't indicate what to do if the stage is invalid, so just
    * return no info for this. */
   if (!variant)
      return vk_error(device->instance, VK_ERROR_FEATURE_NOT_PRESENT);

   switch (infoType) {
   case VK_SHADER_INFO_TYPE_STATISTICS_AMD:
      if (!pInfo) {
         *pInfoSize = sizeof(VkShaderStatisticsInfoAMD);
      } else {
         unsigned lds_multiplier = device->physical_device->rad_info.lds_encode_granularity;
         struct ac_shader_config *conf = &variant->config;

         VkShaderStatisticsInfoAMD statistics = {0};
         statistics.shaderStageMask = shaderStage;
         statistics.numPhysicalVgprs =
            device->physical_device->rad_info.num_physical_wave64_vgprs_per_simd;
         statistics.numPhysicalSgprs =
            device->physical_device->rad_info.num_physical_sgprs_per_simd;
         statistics.numAvailableSgprs = statistics.numPhysicalSgprs;

         if (stage == MESA_SHADER_COMPUTE) {
            unsigned *local_size = variant->info.cs.block_size;
            unsigned workgroup_size = local_size[0] * local_size[1] * local_size[2];

            statistics.numAvailableVgprs =
               statistics.numPhysicalVgprs /
               ceil((double)workgroup_size / statistics.numPhysicalVgprs);

            statistics.computeWorkGroupSize[0] = local_size[0];
            statistics.computeWorkGroupSize[1] = local_size[1];
            statistics.computeWorkGroupSize[2] = local_size[2];
         } else {
            statistics.numAvailableVgprs = statistics.numPhysicalVgprs;
         }

         statistics.resourceUsage.numUsedVgprs = conf->num_vgprs;
         statistics.resourceUsage.numUsedSgprs = conf->num_sgprs;
         statistics.resourceUsage.ldsSizePerLocalWorkGroup = 32768;
         statistics.resourceUsage.ldsUsageSizeInBytes = conf->lds_size * lds_multiplier;
         statistics.resourceUsage.scratchMemUsageInBytes = conf->scratch_bytes_per_wave;

         size_t size = *pInfoSize;
         *pInfoSize = sizeof(statistics);

         memcpy(pInfo, &statistics, MIN2(size, *pInfoSize));

         if (size < *pInfoSize)
            result = VK_INCOMPLETE;
      }

      break;
   case VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD: {
      char *out;
      size_t outsize;
      struct u_memstream mem;
      u_memstream_open(&mem, &out, &outsize);
      FILE *const memf = u_memstream_get(&mem);

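      /* The report is composed in a malloc'ed buffer via u_memstream, then
       * copied out following the usual Vulkan two-call convention: pInfo ==
       * NULL returns the required size, and a smaller buffer yields
       * VK_INCOMPLETE. */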
"%s:\n", radv_get_shader_name(&variant->info, stage));1939fprintf(memf, "%s\n\n", variant->ir_string);1940fprintf(memf, "%s\n\n", variant->disasm_string);1941radv_dump_shader_stats(device, pipeline, stage, memf);1942u_memstream_close(&mem);19431944/* Need to include the null terminator. */1945size_t length = outsize + 1;19461947if (!pInfo) {1948*pInfoSize = length;1949} else {1950size_t size = *pInfoSize;1951*pInfoSize = length;19521953memcpy(pInfo, out, MIN2(size, length));19541955if (size < length)1956result = VK_INCOMPLETE;1957}19581959free(out);1960break;1961}1962default:1963/* VK_SHADER_INFO_TYPE_BINARY_AMD unimplemented for now. */1964result = VK_ERROR_FEATURE_NOT_PRESENT;1965break;1966}19671968return result;1969}19701971VkResult1972radv_dump_shader_stats(struct radv_device *device, struct radv_pipeline *pipeline,1973gl_shader_stage stage, FILE *output)1974{1975struct radv_shader_variant *shader = pipeline->shaders[stage];1976VkPipelineExecutablePropertiesKHR *props = NULL;1977uint32_t prop_count = 0;1978VkResult result;19791980VkPipelineInfoKHR pipeline_info = {0};1981pipeline_info.sType = VK_STRUCTURE_TYPE_PIPELINE_INFO_KHR;1982pipeline_info.pipeline = radv_pipeline_to_handle(pipeline);19831984result = radv_GetPipelineExecutablePropertiesKHR(radv_device_to_handle(device), &pipeline_info,1985&prop_count, NULL);1986if (result != VK_SUCCESS)1987return result;19881989props = calloc(prop_count, sizeof(*props));1990if (!props)1991return VK_ERROR_OUT_OF_HOST_MEMORY;19921993result = radv_GetPipelineExecutablePropertiesKHR(radv_device_to_handle(device), &pipeline_info,1994&prop_count, props);1995if (result != VK_SUCCESS)1996goto fail;19971998for (unsigned exec_idx = 0; exec_idx < prop_count; exec_idx++) {1999if (!(props[exec_idx].stages & mesa_to_vk_shader_stage(stage)))2000continue;20012002VkPipelineExecutableStatisticKHR *stats = NULL;2003uint32_t stat_count = 0;20042005VkPipelineExecutableInfoKHR exec_info = {0};2006exec_info.pipeline = radv_pipeline_to_handle(pipeline);2007exec_info.executableIndex = exec_idx;20082009result = radv_GetPipelineExecutableStatisticsKHR(radv_device_to_handle(device), &exec_info,2010&stat_count, NULL);2011if (result != VK_SUCCESS)2012goto fail;20132014stats = calloc(stat_count, sizeof(*stats));2015if (!stats) {2016result = VK_ERROR_OUT_OF_HOST_MEMORY;2017goto fail;2018}20192020result = radv_GetPipelineExecutableStatisticsKHR(radv_device_to_handle(device), &exec_info,2021&stat_count, stats);2022if (result != VK_SUCCESS) {2023free(stats);2024goto fail;2025}20262027fprintf(output, "\n%s:\n", radv_get_shader_name(&shader->info, stage));2028fprintf(output, "*** SHADER STATS ***\n");20292030for (unsigned i = 0; i < stat_count; i++) {2031fprintf(output, "%s: ", stats[i].name);2032switch (stats[i].format) {2033case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR:2034fprintf(output, "%s", stats[i].value.b32 == VK_TRUE ? "true" : "false");2035break;2036case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_INT64_KHR:2037fprintf(output, "%" PRIi64, stats[i].value.i64);2038break;2039case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR:2040fprintf(output, "%" PRIu64, stats[i].value.u64);2041break;2042case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_FLOAT64_KHR:2043fprintf(output, "%f", stats[i].value.f64);2044break;2045default:2046unreachable("Invalid pipeline statistic format");2047}2048fprintf(output, "\n");2049}20502051fprintf(output, "********************\n\n\n");20522053free(stats);2054}20552056fail:2057free(props);2058return result;2059}206020612062