/* Path: blob/21.2-virgl/src/intel/compiler/brw_compiler.c (4550 views) */
/*1* Copyright © 2015-2016 Intel Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*/2223#include "brw_compiler.h"24#include "brw_shader.h"25#include "brw_eu.h"26#include "dev/intel_debug.h"27#include "compiler/nir/nir.h"28#include "main/errors.h"29#include "util/debug.h"3031#define COMMON_OPTIONS \32.lower_fdiv = true, \33.lower_scmp = true, \34.lower_flrp16 = true, \35.lower_fmod = true, \36.lower_bitfield_extract = true, \37.lower_bitfield_insert = true, \38.lower_uadd_carry = true, \39.lower_usub_borrow = true, \40.lower_flrp64 = true, \41.lower_isign = true, \42.lower_ldexp = true, \43.lower_device_index_to_zero = true, \44.vectorize_io = true, \45.use_interpolated_input_intrinsics = true, \46.lower_insert_byte = true, \47.lower_insert_word = true, \48.vertex_id_zero_based = true, \49.lower_base_vertex = true, \50.use_scoped_barrier = true, \51.support_16bit_alu = true, \52.lower_uniforms_to_ubo = true, \53.has_txs = 
true5455#define COMMON_SCALAR_OPTIONS \56.lower_to_scalar = true, \57.lower_pack_half_2x16 = true, \58.lower_pack_snorm_2x16 = true, \59.lower_pack_snorm_4x8 = true, \60.lower_pack_unorm_2x16 = true, \61.lower_pack_unorm_4x8 = true, \62.lower_unpack_half_2x16 = true, \63.lower_unpack_snorm_2x16 = true, \64.lower_unpack_snorm_4x8 = true, \65.lower_unpack_unorm_2x16 = true, \66.lower_unpack_unorm_4x8 = true, \67.lower_usub_sat64 = true, \68.lower_hadd64 = true, \69.lower_bfe_with_two_constants = true, \70.max_unroll_iterations = 327172static const struct nir_shader_compiler_options scalar_nir_options = {73COMMON_OPTIONS,74COMMON_SCALAR_OPTIONS,75};7677static const struct nir_shader_compiler_options vector_nir_options = {78COMMON_OPTIONS,7980/* In the vec4 backend, our dpN instruction replicates its result to all the81* components of a vec4. We would like NIR to give us replicated fdot82* instructions because it can optimize better for us.83*/84.fdot_replicates = true,8586.lower_pack_snorm_2x16 = true,87.lower_pack_unorm_2x16 = true,88.lower_unpack_snorm_2x16 = true,89.lower_unpack_unorm_2x16 = true,90.lower_extract_byte = true,91.lower_extract_word = true,92.intel_vec4 = true,93.max_unroll_iterations = 32,94};9596struct brw_compiler *97brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)98{99struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);100101compiler->devinfo = devinfo;102103brw_fs_alloc_reg_sets(compiler);104brw_vec4_alloc_reg_set(compiler);105106compiler->precise_trig = env_var_as_boolean("INTEL_PRECISE_TRIG", false);107108compiler->use_tcs_8_patch =109devinfo->ver >= 12 ||110(devinfo->ver >= 9 && (INTEL_DEBUG & DEBUG_TCS_EIGHT_PATCH));111112/* Default to the sampler since that's what we've done since forever */113compiler->indirect_ubos_use_sampler = true;114115/* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. 
*/116for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {117compiler->scalar_stage[i] = devinfo->ver >= 8 ||118i == MESA_SHADER_FRAGMENT || i == MESA_SHADER_COMPUTE;119}120121for (int i = MESA_SHADER_TASK; i < MESA_VULKAN_SHADER_STAGES; i++)122compiler->scalar_stage[i] = true;123124nir_lower_int64_options int64_options =125nir_lower_imul64 |126nir_lower_isign64 |127nir_lower_divmod64 |128nir_lower_imul_high64;129nir_lower_doubles_options fp64_options =130nir_lower_drcp |131nir_lower_dsqrt |132nir_lower_drsq |133nir_lower_dtrunc |134nir_lower_dfloor |135nir_lower_dceil |136nir_lower_dfract |137nir_lower_dround_even |138nir_lower_dmod |139nir_lower_dsub |140nir_lower_ddiv;141142if (!devinfo->has_64bit_float || (INTEL_DEBUG & DEBUG_SOFT64)) {143int64_options |= (nir_lower_int64_options)~0;144fp64_options |= nir_lower_fp64_full_software;145}146147/* The Bspec's section tittled "Instruction_multiply[DevBDW+]" claims that148* destination type can be Quadword and source type Doubleword for Gfx8 and149* Gfx9. So, lower 64 bit multiply instruction on rest of the platforms.150*/151if (devinfo->ver < 8 || devinfo->ver > 9)152int64_options |= nir_lower_imul_2x32_64;153154/* We want the GLSL compiler to emit code that uses condition codes */155for (int i = 0; i < MESA_ALL_SHADER_STAGES; i++) {156compiler->glsl_compiler_options[i].MaxUnrollIterations = 0;157compiler->glsl_compiler_options[i].MaxIfDepth =158devinfo->ver < 6 ? 
16 : UINT_MAX;159160/* We handle this in NIR */161compiler->glsl_compiler_options[i].EmitNoIndirectInput = false;162compiler->glsl_compiler_options[i].EmitNoIndirectOutput = false;163compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false;164compiler->glsl_compiler_options[i].EmitNoIndirectTemp = false;165166bool is_scalar = compiler->scalar_stage[i];167compiler->glsl_compiler_options[i].OptimizeForAOS = !is_scalar;168169struct nir_shader_compiler_options *nir_options =170rzalloc(compiler, struct nir_shader_compiler_options);171if (is_scalar) {172*nir_options = scalar_nir_options;173} else {174*nir_options = vector_nir_options;175}176177/* Prior to Gfx6, there are no three source operations, and Gfx11 loses178* LRP.179*/180nir_options->lower_ffma16 = devinfo->ver < 6;181nir_options->lower_ffma32 = devinfo->ver < 6;182nir_options->lower_ffma64 = devinfo->ver < 6;183nir_options->lower_flrp32 = devinfo->ver < 6 || devinfo->ver >= 11;184nir_options->lower_fpow = devinfo->ver >= 12;185186nir_options->lower_rotate = devinfo->ver < 11;187nir_options->lower_bitfield_reverse = devinfo->ver < 7;188189nir_options->lower_int64_options = int64_options;190nir_options->lower_doubles_options = fp64_options;191192/* Starting with Gfx11, we lower away 8-bit arithmetic */193nir_options->support_8bit_alu = devinfo->ver < 11;194195nir_options->unify_interfaces = i < MESA_SHADER_FRAGMENT;196197compiler->glsl_compiler_options[i].NirOptions = nir_options;198199compiler->glsl_compiler_options[i].ClampBlockIndicesToArrayBounds = true;200}201202return compiler;203}204205static void206insert_u64_bit(uint64_t *val, bool add)207{208*val = (*val << 1) | !!add;209}210211uint64_t212brw_get_compiler_config_value(const struct brw_compiler *compiler)213{214uint64_t config = 0;215insert_u64_bit(&config, compiler->precise_trig);216if (compiler->devinfo->ver >= 8 && compiler->devinfo->ver < 10) {217insert_u64_bit(&config, compiler->scalar_stage[MESA_SHADER_VERTEX]);218insert_u64_bit(&config, 
compiler->scalar_stage[MESA_SHADER_TESS_CTRL]);219insert_u64_bit(&config, compiler->scalar_stage[MESA_SHADER_TESS_EVAL]);220insert_u64_bit(&config, compiler->scalar_stage[MESA_SHADER_GEOMETRY]);221}222uint64_t debug_bits = INTEL_DEBUG;223uint64_t mask = DEBUG_DISK_CACHE_MASK;224while (mask != 0) {225const uint64_t bit = 1ULL << (ffsll(mask) - 1);226insert_u64_bit(&config, (debug_bits & bit) != 0);227mask &= ~bit;228}229return config;230}231232unsigned233brw_prog_data_size(gl_shader_stage stage)234{235static const size_t stage_sizes[] = {236[MESA_SHADER_VERTEX] = sizeof(struct brw_vs_prog_data),237[MESA_SHADER_TESS_CTRL] = sizeof(struct brw_tcs_prog_data),238[MESA_SHADER_TESS_EVAL] = sizeof(struct brw_tes_prog_data),239[MESA_SHADER_GEOMETRY] = sizeof(struct brw_gs_prog_data),240[MESA_SHADER_FRAGMENT] = sizeof(struct brw_wm_prog_data),241[MESA_SHADER_COMPUTE] = sizeof(struct brw_cs_prog_data),242[MESA_SHADER_RAYGEN] = sizeof(struct brw_bs_prog_data),243[MESA_SHADER_ANY_HIT] = sizeof(struct brw_bs_prog_data),244[MESA_SHADER_CLOSEST_HIT] = sizeof(struct brw_bs_prog_data),245[MESA_SHADER_MISS] = sizeof(struct brw_bs_prog_data),246[MESA_SHADER_INTERSECTION] = sizeof(struct brw_bs_prog_data),247[MESA_SHADER_CALLABLE] = sizeof(struct brw_bs_prog_data),248[MESA_SHADER_KERNEL] = sizeof(struct brw_cs_prog_data),249};250assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_sizes));251return stage_sizes[stage];252}253254unsigned255brw_prog_key_size(gl_shader_stage stage)256{257static const size_t stage_sizes[] = {258[MESA_SHADER_VERTEX] = sizeof(struct brw_vs_prog_key),259[MESA_SHADER_TESS_CTRL] = sizeof(struct brw_tcs_prog_key),260[MESA_SHADER_TESS_EVAL] = sizeof(struct brw_tes_prog_key),261[MESA_SHADER_GEOMETRY] = sizeof(struct brw_gs_prog_key),262[MESA_SHADER_FRAGMENT] = sizeof(struct brw_wm_prog_key),263[MESA_SHADER_COMPUTE] = sizeof(struct brw_cs_prog_key),264[MESA_SHADER_RAYGEN] = sizeof(struct brw_bs_prog_key),265[MESA_SHADER_ANY_HIT] = sizeof(struct 
brw_bs_prog_key),266[MESA_SHADER_CLOSEST_HIT] = sizeof(struct brw_bs_prog_key),267[MESA_SHADER_MISS] = sizeof(struct brw_bs_prog_key),268[MESA_SHADER_INTERSECTION] = sizeof(struct brw_bs_prog_key),269[MESA_SHADER_CALLABLE] = sizeof(struct brw_bs_prog_key),270[MESA_SHADER_KERNEL] = sizeof(struct brw_cs_prog_key),271};272assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_sizes));273return stage_sizes[stage];274}275276void277brw_write_shader_relocs(const struct intel_device_info *devinfo,278void *program,279const struct brw_stage_prog_data *prog_data,280struct brw_shader_reloc_value *values,281unsigned num_values)282{283for (unsigned i = 0; i < prog_data->num_relocs; i++) {284assert(prog_data->relocs[i].offset % 8 == 0);285void *dst = program + prog_data->relocs[i].offset;286for (unsigned j = 0; j < num_values; j++) {287if (prog_data->relocs[i].id == values[j].id) {288uint32_t value = values[j].value + prog_data->relocs[i].delta;289switch (prog_data->relocs[i].type) {290case BRW_SHADER_RELOC_TYPE_U32:291*(uint32_t *)dst = value;292break;293case BRW_SHADER_RELOC_TYPE_MOV_IMM:294brw_update_reloc_imm(devinfo, dst, value);295break;296default:297unreachable("Invalid relocation type");298}299break;300}301}302}303}304305306