Path: blob/21.2-virgl/src/microsoft/compiler/dxil_nir.c
/*
 * Copyright © Microsoft Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "dxil_nir.h"

#include "nir_builder.h"
#include "nir_deref.h"
#include "nir_to_dxil.h"
#include "util/u_math.h"

static void
cl_type_size_align(const struct glsl_type *type, unsigned *size,
                   unsigned *align)
{
   *size = glsl_get_cl_size(type);
   *align = glsl_get_cl_alignment(type);
}

static void
extract_comps_from_vec32(nir_builder *b, nir_ssa_def *vec32,
                         unsigned dst_bit_size,
                         nir_ssa_def **dst_comps,
                         unsigned num_dst_comps)
{
   unsigned step = DIV_ROUND_UP(dst_bit_size, 32);
   unsigned comps_per32b = 32 / dst_bit_size;
   nir_ssa_def *tmp;

   for (unsigned i = 0; i < vec32->num_components; i += step) {
      switch (dst_bit_size) {
      case 64:
         tmp = nir_pack_64_2x32_split(b, nir_channel(b, vec32, i),
                                      nir_channel(b, vec32, i + 1));
         dst_comps[i / 2] = tmp;
         break;
      case 32:
         dst_comps[i] = nir_channel(b, vec32, i);
         break;
      case 16:
      case 8: {
         unsigned dst_offs = i * comps_per32b;

         tmp = nir_unpack_bits(b, nir_channel(b, vec32, i), dst_bit_size);
         for (unsigned j = 0; j < comps_per32b && dst_offs + j < num_dst_comps; j++)
            dst_comps[dst_offs + j] = nir_channel(b, tmp, j);
      }

         break;
      }
   }
}

static nir_ssa_def *
load_comps_to_vec32(nir_builder *b, unsigned src_bit_size,
                    nir_ssa_def **src_comps, unsigned num_src_comps)
{
   unsigned num_vec32comps = DIV_ROUND_UP(num_src_comps * src_bit_size, 32);
   unsigned step = DIV_ROUND_UP(src_bit_size, 32);
   unsigned comps_per32b = 32 / src_bit_size;
   nir_ssa_def *vec32comps[4];

   for (unsigned i = 0; i < num_vec32comps; i += step) {
      switch (src_bit_size) {
      case 64:
         vec32comps[i] = nir_unpack_64_2x32_split_x(b, src_comps[i / 2]);
         vec32comps[i + 1] = nir_unpack_64_2x32_split_y(b, src_comps[i / 2]);
         break;
      case 32:
         vec32comps[i] = src_comps[i];
         break;
      case 16:
      case 8: {
         unsigned src_offs = i * comps_per32b;

         vec32comps[i] = nir_u2u32(b, src_comps[src_offs]);
         for (unsigned j = 1; j < comps_per32b && src_offs + j < num_src_comps; j++) {
            nir_ssa_def *tmp = nir_ishl(b, nir_u2u32(b, src_comps[src_offs + j]),
                                        nir_imm_int(b, j * src_bit_size));
            vec32comps[i] = nir_ior(b, vec32comps[i], tmp);
         }
         break;
      }
      }
   }

   return nir_vec(b, vec32comps, num_vec32comps);
}

static nir_ssa_def *
build_load_ptr_dxil(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *idx)
{
   return nir_load_ptr_dxil(b, 1, 32, &deref->dest.ssa, idx);
}

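/* Shader-temp loads are rewritten into 32-bit load_ptr_dxil accesses of the
 * containing words. Illustrative example (not a verbatim dump of the pass
 * output): a 16-bit scalar load_deref at byte offset 6 becomes a load of
 * word 1 (offset 6 >> 2), a right shift by 16 bits ((6 & 3) * 8), and an
 * extraction of the low 16 bits.
 */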
static bool
lower_load_deref(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->dest.is_ssa);

   b->cursor = nir_before_instr(&intr->instr);

   nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
   if (!nir_deref_mode_is(deref, nir_var_shader_temp))
      return false;
   nir_ssa_def *ptr = nir_u2u32(b, nir_build_deref_offset(b, deref, cl_type_size_align));
   nir_ssa_def *offset = nir_iand(b, ptr, nir_inot(b, nir_imm_int(b, 3)));

   assert(intr->dest.is_ssa);
   unsigned num_components = nir_dest_num_components(intr->dest);
   unsigned bit_size = nir_dest_bit_size(intr->dest);
   unsigned load_size = MAX2(32, bit_size);
   unsigned num_bits = num_components * bit_size;
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   unsigned comp_idx = 0;

   nir_deref_path path;
   nir_deref_path_init(&path, deref, NULL);
   nir_ssa_def *base_idx = nir_ishr(b, offset, nir_imm_int(b, 2 /* log2(32 / 8) */));

   /* Split loads into 32-bit chunks */
   for (unsigned i = 0; i < num_bits; i += load_size) {
      unsigned subload_num_bits = MIN2(num_bits - i, load_size);
      nir_ssa_def *idx = nir_iadd(b, base_idx, nir_imm_int(b, i / 32));
      nir_ssa_def *vec32 = build_load_ptr_dxil(b, path.path[0], idx);

      if (load_size == 64) {
         idx = nir_iadd(b, idx, nir_imm_int(b, 1));
         vec32 = nir_vec2(b, vec32,
                          build_load_ptr_dxil(b, path.path[0], idx));
      }

      /* If we have 2 bytes or less to load we need to adjust the u32 value so
       * we can always extract the LSB.
       */
      if (subload_num_bits <= 16) {
         nir_ssa_def *shift = nir_imul(b, nir_iand(b, ptr, nir_imm_int(b, 3)),
                                       nir_imm_int(b, 8));
         vec32 = nir_ushr(b, vec32, shift);
      }

      /* And now comes the pack/unpack step to match the original type. */
      extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx],
                               subload_num_bits / bit_size);
      comp_idx += subload_num_bits / bit_size;
   }

   nir_deref_path_finish(&path);
   assert(comp_idx == num_components);
   nir_ssa_def *result = nir_vec(b, comps, num_components);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   nir_instr_remove(&intr->instr);
   return true;
}

static nir_ssa_def *
ubo_load_select_32b_comps(nir_builder *b, nir_ssa_def *vec32,
                          nir_ssa_def *offset, unsigned num_bytes)
{
   assert(num_bytes == 16 || num_bytes == 12 || num_bytes == 8 ||
          num_bytes == 4 || num_bytes == 3 || num_bytes == 2 ||
          num_bytes == 1);
   assert(vec32->num_components == 4);

   /* 16 and 12 byte types are always aligned on 16 bytes. */
   if (num_bytes > 8)
      return vec32;

   nir_ssa_def *comps[4];
   nir_ssa_def *cond;

   for (unsigned i = 0; i < 4; i++)
      comps[i] = nir_channel(b, vec32, i);

   /* If we have 8 bytes or less to load, select which half of the vec4
    * should be used.
    */
   cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x8)),
                  nir_imm_int(b, 0));

   comps[0] = nir_bcsel(b, cond, comps[2], comps[0]);
   comps[1] = nir_bcsel(b, cond, comps[3], comps[1]);

   /* Thanks to the CL alignment constraints, if we want 8 bytes we're done. */
   if (num_bytes == 8)
      return nir_vec(b, comps, 2);

   /* 4 bytes or less needed, select which of the 32bit components should be
    * used and return it. The sub-32bit split is handled in
    * extract_comps_from_vec32().
    */
   cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x4)),
                  nir_imm_int(b, 0));
   return nir_bcsel(b, cond, comps[1], comps[0]);
}

nir_ssa_def *
build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer,
                    nir_ssa_def *offset, unsigned num_components,
                    unsigned bit_size)
{
   nir_ssa_def *idx = nir_ushr(b, offset, nir_imm_int(b, 4));
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   unsigned num_bits = num_components * bit_size;
   unsigned comp_idx = 0;

   /* We need to split loads in 16byte chunks because that's the
    * granularity of cBufferLoadLegacy().
    */
   for (unsigned i = 0; i < num_bits; i += (16 * 8)) {
      /* For each 16byte chunk (or smaller) we generate a 32bit ubo vec
       * load.
       */
      unsigned subload_num_bits = MIN2(num_bits - i, 16 * 8);
      nir_ssa_def *vec32 =
         nir_load_ubo_dxil(b, 4, 32, buffer, nir_iadd(b, idx, nir_imm_int(b, i / (16 * 8))));

      /* First re-arrange the vec32 to account for intra 16-byte offset. */
      vec32 = ubo_load_select_32b_comps(b, vec32, offset, subload_num_bits / 8);

      /* If we have 2 bytes or less to load we need to adjust the u32 value so
       * we can always extract the LSB.
       */
      if (subload_num_bits <= 16) {
         nir_ssa_def *shift = nir_imul(b, nir_iand(b, offset,
                                                   nir_imm_int(b, 3)),
                                       nir_imm_int(b, 8));
         vec32 = nir_ushr(b, vec32, shift);
      }

      /* And now comes the pack/unpack step to match the original type. */
      extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx],
                               subload_num_bits / bit_size);
      comp_idx += subload_num_bits / bit_size;
   }

   assert(comp_idx == num_components);
   return nir_vec(b, comps, num_components);
}

static bool
lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->dest.is_ssa);
   assert(intr->src[0].is_ssa);
   assert(intr->src[1].is_ssa);

   b->cursor = nir_before_instr(&intr->instr);

   nir_ssa_def *buffer = intr->src[0].ssa;
   nir_ssa_def *offset = nir_iand(b, intr->src[1].ssa, nir_imm_int(b, ~3));
   enum gl_access_qualifier access = nir_intrinsic_access(intr);
   unsigned bit_size = nir_dest_bit_size(intr->dest);
   unsigned num_components = nir_dest_num_components(intr->dest);
   unsigned num_bits = num_components * bit_size;

   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   unsigned comp_idx = 0;

   /* We need to split loads in 16byte chunks because that's the optimal
    * granularity of bufferLoad(). Minimum alignment is 4byte, which saves
    * us from extra complexity to extract >= 32 bit components.
    */
   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
      /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec
       * load.
       */
      unsigned subload_num_bits = MIN2(num_bits - i, 4 * 32);

      /* The number of components to load depends on the number of bytes. */
      nir_ssa_def *vec32 =
         nir_load_ssbo(b, DIV_ROUND_UP(subload_num_bits, 32), 32,
                       buffer, nir_iadd(b, offset, nir_imm_int(b, i / 8)),
                       .align_mul = 4,
                       .align_offset = 0,
                       .access = access);

      /* If we have 2 bytes or less to load we need to adjust the u32 value so
       * we can always extract the LSB.
       */
      if (subload_num_bits <= 16) {
         nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, 3)),
                                       nir_imm_int(b, 8));
         vec32 = nir_ushr(b, vec32, shift);
      }

      /* And now comes the pack/unpack step to match the original type. */
      extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx],
                               subload_num_bits / bit_size);
      comp_idx += subload_num_bits / bit_size;
   }

   assert(comp_idx == num_components);
   nir_ssa_def *result = nir_vec(b, comps, num_components);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   nir_instr_remove(&intr->instr);
   return true;
}

static bool
lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr)
{
   b->cursor = nir_before_instr(&intr->instr);

   assert(intr->src[0].is_ssa);
   assert(intr->src[1].is_ssa);
   assert(intr->src[2].is_ssa);

   nir_ssa_def *val = intr->src[0].ssa;
   nir_ssa_def *buffer = intr->src[1].ssa;
   nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~3));

   unsigned bit_size = val->bit_size;
   unsigned num_components = val->num_components;
   unsigned num_bits = num_components * bit_size;

   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   unsigned comp_idx = 0;

   for (unsigned i = 0; i < num_components; i++)
      comps[i] = nir_channel(b, val, i);

   /* We split stores in 16byte chunks because that's the optimal granularity
    * of bufferStore(). Minimum alignment is 4byte, which saves us from
    * extra complexity to store >= 32 bit components.
    */
   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
      /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec
       * store.
       */
      unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32);
      nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8));
      nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx],
                                               substore_num_bits / bit_size);
      nir_intrinsic_instr *store;

      if (substore_num_bits < 32) {
         nir_ssa_def *mask = nir_imm_int(b, (1 << substore_num_bits) - 1);

         /* If we have 16 bits or less to store we need to place them
          * correctly in the u32 component. Anything greater than 16 bits
          * (including uchar3) is naturally aligned on 32bits.
          */
         if (substore_num_bits <= 16) {
            nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, 3));
            nir_ssa_def *shift = nir_imul_imm(b, pos, 8);

            vec32 = nir_ishl(b, vec32, shift);
            mask = nir_ishl(b, mask, shift);
         }

         store = nir_intrinsic_instr_create(b->shader,
                                            nir_intrinsic_store_ssbo_masked_dxil);
         store->src[0] = nir_src_for_ssa(vec32);
         store->src[1] = nir_src_for_ssa(nir_inot(b, mask));
         store->src[2] = nir_src_for_ssa(buffer);
         store->src[3] = nir_src_for_ssa(local_offset);
      } else {
         store = nir_intrinsic_instr_create(b->shader,
                                            nir_intrinsic_store_ssbo);
         store->src[0] = nir_src_for_ssa(vec32);
         store->src[1] = nir_src_for_ssa(buffer);
         store->src[2] = nir_src_for_ssa(local_offset);

         nir_intrinsic_set_align(store, 4, 0);
      }

      /* The number of components to store depends on the number of bits. */
      store->num_components = DIV_ROUND_UP(substore_num_bits, 32);
      nir_builder_instr_insert(b, &store->instr);
      comp_idx += substore_num_bits / bit_size;
   }

   nir_instr_remove(&intr->instr);
   return true;
}

static void
lower_load_vec32(nir_builder *b, nir_ssa_def *index, unsigned num_comps, nir_ssa_def **comps, nir_intrinsic_op op)
{
   for (unsigned i = 0; i < num_comps; i++) {
      nir_intrinsic_instr *load =
         nir_intrinsic_instr_create(b->shader, op);

      load->num_components = 1;
      load->src[0] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i)));
      nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
      nir_builder_instr_insert(b, &load->instr);
      comps[i] = &load->dest.ssa;
   }
}

static bool
lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->dest.is_ssa);
   unsigned bit_size = nir_dest_bit_size(intr->dest);
   unsigned num_components = nir_dest_num_components(intr->dest);
   unsigned num_bits = num_components * bit_size;

   b->cursor = nir_before_instr(&intr->instr);
   nir_intrinsic_op op = intr->intrinsic;

   assert(intr->src[0].is_ssa);
   nir_ssa_def *offset = intr->src[0].ssa;
   if (op == nir_intrinsic_load_shared) {
      offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr)));
      op = nir_intrinsic_load_shared_dxil;
   } else {
      offset = nir_u2u32(b, offset);
      op = nir_intrinsic_load_scratch_dxil;
   }
   nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2));
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   nir_ssa_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2];

   /* We need to split loads in 32-bit accesses because the buffer
    * is an i32 array and DXIL does not support type casts.
    */
   unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32);
   lower_load_vec32(b, index, num_32bit_comps, comps_32bit, op);
   unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4);

   for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) {
      unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4);
      unsigned num_dest_comps = num_vec32_comps * 32 / bit_size;
      nir_ssa_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps);

      /* If we have 16 bits or less to load we need to adjust the u32 value so
       * we can always extract the LSB.
       */
      if (num_bits <= 16) {
         nir_ssa_def *shift =
            nir_imul(b, nir_iand(b, offset, nir_imm_int(b, 3)),
                     nir_imm_int(b, 8));
         vec32 = nir_ushr(b, vec32, shift);
      }

      /* And now comes the pack/unpack step to match the original type. */
      unsigned dest_index = i * 32 / bit_size;
      extract_comps_from_vec32(b, vec32, bit_size, &comps[dest_index], num_dest_comps);
   }

   nir_ssa_def *result = nir_vec(b, comps, num_components);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   nir_instr_remove(&intr->instr);

   return true;
}

static void
lower_store_vec32(nir_builder *b, nir_ssa_def *index, nir_ssa_def *vec32, nir_intrinsic_op op)
{

   for (unsigned i = 0; i < vec32->num_components; i++) {
      nir_intrinsic_instr *store =
         nir_intrinsic_instr_create(b->shader, op);

      store->src[0] = nir_src_for_ssa(nir_channel(b, vec32, i));
      store->src[1] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i)));
      store->num_components = 1;
      nir_builder_instr_insert(b, &store->instr);
   }
}

static void
lower_masked_store_vec32(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index,
                         nir_ssa_def *vec32, unsigned num_bits, nir_intrinsic_op op)
{
   nir_ssa_def *mask = nir_imm_int(b, (1 << num_bits) - 1);

   /* If we have 16 bits or less to store we need to place them correctly in
    * the u32 component. Anything greater than 16 bits (including uchar3) is
    * naturally aligned on 32bits.
    */
   if (num_bits <= 16) {
      nir_ssa_def *shift =
         nir_imul_imm(b, nir_iand(b, offset, nir_imm_int(b, 3)), 8);

      vec32 = nir_ishl(b, vec32, shift);
      mask = nir_ishl(b, mask, shift);
   }

   if (op == nir_intrinsic_store_shared_dxil) {
      /* Use the dedicated masked intrinsic */
      nir_store_shared_masked_dxil(b, vec32, nir_inot(b, mask), index);
   } else {
      /* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */
      nir_ssa_def *load = nir_load_scratch_dxil(b, 1, 32, index);

      nir_ssa_def *new_val = nir_ior(b, vec32,
                                     nir_iand(b,
                                              nir_inot(b, mask),
                                              load));

      lower_store_vec32(b, index, new_val, op);
   }
}

static bool
lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->src[0].is_ssa);
   unsigned num_components = nir_src_num_components(intr->src[0]);
   unsigned bit_size = nir_src_bit_size(intr->src[0]);
   unsigned num_bits = num_components * bit_size;

   b->cursor = nir_before_instr(&intr->instr);
   nir_intrinsic_op op = intr->intrinsic;

   nir_ssa_def *offset = intr->src[1].ssa;
   if (op == nir_intrinsic_store_shared) {
      offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr)));
      op = nir_intrinsic_store_shared_dxil;
   } else {
      offset = nir_u2u32(b, offset);
      op = nir_intrinsic_store_scratch_dxil;
   }
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];

   unsigned comp_idx = 0;
   for (unsigned i = 0; i < num_components; i++)
      comps[i] = nir_channel(b, intr->src[0].ssa, i);

   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
      /* For each 4byte chunk (or smaller) we generate a 32bit scalar store.
       */
      unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32);
      nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8));
      nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx],
                                               substore_num_bits / bit_size);
      nir_ssa_def *index = nir_ushr(b, local_offset, nir_imm_int(b, 2));

      /* For anything less than 32bits we need to use the masked version of the
       * intrinsic to preserve data living in the same 32bit slot.
       */
      if (num_bits < 32) {
         lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, op);
      } else {
         lower_store_vec32(b, index, vec32, op);
      }

      comp_idx += substore_num_bits / bit_size;
   }

   nir_instr_remove(&intr->instr);

   return true;
}

static void
ubo_to_temp_patch_deref_mode(nir_deref_instr *deref)
{
   deref->modes = nir_var_shader_temp;
   nir_foreach_use(use_src, &deref->dest.ssa) {
      if (use_src->parent_instr->type != nir_instr_type_deref)
         continue;

      nir_deref_instr *parent = nir_instr_as_deref(use_src->parent_instr);
      ubo_to_temp_patch_deref_mode(parent);
   }
}

static void
ubo_to_temp_update_entry(nir_deref_instr *deref, struct hash_entry *he)
{
   assert(nir_deref_mode_is(deref, nir_var_mem_constant));
   assert(deref->dest.is_ssa);
   assert(he->data);

   nir_foreach_use(use_src, &deref->dest.ssa) {
      if (use_src->parent_instr->type == nir_instr_type_deref) {
         ubo_to_temp_update_entry(nir_instr_as_deref(use_src->parent_instr), he);
      } else if (use_src->parent_instr->type == nir_instr_type_intrinsic) {
         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(use_src->parent_instr);
         if (intr->intrinsic != nir_intrinsic_load_deref)
            he->data = NULL;
      } else {
         he->data = NULL;
      }

      if (!he->data)
         break;
   }
}

bool
dxil_nir_lower_ubo_to_temp(nir_shader *nir)
{
   struct hash_table *ubo_to_temp = _mesa_pointer_hash_table_create(NULL);
   bool progress = false;

   /* First pass: collect all UBO accesses that could be turned into
    * shader temp accesses.
    */
   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;

            nir_deref_instr *deref = nir_instr_as_deref(instr);
            if (!nir_deref_mode_is(deref, nir_var_mem_constant) ||
                deref->deref_type != nir_deref_type_var)
               continue;

            struct hash_entry *he =
               _mesa_hash_table_search(ubo_to_temp, deref->var);

            if (!he)
               he = _mesa_hash_table_insert(ubo_to_temp, deref->var, deref->var);

            if (!he->data)
               continue;

            ubo_to_temp_update_entry(deref, he);
         }
      }
   }

   hash_table_foreach(ubo_to_temp, he) {
      nir_variable *var = he->data;

      if (!var)
         continue;

      /* Change the variable mode. */
      var->data.mode = nir_var_shader_temp;

      /* Make sure the variable has a name.
       * DXIL variables must have names.
       */
      if (!var->name)
         var->name = ralloc_asprintf(nir, "global_%d", exec_list_length(&nir->variables));

      progress = true;
   }
   _mesa_hash_table_destroy(ubo_to_temp, NULL);

   /* Second pass: patch all derefs that were accessing the converted UBOs
    * variables.
    */
   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;

            nir_deref_instr *deref = nir_instr_as_deref(instr);
            if (nir_deref_mode_is(deref, nir_var_mem_constant) &&
                deref->deref_type == nir_deref_type_var &&
                deref->var->data.mode == nir_var_shader_temp)
               ubo_to_temp_patch_deref_mode(deref);
         }
      }
   }

   return progress;
}

static bool
lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->dest.is_ssa);
   assert(intr->src[0].is_ssa);
   assert(intr->src[1].is_ssa);

   b->cursor = nir_before_instr(&intr->instr);

   nir_ssa_def *result =
      build_load_ubo_dxil(b, intr->src[0].ssa, intr->src[1].ssa,
                          nir_dest_num_components(intr->dest),
                          nir_dest_bit_size(intr->dest));

   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   nir_instr_remove(&intr->instr);
   return true;
}

bool
dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            switch (intr->intrinsic) {
            case nir_intrinsic_load_deref:
               progress |= lower_load_deref(&b, intr);
               break;
            case nir_intrinsic_load_shared:
            case nir_intrinsic_load_scratch:
               progress |= lower_32b_offset_load(&b, intr);
               break;
            case nir_intrinsic_load_ssbo:
               progress |= lower_load_ssbo(&b, intr);
               break;
            case nir_intrinsic_load_ubo:
               progress |= lower_load_ubo(&b, intr);
               break;
            case nir_intrinsic_store_shared:
            case nir_intrinsic_store_scratch:
               progress |= lower_32b_offset_store(&b, intr);
               break;
            case nir_intrinsic_store_ssbo:
               progress |= lower_store_ssbo(&b, intr);
               break;
            default:
               break;
            }
         }
      }
   }

   return progress;
}

static bool
lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr,
                    nir_intrinsic_op dxil_op)
{
   b->cursor = nir_before_instr(&intr->instr);

   assert(intr->src[0].is_ssa);
   nir_ssa_def *offset =
      nir_iadd(b, intr->src[0].ssa, nir_imm_int(b, nir_intrinsic_base(intr)));
   nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2));

   nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, dxil_op);
   atomic->src[0] = nir_src_for_ssa(index);
   assert(intr->src[1].is_ssa);
   atomic->src[1] = nir_src_for_ssa(intr->src[1].ssa);
   if (dxil_op == nir_intrinsic_shared_atomic_comp_swap_dxil) {
      assert(intr->src[2].is_ssa);
      atomic->src[2] = nir_src_for_ssa(intr->src[2].ssa);
   }
   atomic->num_components = 0;
   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL);

   nir_builder_instr_insert(b, &atomic->instr);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, &atomic->dest.ssa);
   nir_instr_remove(&intr->instr);
   return true;
}

bool
dxil_nir_lower_atomics_to_dxil(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            switch (intr->intrinsic) {

#define ATOMIC(op)                                                   \
            case nir_intrinsic_shared_atomic_##op:                   \
               progress |= lower_shared_atomic(&b, intr,             \
                              nir_intrinsic_shared_atomic_##op##_dxil); \
               break

            ATOMIC(add);
            ATOMIC(imin);
            ATOMIC(umin);
            ATOMIC(imax);
            ATOMIC(umax);
            ATOMIC(and);
            ATOMIC(or);
            ATOMIC(xor);
            ATOMIC(exchange);
            ATOMIC(comp_swap);

#undef ATOMIC
            default:
               break;
            }
         }
      }
   }

   return progress;
}

static bool
lower_deref_ssbo(nir_builder *b, nir_deref_instr *deref)
{
   assert(nir_deref_mode_is(deref, nir_var_mem_ssbo));
   assert(deref->deref_type == nir_deref_type_var ||
          deref->deref_type == nir_deref_type_cast);
   nir_variable *var = deref->var;

   b->cursor = nir_before_instr(&deref->instr);

   if (deref->deref_type == nir_deref_type_var) {
      /* We turn all deref_var into deref_cast and build a pointer value based on
       * the var binding which encodes the UAV id.
       */
      nir_ssa_def *ptr = nir_imm_int64(b, (uint64_t)var->data.binding << 32);
      nir_deref_instr *deref_cast =
         nir_build_deref_cast(b, ptr, nir_var_mem_ssbo, deref->type,
                              glsl_get_explicit_stride(var->type));
      nir_ssa_def_rewrite_uses(&deref->dest.ssa,
                               &deref_cast->dest.ssa);
      nir_instr_remove(&deref->instr);

      deref = deref_cast;
      return true;
   }
   return false;
}

bool
dxil_nir_lower_deref_ssbo(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;

            nir_deref_instr *deref = nir_instr_as_deref(instr);

            if (!nir_deref_mode_is(deref, nir_var_mem_ssbo) ||
                (deref->deref_type != nir_deref_type_var &&
                 deref->deref_type != nir_deref_type_cast))
               continue;

            progress |= lower_deref_ssbo(&b, deref);
         }
      }
   }

   return progress;
}

static bool
lower_alu_deref_srcs(nir_builder *b, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   bool progress = false;

   b->cursor = nir_before_instr(&alu->instr);

   for (unsigned i = 0; i < info->num_inputs; i++) {
      nir_deref_instr *deref = nir_src_as_deref(alu->src[i].src);

      if (!deref)
         continue;

      nir_deref_path path;
      nir_deref_path_init(&path, deref, NULL);
      nir_deref_instr *root_deref = path.path[0];
      nir_deref_path_finish(&path);

      if (root_deref->deref_type != nir_deref_type_cast)
         continue;

      nir_ssa_def *ptr =
         nir_iadd(b, root_deref->parent.ssa,
                  nir_build_deref_offset(b, deref, cl_type_size_align));
      nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(ptr));
      progress = true;
   }

   return progress;
}

bool
dxil_nir_opt_alu_deref_srcs(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_alu)
               continue;

            nir_alu_instr *alu = nir_instr_as_alu(instr);
            progress |= lower_alu_deref_srcs(&b, alu);
         }
      }
   }

   return progress;
}

static nir_ssa_def *
memcpy_load_deref_elem(nir_builder *b, nir_deref_instr *parent,
                       nir_ssa_def *index)
{
   nir_deref_instr *deref;

   index = nir_i2i(b, index, nir_dest_bit_size(parent->dest));
   assert(parent->deref_type == nir_deref_type_cast);
   deref = nir_build_deref_ptr_as_array(b, parent, index);

   return nir_load_deref(b, deref);
}

static void
memcpy_store_deref_elem(nir_builder *b, nir_deref_instr *parent,
                        nir_ssa_def *index, nir_ssa_def *value)
{
   nir_deref_instr *deref;

   index = nir_i2i(b, index, nir_dest_bit_size(parent->dest));
   assert(parent->deref_type == nir_deref_type_cast);
   deref = nir_build_deref_ptr_as_array(b, parent, index);
   nir_store_deref(b, deref, value, 1);
}

static bool
lower_memcpy_deref(nir_builder *b, nir_intrinsic_instr *intr)
{
   nir_deref_instr *dst_deref = nir_src_as_deref(intr->src[0]);
   nir_deref_instr *src_deref = nir_src_as_deref(intr->src[1]);
   assert(intr->src[2].is_ssa);
   nir_ssa_def *num_bytes = intr->src[2].ssa;

   assert(dst_deref && src_deref);

   b->cursor = nir_after_instr(&intr->instr);

   dst_deref = nir_build_deref_cast(b, &dst_deref->dest.ssa, dst_deref->modes,
                                    glsl_uint8_t_type(), 1);
   src_deref = nir_build_deref_cast(b, &src_deref->dest.ssa, src_deref->modes,
                                    glsl_uint8_t_type(), 1);

   /*
    * We want to avoid 64b instructions, so let's assume we'll always be
    * passed a value that fits in a 32b type and truncate the 64b value.
    */
   num_bytes = nir_u2u32(b, num_bytes);

   nir_variable *loop_index_var =
      nir_local_variable_create(b->impl, glsl_uint_type(), "loop_index");
   nir_deref_instr *loop_index_deref = nir_build_deref_var(b, loop_index_var);
   nir_store_deref(b, loop_index_deref, nir_imm_int(b, 0), 1);

   nir_loop *loop = nir_push_loop(b);
   nir_ssa_def *loop_index = nir_load_deref(b, loop_index_deref);
   nir_ssa_def *cmp = nir_ige(b, loop_index, num_bytes);
   nir_if *loop_check = nir_push_if(b, cmp);
   nir_jump(b, nir_jump_break);
   nir_pop_if(b, loop_check);
   nir_ssa_def *val = memcpy_load_deref_elem(b, src_deref, loop_index);
   memcpy_store_deref_elem(b, dst_deref, loop_index, val);
   nir_store_deref(b, loop_index_deref, nir_iadd_imm(b, loop_index, 1), 1);
   nir_pop_loop(b, loop);
   nir_instr_remove(&intr->instr);
   return true;
}

bool
dxil_nir_lower_memcpy_deref(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            if (intr->intrinsic == nir_intrinsic_memcpy_deref)
               progress |= lower_memcpy_deref(&b, intr);
         }
      }
   }

   return progress;
}

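/* Illustrative example of what cast_phi() does (assuming min_bit_size == 32):
 * a 16-bit phi is replaced by a 32-bit phi whose sources are u2u32 casts
 * emitted right after each source's producing instruction, and the original
 * uses are rewritten to a u2u16 of the new phi inserted after the phi nodes
 * of the block.
 */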
static void
cast_phi(nir_builder *b, nir_phi_instr *phi, unsigned new_bit_size)
{
   nir_phi_instr *lowered = nir_phi_instr_create(b->shader);
   int num_components = 0;
   int old_bit_size = phi->dest.ssa.bit_size;

   nir_op upcast_op = nir_type_conversion_op(nir_type_uint | old_bit_size,
                                             nir_type_uint | new_bit_size,
                                             nir_rounding_mode_undef);
   nir_op downcast_op = nir_type_conversion_op(nir_type_uint | new_bit_size,
                                               nir_type_uint | old_bit_size,
                                               nir_rounding_mode_undef);

   nir_foreach_phi_src(src, phi) {
      assert(num_components == 0 || num_components == src->src.ssa->num_components);
      num_components = src->src.ssa->num_components;

      b->cursor = nir_after_instr_and_phis(src->src.ssa->parent_instr);

      nir_ssa_def *cast = nir_build_alu(b, upcast_op, src->src.ssa, NULL, NULL, NULL);

      nir_phi_src *new_src = rzalloc(lowered, nir_phi_src);
      new_src->pred = src->pred;
      new_src->src = nir_src_for_ssa(cast);
      exec_list_push_tail(&lowered->srcs, &new_src->node);
   }

   nir_ssa_dest_init(&lowered->instr, &lowered->dest,
                     num_components, new_bit_size, NULL);

   b->cursor = nir_before_instr(&phi->instr);
   nir_builder_instr_insert(b, &lowered->instr);

   b->cursor = nir_after_phis(nir_cursor_current_block(b->cursor));
   nir_ssa_def *result = nir_build_alu(b, downcast_op, &lowered->dest.ssa, NULL, NULL, NULL);

   nir_ssa_def_rewrite_uses(&phi->dest.ssa, result);
   nir_instr_remove(&phi->instr);
}

static bool
upcast_phi_impl(nir_function_impl *impl, unsigned min_bit_size)
{
   nir_builder b;
   nir_builder_init(&b, impl);
   bool progress = false;

   nir_foreach_block_reverse(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_phi)
            continue;

         nir_phi_instr *phi = nir_instr_as_phi(instr);
         assert(phi->dest.is_ssa);

         if (phi->dest.ssa.bit_size == 1 ||
             phi->dest.ssa.bit_size >= min_bit_size)
            continue;

         cast_phi(&b, phi, min_bit_size);
         progress = true;
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   }

   return progress;
}

bool
dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= upcast_phi_impl(function->impl, min_bit_size);
   }

   return progress;
}

/* In GLSL and SPIR-V, clip and cull distance are arrays of floats (with a limit of 8).
 * In DXIL, clip and cull distances are up to 2 float4s combined.
 * Coming from GLSL, we can request this 2 float4 format, but coming from SPIR-V,
 * we can't, and have to accept a "compact" array of scalar floats.
 *
 * To help emit a valid input signature for this case, split the variables so that they
 * match what we need to put in the signature (e.g. { float clip[4]; float clip1; float cull[3]; })
 */
bool
dxil_nir_split_clip_cull_distance(nir_shader *shader)
{
   nir_variable *new_var = NULL;
   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      bool progress = false;
      nir_builder b;
      nir_builder_init(&b, function->impl);
      nir_foreach_block(block, function->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;
            nir_deref_instr *deref = nir_instr_as_deref(instr);
            nir_variable *var = nir_deref_instr_get_variable(deref);
            if (!var ||
                var->data.location < VARYING_SLOT_CLIP_DIST0 ||
                var->data.location > VARYING_SLOT_CULL_DIST1 ||
                !var->data.compact)
               continue;

            /* The location should only be inside clip distance, because clip
             * and cull should've been merged by nir_lower_clip_cull_distance_arrays()
             */
            assert(var->data.location == VARYING_SLOT_CLIP_DIST0 ||
                   var->data.location == VARYING_SLOT_CLIP_DIST1);

            /* The deref chain to the clip/cull variables should be simple, just the
             * var and an array with a constant index, otherwise more lowering/optimization
             * might be needed before this pass, e.g. copy prop, lower_io_to_temporaries,
             * split_var_copies, and/or lower_var_copies
             */
            assert(deref->deref_type == nir_deref_type_var ||
                   deref->deref_type == nir_deref_type_array);

            b.cursor = nir_before_instr(instr);
            if (!new_var) {
               /* Update lengths for new and old vars */
               int old_length = glsl_array_size(var->type);
               int new_length = (old_length + var->data.location_frac) - 4;
               old_length -= new_length;

               /* The existing variable fits in the float4 */
               if (new_length <= 0)
                  continue;

               new_var = nir_variable_clone(var, shader);
               nir_shader_add_variable(shader, new_var);
               assert(glsl_get_base_type(glsl_get_array_element(var->type)) == GLSL_TYPE_FLOAT);
               var->type = glsl_array_type(glsl_float_type(), old_length, 0);
               new_var->type = glsl_array_type(glsl_float_type(), new_length, 0);
               new_var->data.location++;
               new_var->data.location_frac = 0;
            }

            /* Update the type for derefs of the old var */
            if (deref->deref_type == nir_deref_type_var) {
               deref->type = var->type;
               continue;
            }

            nir_const_value *index = nir_src_as_const_value(deref->arr.index);
            assert(index);

            /* Treat this array as a vector starting at the component index in location_frac,
             * so if location_frac is 1 and index is 0, then it's accessing the 'y' component
             * of the vector. If index + location_frac is >= 4, there's no component there,
             * so we need to add a new variable and adjust the index.
             */
            unsigned total_index = index->u32 + var->data.location_frac;
            if (total_index < 4)
               continue;

            nir_deref_instr *new_var_deref = nir_build_deref_var(&b, new_var);
            nir_deref_instr *new_array_deref = nir_build_deref_array(&b, new_var_deref, nir_imm_int(&b, total_index % 4));
            nir_ssa_def_rewrite_uses(&deref->dest.ssa, &new_array_deref->dest.ssa);
            progress = true;
         }
      }
      if (progress)
         nir_metadata_preserve(function->impl, nir_metadata_block_index |
                                               nir_metadata_dominance |
                                               nir_metadata_loop_analysis);
      else
         nir_metadata_preserve(function->impl, nir_metadata_all);
   }

   return new_var != NULL;
}

bool
dxil_nir_lower_double_math(nir_shader *shader)
{
   bool progress = false;
   nir_foreach_function(func, shader) {
      bool func_progress = false;
      if (!func->impl)
         continue;

      nir_builder b;
      nir_builder_init(&b, func->impl);
      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_alu)
               continue;

            nir_alu_instr *alu = nir_instr_as_alu(instr);

            /* TODO: See if we can apply this explicitly to packs/unpacks that are then
             * used as a double. As-is, if we had an app explicitly do a 64bit integer op,
             * then try to bitcast to double (not expressible in HLSL, but it is in other
             * source languages), this would unpack the integer and repack as a double, when
             * we probably want to just send the bitcast through to the backend.
             */

            b.cursor = nir_before_instr(&alu->instr);

            for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; ++i) {
               if (nir_alu_type_get_base_type(nir_op_infos[alu->op].input_types[i]) == nir_type_float &&
                   alu->src[i].src.ssa->bit_size == 64) {
                  nir_ssa_def *packed_double = nir_channel(&b, alu->src[i].src.ssa, alu->src[i].swizzle[0]);
                  nir_ssa_def *unpacked_double = nir_unpack_64_2x32(&b, packed_double);
                  nir_ssa_def *repacked_double = nir_pack_double_2x32_dxil(&b, unpacked_double);
                  nir_instr_rewrite_src_ssa(instr, &alu->src[i].src, repacked_double);
                  memset(alu->src[i].swizzle, 0, ARRAY_SIZE(alu->src[i].swizzle));
                  func_progress = true;
               }
            }

            if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_float &&
                alu->dest.dest.ssa.bit_size == 64) {
               b.cursor = nir_after_instr(&alu->instr);
               nir_ssa_def *packed_double = &alu->dest.dest.ssa;
               nir_ssa_def *unpacked_double = nir_unpack_double_2x32_dxil(&b, packed_double);
               nir_ssa_def *repacked_double = nir_pack_64_2x32(&b, unpacked_double);
               nir_ssa_def_rewrite_uses_after(packed_double, repacked_double, unpacked_double->parent_instr);
               func_progress = true;
            }
         }
      }

      if (func_progress)
         nir_metadata_preserve(func->impl, nir_metadata_block_index |
                                           nir_metadata_dominance |
                                           nir_metadata_loop_analysis);
      else
         nir_metadata_preserve(func->impl, nir_metadata_all);
      progress |= func_progress;
   }

   return progress;
}

typedef struct {
   gl_system_value *values;
   uint32_t count;
} zero_system_values_state;

static bool
lower_system_value_to_zero_filter(const nir_instr* instr, const void* cb_state)
{
   if (instr->type != nir_instr_type_intrinsic) {
      return false;
   }

   nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(instr);

   /* All the intrinsics we care about are loads */
   if (!nir_intrinsic_infos[intrin->intrinsic].has_dest)
      return false;

   assert(intrin->dest.is_ssa);

   zero_system_values_state* state = (zero_system_values_state*)cb_state;
   for (uint32_t i = 0; i < state->count; ++i) {
      gl_system_value value = state->values[i];
      nir_intrinsic_op value_op = nir_intrinsic_from_system_value(value);

      if (intrin->intrinsic == value_op) {
         return true;
      } else if (intrin->intrinsic == nir_intrinsic_load_deref) {
         nir_deref_instr* deref = nir_src_as_deref(intrin->src[0]);
         if (!nir_deref_mode_is(deref, nir_var_system_value))
            return false;

         nir_variable* var = deref->var;
         if (var->data.location == value) {
            return true;
         }
      }
   }

   return false;
}

static nir_ssa_def*
lower_system_value_to_zero_instr(nir_builder* b, nir_instr* instr, void* _state)
{
   return nir_imm_int(b, 0);
}

bool
dxil_nir_lower_system_values_to_zero(nir_shader* shader,
                                     gl_system_value* system_values,
                                     uint32_t count)
{
   zero_system_values_state state = { system_values, count };
   return nir_shader_lower_instructions(shader,
                                        lower_system_value_to_zero_filter,
                                        lower_system_value_to_zero_instr,
                                        &state);
}

static const struct glsl_type *
get_bare_samplers_for_type(const struct glsl_type *type)
{
   if (glsl_type_is_sampler(type)) {
      if (glsl_sampler_type_is_shadow(type))
         return glsl_bare_shadow_sampler_type();
      else
         return glsl_bare_sampler_type();
   } else if (glsl_type_is_array(type)) {
      return glsl_array_type(
         get_bare_samplers_for_type(glsl_get_array_element(type)),
         glsl_get_length(type),
         0 /*explicit size*/);
   }
   assert(!"Unexpected type");
   return NULL;
}

static bool
redirect_sampler_derefs(struct nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);
   if (!nir_tex_instr_need_sampler(tex))
      return false;

   int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
   if (sampler_idx == -1) {
      /* No derefs, must be using indices */
      struct hash_entry *hash_entry = _mesa_hash_table_u64_search(data, tex->sampler_index);

      /* Already have a bare sampler here */
      if (hash_entry)
         return false;

      nir_variable *typed_sampler = NULL;
      nir_foreach_variable_with_modes(var, b->shader, nir_var_uniform) {
         if (var->data.binding <= tex->sampler_index &&
             var->data.binding + glsl_type_get_sampler_count(var->type) > tex->sampler_index) {
            /* Already have a bare sampler for this binding, add it to the table */
            if (glsl_get_sampler_result_type(glsl_without_array(var->type)) == GLSL_TYPE_VOID) {
               _mesa_hash_table_u64_insert(data, tex->sampler_index, var);
               return false;
            }

            typed_sampler = var;
         }
      }

      /* Clone the typed sampler to a bare sampler and we're done */
      assert(typed_sampler);
      nir_variable *bare_sampler = nir_variable_clone(typed_sampler, b->shader);
      bare_sampler->type = get_bare_samplers_for_type(typed_sampler->type);
      nir_shader_add_variable(b->shader, bare_sampler);
      _mesa_hash_table_u64_insert(data, tex->sampler_index, bare_sampler);
      return true;
   }

   /* Using derefs means we have to rewrite the deref chain in addition to cloning */
   nir_deref_instr *final_deref = nir_src_as_deref(tex->src[sampler_idx].src);
   nir_deref_path path;
   nir_deref_path_init(&path, final_deref, NULL);

   nir_deref_instr *old_tail = path.path[0];
   assert(old_tail->deref_type == nir_deref_type_var);
   nir_variable *old_var = old_tail->var;
   if (glsl_get_sampler_result_type(glsl_without_array(old_var->type)) == GLSL_TYPE_VOID) {
      nir_deref_path_finish(&path);
      return false;
   }

   struct hash_entry *hash_entry = _mesa_hash_table_u64_search(data, old_var->data.binding);
   nir_variable *new_var;
   if (hash_entry) {
      new_var = hash_entry->data;
   } else {
      new_var = nir_variable_clone(old_var, b->shader);
      new_var->type = get_bare_samplers_for_type(old_var->type);
      nir_shader_add_variable(b->shader, new_var);
      _mesa_hash_table_u64_insert(data, old_var->data.binding, new_var);
   }

   b->cursor = nir_after_instr(&old_tail->instr);
   nir_deref_instr *new_tail = nir_build_deref_var(b, new_var);

   for (unsigned i = 1; path.path[i]; ++i) {
      b->cursor = nir_after_instr(&path.path[i]->instr);
      new_tail = nir_build_deref_follower(b, new_tail, path.path[i]);
   }

   nir_deref_path_finish(&path);
   nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[sampler_idx].src, &new_tail->dest.ssa);

   return true;
}

bool
dxil_nir_create_bare_samplers(nir_shader *nir)
{
   struct hash_table_u64 *sampler_to_bare = _mesa_hash_table_u64_create(NULL);

   bool progress = nir_shader_instructions_pass(nir, redirect_sampler_derefs,
      nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, sampler_to_bare);

   _mesa_hash_table_u64_destroy(sampler_to_bare);
   return progress;
}


/* Comparison function to sort io values so that first come normal varyings,
 * then system values, and then system generated values.
 */
static int
variable_location_cmp(const nir_variable* a, const nir_variable* b)
{
   // Sort by driver_location, location, then index
   return a->data.driver_location != b->data.driver_location ?
            a->data.driver_location - b->data.driver_location :
          a->data.location != b->data.location ?
            a->data.location - b->data.location :
          a->data.index - b->data.index;
}

/* Order varyings according to driver location */
uint64_t
dxil_sort_by_driver_location(nir_shader* s, nir_variable_mode modes)
{
   nir_sort_variables_with_modes(s, variable_location_cmp, modes);

   uint64_t result = 0;
   nir_foreach_variable_with_modes(var, s, modes) {
      result |= 1ull << var->data.location;
   }
   return result;
}

/* Sort PS outputs so that color outputs come first */
void
dxil_sort_ps_outputs(nir_shader* s)
{
   nir_foreach_variable_with_modes_safe(var, s, nir_var_shader_out) {
      /* We use the driver_location here to avoid introducing a new
       * struct or member variable here. The true, updated driver location
       * will be written below, after sorting */
      switch (var->data.location) {
      case FRAG_RESULT_DEPTH:
         var->data.driver_location = 1;
         break;
      case FRAG_RESULT_STENCIL:
         var->data.driver_location = 2;
         break;
      case FRAG_RESULT_SAMPLE_MASK:
         var->data.driver_location = 3;
         break;
      default:
         var->data.driver_location = 0;
      }
   }

   nir_sort_variables_with_modes(s, variable_location_cmp,
                                 nir_var_shader_out);

   unsigned driver_loc = 0;
   nir_foreach_variable_with_modes(var, s, nir_var_shader_out) {
      var->data.driver_location = driver_loc++;
   }
}

/* Order between stage values so that normal varyings come first,
 * then sysvalues and then system generated values.
 */
uint64_t
dxil_reassign_driver_locations(nir_shader* s, nir_variable_mode modes,
                               uint64_t other_stage_mask)
{
   nir_foreach_variable_with_modes_safe(var, s, modes) {
      /* We use the driver_location here to avoid introducing a new
       * struct or member variable here. The true, updated driver location
       * will be written below, after sorting */
      var->data.driver_location = nir_var_to_dxil_sysvalue_type(var, other_stage_mask);
   }

   nir_sort_variables_with_modes(s, variable_location_cmp, modes);

   uint64_t result = 0;
   unsigned driver_loc = 0;
   nir_foreach_variable_with_modes(var, s, modes) {
      result |= 1ull << var->data.location;
      var->data.driver_location = driver_loc++;
   }
   return result;
}