/* Path: blob/21.2-virgl/src/freedreno/ir3/ir3_a6xx.c (4565 views) */
/*1* Copyright (C) 2017-2018 Rob Clark <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,19* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE20* SOFTWARE.21*22* Authors:23* Rob Clark <[email protected]>24*/2526#define GPU 6002728#include "ir3_context.h"29#include "ir3_image.h"3031/*32* Handlers for instructions changed/added in a6xx:33*34* Starting with a6xx, isam and stbi is used for SSBOs as well; stbi and the35* atomic instructions (used for both SSBO and image) use a new instruction36* encoding compared to a4xx/a5xx.37*/3839/* src[] = { buffer_index, offset }. 
No const_index */40static void41emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,42struct ir3_instruction **dst)43{44struct ir3_block *b = ctx->block;45struct ir3_instruction *offset;46struct ir3_instruction *ldib;4748offset = ir3_get_src(ctx, &intr->src[2])[0];4950ldib = ir3_LDIB(b, ir3_ssbo_to_ibo(ctx, intr->src[0]), 0, offset, 0);51ldib->dsts[0]->wrmask = MASK(intr->num_components);52ldib->cat6.iim_val = intr->num_components;53ldib->cat6.d = 1;54ldib->cat6.type = intr->dest.ssa.bit_size == 16 ? TYPE_U16 : TYPE_U32;55ldib->barrier_class = IR3_BARRIER_BUFFER_R;56ldib->barrier_conflict = IR3_BARRIER_BUFFER_W;57ir3_handle_bindless_cat6(ldib, intr->src[0]);58ir3_handle_nonuniform(ldib, intr);5960ir3_split_dest(b, dst, ldib, 0, intr->num_components);61}6263/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */64static void65emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)66{67struct ir3_block *b = ctx->block;68struct ir3_instruction *stib, *val, *offset;69unsigned wrmask = nir_intrinsic_write_mask(intr);70unsigned ncomp = ffs(~wrmask) - 1;7172assert(wrmask == BITFIELD_MASK(intr->num_components));7374/* src0 is offset, src1 is value:75*/76val = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);77offset = ir3_get_src(ctx, &intr->src[3])[0];7879stib = ir3_STIB(b, ir3_ssbo_to_ibo(ctx, intr->src[1]), 0, offset, 0, val, 0);80stib->cat6.iim_val = ncomp;81stib->cat6.d = 1;82stib->cat6.type = intr->src[0].ssa->bit_size == 16 ? 
TYPE_U16 : TYPE_U32;83stib->barrier_class = IR3_BARRIER_BUFFER_W;84stib->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;85ir3_handle_bindless_cat6(stib, intr->src[1]);86ir3_handle_nonuniform(stib, intr);8788array_insert(b, b->keeps, stib);89}9091/*92* SSBO atomic intrinsics93*94* All of the SSBO atomic memory operations read a value from memory,95* compute a new value using one of the operations below, write the new96* value to memory, and return the original value read.97*98* All operations take 3 sources except CompSwap that takes 4. These99* sources represent:100*101* 0: The SSBO buffer index.102* 1: The offset into the SSBO buffer of the variable that the atomic103* operation will operate on.104* 2: The data parameter to the atomic function (i.e. the value to add105* in ssbo_atomic_add, etc).106* 3: For CompSwap only: the second data parameter.107*/108static struct ir3_instruction *109emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)110{111struct ir3_block *b = ctx->block;112struct ir3_instruction *atomic, *ibo, *src0, *src1, *data, *dummy;113type_t type = TYPE_U32;114115ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);116117data = ir3_get_src(ctx, &intr->src[2])[0];118119/* So this gets a bit creative:120*121* src0 - vecN offset/coords122* src1.x - is actually destination register123* src1.y - is 'data' except for cmpxchg where src2.y is 'compare'124* src1.z - is 'data' for cmpxchg125*126* The combining src and dest kinda doesn't work out so well with how127* scheduling and RA work. So we create a dummy src2 which is tied to the128* destination in RA (i.e. 
must be allocated to the same vec2/vec3129* register) and then immediately extract the first component.130*131* Note that nir already multiplies the offset by four132*/133dummy = create_immed(b, 0);134135if (intr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap_ir3) {136src0 = ir3_get_src(ctx, &intr->src[4])[0];137struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[3])[0];138src1 = ir3_collect(ctx, dummy, compare, data);139} else {140src0 = ir3_get_src(ctx, &intr->src[3])[0];141src1 = ir3_collect(ctx, dummy, data);142}143144switch (intr->intrinsic) {145case nir_intrinsic_ssbo_atomic_add_ir3:146atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);147break;148case nir_intrinsic_ssbo_atomic_imin_ir3:149atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);150type = TYPE_S32;151break;152case nir_intrinsic_ssbo_atomic_umin_ir3:153atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);154break;155case nir_intrinsic_ssbo_atomic_imax_ir3:156atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);157type = TYPE_S32;158break;159case nir_intrinsic_ssbo_atomic_umax_ir3:160atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);161break;162case nir_intrinsic_ssbo_atomic_and_ir3:163atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);164break;165case nir_intrinsic_ssbo_atomic_or_ir3:166atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);167break;168case nir_intrinsic_ssbo_atomic_xor_ir3:169atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);170break;171case nir_intrinsic_ssbo_atomic_exchange_ir3:172atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);173break;174case nir_intrinsic_ssbo_atomic_comp_swap_ir3:175atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);176break;177default:178unreachable("boo");179}180181atomic->cat6.iim_val = 1;182atomic->cat6.d = 1;183atomic->cat6.type = type;184atomic->barrier_class = IR3_BARRIER_BUFFER_W;185atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;186ir3_handle_bindless_cat6(atomic, 
intr->src[0]);187188/* even if nothing consume the result, we can't DCE the instruction: */189array_insert(b, b->keeps, atomic);190191atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;192ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);193struct ir3_instruction *split;194ir3_split_dest(b, &split, atomic, 0, 1);195return split;196}197198/* src[] = { deref, coord, sample_index }. const_index[] = {} */199static void200emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,201struct ir3_instruction **dst)202{203struct ir3_block *b = ctx->block;204struct ir3_instruction *ldib;205struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);206unsigned ncoords = ir3_get_image_coords(intr, NULL);207208ldib = ir3_LDIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,209ir3_create_collect(ctx, coords, ncoords), 0);210ldib->dsts[0]->wrmask = MASK(intr->num_components);211ldib->cat6.iim_val = intr->num_components;212ldib->cat6.d = ncoords;213ldib->cat6.type = ir3_get_type_for_image_intrinsic(intr);214ldib->cat6.typed = true;215ldib->barrier_class = IR3_BARRIER_IMAGE_R;216ldib->barrier_conflict = IR3_BARRIER_IMAGE_W;217ir3_handle_bindless_cat6(ldib, intr->src[0]);218ir3_handle_nonuniform(ldib, intr);219220ir3_split_dest(b, dst, ldib, 0, intr->num_components);221}222223/* src[] = { deref, coord, sample_index, value }. 
const_index[] = {} */224static void225emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)226{227struct ir3_block *b = ctx->block;228struct ir3_instruction *stib;229struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[3]);230struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);231unsigned ncoords = ir3_get_image_coords(intr, NULL);232enum pipe_format format = nir_intrinsic_format(intr);233unsigned ncomp = ir3_get_num_components_for_image_format(format);234235/* src0 is offset, src1 is value:236*/237stib = ir3_STIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,238ir3_create_collect(ctx, coords, ncoords), 0,239ir3_create_collect(ctx, value, ncomp), 0);240stib->cat6.iim_val = ncomp;241stib->cat6.d = ncoords;242stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);243stib->cat6.typed = true;244stib->barrier_class = IR3_BARRIER_IMAGE_W;245stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;246ir3_handle_bindless_cat6(stib, intr->src[0]);247ir3_handle_nonuniform(stib, intr);248249array_insert(b, b->keeps, stib);250}251252/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */253static struct ir3_instruction *254emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)255{256struct ir3_block *b = ctx->block;257struct ir3_instruction *atomic, *ibo, *src0, *src1, *dummy;258struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);259struct ir3_instruction *value = ir3_get_src(ctx, &intr->src[3])[0];260unsigned ncoords = ir3_get_image_coords(intr, NULL);261262ibo = ir3_image_to_ibo(ctx, intr->src[0]);263264/* So this gets a bit creative:265*266* src0 - vecN offset/coords267* src1.x - is actually destination register268* src1.y - is 'value' except for cmpxchg where src2.y is 'compare'269* src1.z - is 'value' for cmpxchg270*271* The combining src and dest kinda doesn't work out so well with how272* scheduling and RA work. 
So we create a dummy src2 which is tied to the273* destination in RA (i.e. must be allocated to the same vec2/vec3274* register) and then immediately extract the first component.275*/276dummy = create_immed(b, 0);277src0 = ir3_create_collect(ctx, coords, ncoords);278279if (intr->intrinsic == nir_intrinsic_image_atomic_comp_swap ||280intr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap) {281struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[4])[0];282src1 = ir3_collect(ctx, dummy, compare, value);283} else {284src1 = ir3_collect(ctx, dummy, value);285}286287switch (intr->intrinsic) {288case nir_intrinsic_image_atomic_add:289case nir_intrinsic_bindless_image_atomic_add:290atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);291break;292case nir_intrinsic_image_atomic_imin:293case nir_intrinsic_image_atomic_umin:294case nir_intrinsic_bindless_image_atomic_imin:295case nir_intrinsic_bindless_image_atomic_umin:296atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);297break;298case nir_intrinsic_image_atomic_imax:299case nir_intrinsic_image_atomic_umax:300case nir_intrinsic_bindless_image_atomic_imax:301case nir_intrinsic_bindless_image_atomic_umax:302atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);303break;304case nir_intrinsic_image_atomic_and:305case nir_intrinsic_bindless_image_atomic_and:306atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);307break;308case nir_intrinsic_image_atomic_or:309case nir_intrinsic_bindless_image_atomic_or:310atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);311break;312case nir_intrinsic_image_atomic_xor:313case nir_intrinsic_bindless_image_atomic_xor:314atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);315break;316case nir_intrinsic_image_atomic_exchange:317case nir_intrinsic_bindless_image_atomic_exchange:318atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);319break;320case nir_intrinsic_image_atomic_comp_swap:321case nir_intrinsic_bindless_image_atomic_comp_swap:322atomic = 
ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);323break;324default:325unreachable("boo");326}327328atomic->cat6.iim_val = 1;329atomic->cat6.d = ncoords;330atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);331atomic->cat6.typed = true;332atomic->barrier_class = IR3_BARRIER_IMAGE_W;333atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;334ir3_handle_bindless_cat6(atomic, intr->src[0]);335336/* even if nothing consume the result, we can't DCE the instruction: */337array_insert(b, b->keeps, atomic);338339atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;340ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);341struct ir3_instruction *split;342ir3_split_dest(b, &split, atomic, 0, 1);343return split;344}345346static void347emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,348struct ir3_instruction **dst)349{350struct ir3_block *b = ctx->block;351struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]);352struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);353resinfo->cat6.iim_val = 1;354resinfo->cat6.d = intr->num_components;355resinfo->cat6.type = TYPE_U32;356resinfo->cat6.typed = false;357/* resinfo has no writemask and always writes out 3 components: */358compile_assert(ctx, intr->num_components <= 3);359resinfo->dsts[0]->wrmask = MASK(3);360ir3_handle_bindless_cat6(resinfo, intr->src[0]);361362ir3_split_dest(b, dst, resinfo, 0, intr->num_components);363}364365static void366emit_intrinsic_load_global_ir3(struct ir3_context *ctx,367nir_intrinsic_instr *intr,368struct ir3_instruction **dst)369{370struct ir3_block *b = ctx->block;371unsigned dest_components = nir_intrinsic_dest_components(intr);372struct ir3_instruction *addr, *offset;373374addr = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[0])[0],375ir3_get_src(ctx, &intr->src[0])[1]);376377offset = ir3_get_src(ctx, &intr->src[1])[0];378379struct ir3_instruction *load =380ir3_LDG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0,381create_immed(b, 0), 0, 
create_immed(b, dest_components), 0);382load->cat6.type = TYPE_U32;383load->dsts[0]->wrmask = MASK(dest_components);384385load->barrier_class = IR3_BARRIER_BUFFER_R;386load->barrier_conflict = IR3_BARRIER_BUFFER_W;387388ir3_split_dest(b, dst, load, 0, dest_components);389}390391static void392emit_intrinsic_store_global_ir3(struct ir3_context *ctx,393nir_intrinsic_instr *intr)394{395struct ir3_block *b = ctx->block;396struct ir3_instruction *value, *addr, *offset;397unsigned ncomp = nir_intrinsic_src_components(intr, 0);398399addr = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[1])[0],400ir3_get_src(ctx, &intr->src[1])[1]);401402offset = ir3_get_src(ctx, &intr->src[2])[0];403404value = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);405406struct ir3_instruction *stg =407ir3_STG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0,408create_immed(b, 0), 0, value, 0, create_immed(b, ncomp), 0);409stg->cat6.type = TYPE_U32;410stg->cat6.iim_val = 1;411412array_insert(b, b->keeps, stg);413414stg->barrier_class = IR3_BARRIER_BUFFER_W;415stg->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;416}417418const struct ir3_context_funcs ir3_a6xx_funcs = {419.emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,420.emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,421.emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,422.emit_intrinsic_load_image = emit_intrinsic_load_image,423.emit_intrinsic_store_image = emit_intrinsic_store_image,424.emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,425.emit_intrinsic_image_size = emit_intrinsic_image_size,426.emit_intrinsic_load_global_ir3 = emit_intrinsic_load_global_ir3,427.emit_intrinsic_store_global_ir3 = emit_intrinsic_store_global_ir3,428};429430431