Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
4570 views
/*1* Copyright 2018 Advanced Micro Devices, Inc.2* All Rights Reserved.3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* on the rights to use, copy, modify, merge, publish, distribute, sub8* license, and/or sell copies of the Software, and to permit persons to whom9* the Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL18* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,19* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR20* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE21* USE OR OTHER DEALINGS IN THE SOFTWARE.22*/2324#include "si_pipe.h"25#include "tgsi/tgsi_text.h"26#include "tgsi/tgsi_ureg.h"2728void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers)29{30unsigned vs_blit_property;31void **vs;3233switch (type) {34case UTIL_BLITTER_ATTRIB_NONE:35vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : &sctx->vs_blit_pos;36vs_blit_property = SI_VS_BLIT_SGPRS_POS;37break;38case UTIL_BLITTER_ATTRIB_COLOR:39vs = num_layers > 1 ? &sctx->vs_blit_color_layered : &sctx->vs_blit_color;40vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR;41break;42case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:43case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:44assert(num_layers == 1);45vs = &sctx->vs_blit_texcoord;46vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD;47break;48default:49assert(0);50return NULL;51}52if (*vs)53return *vs;5455struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);56if (!ureg)57return NULL;5859/* Tell the shader to load VS inputs from SGPRs: */60ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS_AMD, vs_blit_property);61ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true);6263/* This is just a pass-through shader with 1-3 MOV instructions. */64ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), ureg_DECL_vs_input(ureg, 0));6566if (type != UTIL_BLITTER_ATTRIB_NONE) {67ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0), ureg_DECL_vs_input(ureg, 1));68}6970if (num_layers > 1) {71struct ureg_src instance_id = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0);72struct ureg_dst layer = ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0);7374ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X),75ureg_scalar(instance_id, TGSI_SWIZZLE_X));76}77ureg_END(ureg);7879*vs = ureg_create_shader_and_destroy(ureg, &sctx->b);80return *vs;81}8283/**84* This is used when TCS is NULL in the VS->TCS->TES chain. In this case,85* VS passes its outputs to TES directly, so the fixed-function shader only86* has to write TESSOUTER and TESSINNER.87*/88void *si_create_fixed_func_tcs(struct si_context *sctx)89{90struct ureg_src outer, inner;91struct ureg_dst tessouter, tessinner;92struct ureg_program *ureg = ureg_create(PIPE_SHADER_TESS_CTRL);9394if (!ureg)95return NULL;9697outer = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL, 0);98inner = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL, 0);99100tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0);101tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0);102103ureg_MOV(ureg, tessouter, outer);104ureg_MOV(ureg, tessinner, inner);105ureg_END(ureg);106107return ureg_create_shader_and_destroy(ureg, &sctx->b);108}109110/* Create a compute shader implementing clear_buffer or copy_buffer. */111void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,112bool dst_stream_cache_policy, bool is_copy)113{114struct si_screen *sscreen = (struct si_screen *)ctx->screen;115assert(util_is_power_of_two_nonzero(num_dwords_per_thread));116117unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;118if (dst_stream_cache_policy)119store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;120121/* Don't cache loads, because there is no reuse. */122unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;123124unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);125unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));126127for (unsigned i = 0; i < num_mem_ops; i++) {128if (i * 4 < num_dwords_per_thread)129inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);130}131132struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);133if (!ureg)134return NULL;135136ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, sscreen->compute_wave_size);137ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);138ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);139140struct ureg_src value;141if (!is_copy) {142ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]);143value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0);144}145146struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);147struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);148struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);149struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);150struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));151struct ureg_src srcbuf;152struct ureg_src *values = NULL;153154if (is_copy) {155srcbuf = ureg_DECL_buffer(ureg, 1, false);156values = malloc(num_mem_ops * sizeof(struct ureg_src));157}158159/* If there are multiple stores, the first store writes into 0*wavesize+tid,160* the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc.161*/162ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, sscreen->compute_wave_size * num_mem_ops),163tid);164/* Convert from a "store size unit" into bytes. */165ureg_UMUL(ureg, store_addr, ureg_src(store_addr), ureg_imm1u(ureg, 4 * inst_dwords[0]));166ureg_MOV(ureg, load_addr, ureg_src(store_addr));167168/* Distance between a load and a store for latency hiding. */169unsigned load_store_distance = is_copy ? 8 : 0;170171for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {172int d = i - load_store_distance;173174if (is_copy && i < num_mem_ops) {175if (i) {176ureg_UADD(ureg, load_addr, ureg_src(load_addr),177ureg_imm1u(ureg, 4 * inst_dwords[i] * sscreen->compute_wave_size));178}179180values[i] = ureg_src(ureg_DECL_temporary(ureg));181struct ureg_dst dst =182ureg_writemask(ureg_dst(values[i]), u_bit_consecutive(0, inst_dwords[i]));183struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};184ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, load_qualifier,185TGSI_TEXTURE_BUFFER, 0);186}187188if (d >= 0) {189if (d) {190ureg_UADD(ureg, store_addr, ureg_src(store_addr),191ureg_imm1u(ureg, 4 * inst_dwords[d] * sscreen->compute_wave_size));192}193194struct ureg_dst dst = ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));195struct ureg_src srcs[] = {ureg_src(store_addr), is_copy ? values[d] : value};196ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, store_qualifier,197TGSI_TEXTURE_BUFFER, 0);198}199}200ureg_END(ureg);201202struct pipe_compute_state state = {};203state.ir_type = PIPE_SHADER_IR_TGSI;204state.prog = ureg_get_tokens(ureg, NULL);205206void *cs = ctx->create_compute_state(ctx, &state);207ureg_destroy(ureg);208ureg_free_tokens(state.prog);209210free(values);211return cs;212}213214/* Create a compute shader implementing clear_buffer or copy_buffer. */215void *si_create_clear_buffer_rmw_cs(struct pipe_context *ctx)216{217const char *text = "COMP\n"218"PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"219"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"220"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"221"PROPERTY CS_USER_DATA_COMPONENTS_AMD 2\n"222"DCL SV[0], THREAD_ID\n"223"DCL SV[1], BLOCK_ID\n"224"DCL SV[2], CS_USER_DATA_AMD\n"225"DCL BUFFER[0]\n"226"DCL TEMP[0..1]\n"227"IMM[0] UINT32 {64, 16, 0, 0}\n"228/* ADDRESS = BLOCK_ID * 64 + THREAD_ID; */229"UMAD TEMP[0].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx\n"230/* ADDRESS = ADDRESS * 16; (byte offset, loading one vec4 per thread) */231"UMUL TEMP[0].x, TEMP[0].xxxx, IMM[0].yyyy\n"232"LOAD TEMP[1], BUFFER[0], TEMP[0].xxxx\n"233/* DATA &= inverted_writemask; */234"AND TEMP[1], TEMP[1], SV[2].yyyy\n"235/* DATA |= clear_value_masked; */236"OR TEMP[1], TEMP[1], SV[2].xxxx\n"237"STORE BUFFER[0].xyzw, TEMP[0], TEMP[1]%s\n"238"END\n";239char final_text[2048];240struct tgsi_token tokens[1024];241struct pipe_compute_state state = {0};242243snprintf(final_text, sizeof(final_text), text,244SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ", STREAM_CACHE_POLICY" : "");245246if (!tgsi_text_translate(final_text, tokens, ARRAY_SIZE(tokens))) {247assert(false);248return NULL;249}250251state.ir_type = PIPE_SHADER_IR_TGSI;252state.prog = tokens;253254return ctx->create_compute_state(ctx, &state);255}256257/* Create the compute shader that is used to collect the results.258*259* One compute grid with a single thread is launched for every query result260* buffer. The thread (optionally) reads a previous summary buffer, then261* accumulates data from the query result buffer, and writes the result either262* to a summary buffer to be consumed by the next grid invocation or to the263* user-supplied buffer.264*265* Data layout:266*267* CONST268* 0.x = end_offset269* 0.y = result_stride270* 0.z = result_count271* 0.w = bit field:272* 1: read previously accumulated values273* 2: write accumulated values for chaining274* 4: write result available275* 8: convert result to boolean (0/1)276* 16: only read one dword and use that as result277* 32: apply timestamp conversion278* 64: store full 64 bits result279* 128: store signed 32 bits result280* 256: SO_OVERFLOW mode: take the difference of two successive half-pairs281* 1.x = fence_offset282* 1.y = pair_stride283* 1.z = pair_count284*285* BUFFER[0] = query result buffer286* BUFFER[1] = previous summary buffer287* BUFFER[2] = next summary buffer or user-supplied buffer288*/289void *si_create_query_result_cs(struct si_context *sctx)290{291/* TEMP[0].xy = accumulated result so far292* TEMP[0].z = result not available293*294* TEMP[1].x = current result index295* TEMP[1].y = current pair index296*/297static const char text_tmpl[] =298"COMP\n"299"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"300"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"301"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"302"DCL BUFFER[0]\n"303"DCL BUFFER[1]\n"304"DCL BUFFER[2]\n"305"DCL CONST[0][0..1]\n"306"DCL TEMP[0..5]\n"307"IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"308"IMM[1] UINT32 {1, 2, 4, 8}\n"309"IMM[2] UINT32 {16, 32, 64, 128}\n"310"IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */311"IMM[4] UINT32 {256, 0, 0, 0}\n"312313"AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"314"UIF TEMP[5]\n"315/* Check result availability. */316"LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"317"ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"318"MOV TEMP[1], TEMP[0].zzzz\n"319"NOT TEMP[0].z, TEMP[0].zzzz\n"320321/* Load result if available. */322"UIF TEMP[1]\n"323"LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"324"ENDIF\n"325"ELSE\n"326/* Load previously accumulated result if requested. */327"MOV TEMP[0], IMM[0].xxxx\n"328"AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"329"UIF TEMP[4]\n"330"LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"331"ENDIF\n"332333"MOV TEMP[1].x, IMM[0].xxxx\n"334"BGNLOOP\n"335/* Break if accumulated result so far is not available. */336"UIF TEMP[0].zzzz\n"337"BRK\n"338"ENDIF\n"339340/* Break if result_index >= result_count. */341"USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"342"UIF TEMP[5]\n"343"BRK\n"344"ENDIF\n"345346/* Load fence and check result availability */347"UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"348"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"349"ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"350"NOT TEMP[0].z, TEMP[0].zzzz\n"351"UIF TEMP[0].zzzz\n"352"BRK\n"353"ENDIF\n"354355"MOV TEMP[1].y, IMM[0].xxxx\n"356"BGNLOOP\n"357/* Load start and end. */358"UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"359"UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"360"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"361362"UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"363"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"364365"U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"366367"AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"368"UIF TEMP[5].zzzz\n"369/* Load second start/end half-pair and370* take the difference371*/372"UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"373"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"374"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"375376"U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"377"U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"378"ENDIF\n"379380"U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"381382/* Increment pair index */383"UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"384"USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"385"UIF TEMP[5]\n"386"BRK\n"387"ENDIF\n"388"ENDLOOP\n"389390/* Increment result index */391"UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"392"ENDLOOP\n"393"ENDIF\n"394395"AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"396"UIF TEMP[4]\n"397/* Store accumulated data for chaining. */398"STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"399"ELSE\n"400"AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"401"UIF TEMP[4]\n"402/* Store result availability. */403"NOT TEMP[0].z, TEMP[0]\n"404"AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"405"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"406407"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"408"UIF TEMP[4]\n"409"STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"410"ENDIF\n"411"ELSE\n"412/* Store result if it is available. */413"NOT TEMP[4], TEMP[0].zzzz\n"414"UIF TEMP[4]\n"415/* Apply timestamp conversion */416"AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"417"UIF TEMP[4]\n"418"U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"419"U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"420"ENDIF\n"421422/* Convert to boolean */423"AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"424"UIF TEMP[4]\n"425"U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"426"AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"427"MOV TEMP[0].y, IMM[0].xxxx\n"428"ENDIF\n"429430"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"431"UIF TEMP[4]\n"432"STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"433"ELSE\n"434/* Clamping */435"UIF TEMP[0].yyyy\n"436"MOV TEMP[0].x, IMM[0].wwww\n"437"ENDIF\n"438439"AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"440"UIF TEMP[4]\n"441"UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"442"ENDIF\n"443444"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"445"ENDIF\n"446"ENDIF\n"447"ENDIF\n"448"ENDIF\n"449450"END\n";451452char text[sizeof(text_tmpl) + 32];453struct tgsi_token tokens[1024];454struct pipe_compute_state state = {};455456/* Hard code the frequency into the shader so that the backend can457* use the full range of optimizations for divide-by-constant.458*/459snprintf(text, sizeof(text), text_tmpl, sctx->screen->info.clock_crystal_freq);460461if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {462assert(false);463return NULL;464}465466state.ir_type = PIPE_SHADER_IR_TGSI;467state.prog = tokens;468469return sctx->b.create_compute_state(&sctx->b, &state);470}471472/* Create a compute shader implementing copy_image.473* Luckily, this works with all texture targets except 1D_ARRAY.474*/475void *si_create_copy_image_compute_shader(struct pipe_context *ctx)476{477static const char text[] =478"COMP\n"479"PROPERTY CS_USER_DATA_COMPONENTS_AMD 3\n"480"DCL SV[0], THREAD_ID\n"481"DCL SV[1], BLOCK_ID\n"482"DCL SV[2], BLOCK_SIZE\n"483"DCL SV[3], CS_USER_DATA_AMD\n"484"DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"485"DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"486"DCL TEMP[0..3], LOCAL\n"487"IMM[0] UINT32 {65535, 16, 0, 0}\n"488489"UMAD TEMP[0].xyz, SV[1], SV[2], SV[0]\n" /* threadID.xyz */490"AND TEMP[1].xyz, SV[3], IMM[0].xxxx\n" /* src.xyz */491"UADD TEMP[1].xyz, TEMP[1], TEMP[0]\n" /* src.xyz + threadID.xyz */492"LOAD TEMP[3], IMAGE[0], TEMP[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"493"USHR TEMP[2].xyz, SV[3], IMM[0].yyyy\n" /* dst.xyz */494"UADD TEMP[2].xyz, TEMP[2], TEMP[0]\n" /* dst.xyz + threadID.xyz */495"STORE IMAGE[1], TEMP[2], TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"496"END\n";497498struct tgsi_token tokens[1024];499struct pipe_compute_state state = {0};500501if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {502assert(false);503return NULL;504}505506state.ir_type = PIPE_SHADER_IR_TGSI;507state.prog = tokens;508509return ctx->create_compute_state(ctx, &state);510}511512void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx)513{514static const char text[] =515"COMP\n"516"PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"517"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"518"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"519"PROPERTY CS_USER_DATA_COMPONENTS_AMD 3\n"520"DCL SV[0], THREAD_ID\n"521"DCL SV[1], BLOCK_ID\n"522"DCL SV[2], CS_USER_DATA_AMD\n"523"DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"524"DCL IMAGE[1], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"525"DCL TEMP[0..4], LOCAL\n"526"IMM[0] UINT32 {64, 1, 65535, 16}\n"527528"UMAD TEMP[0].xz, SV[1].xyyy, IMM[0].xyyy, SV[0].xyyy\n" /* threadID.xz */529"AND TEMP[1].xz, SV[2], IMM[0].zzzz\n" /* src.xz */530"UADD TEMP[1].xz, TEMP[1], TEMP[0]\n" /* src.xz + threadID.xz */531"LOAD TEMP[3], IMAGE[0], TEMP[1].xzzz, 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"532"USHR TEMP[2].xz, SV[2], IMM[0].wwww\n" /* dst.xz */533"UADD TEMP[2].xz, TEMP[2], TEMP[0]\n" /* dst.xz + threadID.xz */534"STORE IMAGE[1], TEMP[2].xzzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"535"END\n";536537struct tgsi_token tokens[1024];538struct pipe_compute_state state = {0};539540if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {541assert(false);542return NULL;543}544545state.ir_type = PIPE_SHADER_IR_TGSI;546state.prog = tokens;547548return ctx->create_compute_state(ctx, &state);549}550551/* Create a compute shader implementing DCC decompression via a blit.552* This is a trivial copy_image shader except that it has a variable block553* size and a barrier.554*/555void *si_create_dcc_decompress_cs(struct pipe_context *ctx)556{557static const char text[] =558"COMP\n"559"DCL SV[0], THREAD_ID\n"560"DCL SV[1], BLOCK_ID\n"561"DCL SV[2], BLOCK_SIZE\n"562"DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"563"DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"564"DCL TEMP[0..1]\n"565566"UMAD TEMP[0].xyz, SV[1].xyzz, SV[2].xyzz, SV[0].xyzz\n"567"LOAD TEMP[1], IMAGE[0], TEMP[0].xyzz, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"568/* Wait for the whole threadgroup (= DCC block) to load texels before569* overwriting them, because overwriting any pixel within a DCC block570* can break compression for the whole block.571*/572"BARRIER\n"573"STORE IMAGE[1], TEMP[0].xyzz, TEMP[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"574"END\n";575576struct tgsi_token tokens[1024];577struct pipe_compute_state state = {0};578579if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {580assert(false);581return NULL;582}583584state.ir_type = PIPE_SHADER_IR_TGSI;585state.prog = tokens;586587return ctx->create_compute_state(ctx, &state);588}589590void *si_clear_render_target_shader(struct pipe_context *ctx)591{592static const char text[] =593"COMP\n"594"PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"595"PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"596"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"597"DCL SV[0], THREAD_ID\n"598"DCL SV[1], BLOCK_ID\n"599"DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"600"DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw601"DCL TEMP[0..3], LOCAL\n"602"IMM[0] UINT32 {8, 1, 0, 0}\n"603"MOV TEMP[0].xyz, CONST[0][0].xyzw\n"604"UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"605"UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"606"MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"607"STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"608"END\n";609610struct tgsi_token tokens[1024];611struct pipe_compute_state state = {0};612613if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {614assert(false);615return NULL;616}617618state.ir_type = PIPE_SHADER_IR_TGSI;619state.prog = tokens;620621return ctx->create_compute_state(ctx, &state);622}623624/* TODO: Didn't really test 1D_ARRAY */625void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx)626{627static const char text[] =628"COMP\n"629"PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"630"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"631"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"632"DCL SV[0], THREAD_ID\n"633"DCL SV[1], BLOCK_ID\n"634"DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"635"DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw636"DCL TEMP[0..3], LOCAL\n"637"IMM[0] UINT32 {64, 1, 0, 0}\n"638"MOV TEMP[0].xy, CONST[0][0].xzzw\n"639"UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"640"UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"641"MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"642"STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"643"END\n";644645struct tgsi_token tokens[1024];646struct pipe_compute_state state = {0};647648if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {649assert(false);650return NULL;651}652653state.ir_type = PIPE_SHADER_IR_TGSI;654state.prog = tokens;655656return ctx->create_compute_state(ctx, &state);657}658659void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx)660{661static const char text[] = "COMP\n"662"PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"663"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"664"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"665"PROPERTY CS_USER_DATA_COMPONENTS_AMD 3\n"666"DCL SV[0], THREAD_ID\n"667"DCL SV[1], BLOCK_ID\n"668"DCL SV[2], CS_USER_DATA_AMD\n"669"DCL BUFFER[0]\n"670"DCL TEMP[0..0]\n"671"IMM[0] UINT32 {64, 1, 12, 0}\n"672"UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"673"UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" // 12 bytes674"STORE BUFFER[0].xyz, TEMP[0].xxxx, SV[2].xyzz%s\n"675"END\n";676char final_text[2048];677struct tgsi_token tokens[1024];678struct pipe_compute_state state = {0};679680snprintf(final_text, sizeof(final_text), text,681SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ", STREAM_CACHE_POLICY" : "");682683if (!tgsi_text_translate(final_text, tokens, ARRAY_SIZE(tokens))) {684assert(false);685return NULL;686}687688state.ir_type = PIPE_SHADER_IR_TGSI;689state.prog = tokens;690691return ctx->create_compute_state(ctx, &state);692}693694/* Load samples from the image, and copy them to the same image. This looks like695* a no-op, but it's not. Loads use FMASK, while stores don't, so samples are696* reordered to match expanded FMASK.697*698* After the shader finishes, FMASK should be cleared to identity.699*/700void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, bool is_array)701{702enum tgsi_texture_type target = is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;703struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);704if (!ureg)705return NULL;706707ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 8);708ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 8);709ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);710711/* Compute the image coordinates. */712struct ureg_src image = ureg_DECL_image(ureg, 0, target, 0, true, false);713struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);714struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);715struct ureg_dst coord = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZW);716ureg_UMAD(ureg, ureg_writemask(coord, TGSI_WRITEMASK_XY), ureg_swizzle(blk, 0, 1, 1, 1),717ureg_imm2u(ureg, 8, 8), ureg_swizzle(tid, 0, 1, 1, 1));718if (is_array) {719ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_Z), ureg_scalar(blk, TGSI_SWIZZLE_Z));720}721722/* Load samples, resolving FMASK. */723struct ureg_dst sample[8];724assert(num_samples <= ARRAY_SIZE(sample));725726for (unsigned i = 0; i < num_samples; i++) {727sample[i] = ureg_DECL_temporary(ureg);728729ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i));730731struct ureg_src srcs[] = {image, ureg_src(coord)};732ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &sample[i], 1, srcs, 2, TGSI_MEMORY_RESTRICT, target,7330);734}735736/* Store samples, ignoring FMASK. */737for (unsigned i = 0; i < num_samples; i++) {738ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i));739740struct ureg_dst dst_image = ureg_dst(image);741struct ureg_src srcs[] = {ureg_src(coord), ureg_src(sample[i])};742ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst_image, 1, srcs, 2, TGSI_MEMORY_RESTRICT,743target, 0);744}745ureg_END(ureg);746747struct pipe_compute_state state = {};748state.ir_type = PIPE_SHADER_IR_TGSI;749state.prog = ureg_get_tokens(ureg, NULL);750751void *cs = ctx->create_compute_state(ctx, &state);752ureg_destroy(ureg);753return cs;754}755756/* Create the compute shader that is used to collect the results of gfx10+757* shader queries.758*759* One compute grid with a single thread is launched for every query result760* buffer. The thread (optionally) reads a previous summary buffer, then761* accumulates data from the query result buffer, and writes the result either762* to a summary buffer to be consumed by the next grid invocation or to the763* user-supplied buffer.764*765* Data layout:766*767* BUFFER[0] = query result buffer (layout is defined by gfx10_sh_query_buffer_mem)768* BUFFER[1] = previous summary buffer769* BUFFER[2] = next summary buffer or user-supplied buffer770*771* CONST772* 0.x = config; the low 3 bits indicate the mode:773* 0: sum up counts774* 1: determine result availability and write it as a boolean775* 2: SO_OVERFLOW776* 3: SO_ANY_OVERFLOW777* the remaining bits form a bitfield:778* 8: write result as a 64-bit value779* 0.y = offset in bytes to counts or stream for SO_OVERFLOW mode780* 0.z = chain bit field:781* 1: have previous summary buffer782* 2: write next summary buffer783* 0.w = result_count784*/785void *gfx10_create_sh_query_result_cs(struct si_context *sctx)786{787/* TEMP[0].x = accumulated result so far788* TEMP[0].y = result missing789* TEMP[0].z = whether we're in overflow mode790*/791static const char text_tmpl[] = "COMP\n"792"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"793"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"794"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"795"DCL BUFFER[0]\n"796"DCL BUFFER[1]\n"797"DCL BUFFER[2]\n"798"DCL CONST[0][0..0]\n"799"DCL TEMP[0..5]\n"800"IMM[0] UINT32 {0, 7, 256, 4294967295}\n"801"IMM[1] UINT32 {1, 2, 4, 8}\n"802"IMM[2] UINT32 {16, 32, 64, 128}\n"803804/*805acc_result = 0;806acc_missing = 0;807if (chain & 1) {808acc_result = buffer[1][0];809acc_missing = buffer[1][1];810}811*/812"MOV TEMP[0].xy, IMM[0].xxxx\n"813"AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n"814"UIF TEMP[5]\n"815"LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n"816"ENDIF\n"817818/*819is_overflow (TEMP[0].z) = (config & 7) >= 2;820result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 :821result_count; base_offset (TEMP[1].y) = 0; for (;;) { if822(!result_remaining) break; result_remaining--;823*/824"AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"825"USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n"826827"AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n"828"UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n"829"MOV TEMP[1].y, IMM[0].xxxx\n"830831"BGNLOOP\n"832"USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n"833"UIF TEMP[5]\n"834"BRK\n"835"ENDIF\n"836"UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"837838/*839fence = buffer[0]@(base_offset + sizeof(gfx10_sh_query_buffer_mem.stream));840if (!fence) {841acc_missing = ~0u;842break;843}844*/845"UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].wwww\n"846"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"847"USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"848"UIF TEMP[5]\n"849"MOV TEMP[0].y, TEMP[5].xxxx\n"850"BRK\n"851"ENDIF\n"852853/*854stream_offset (TEMP[2].x) = base_offset + offset;855856if (!(config & 7)) {857acc_result += buffer[0]@stream_offset;858}859*/860"UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n"861862"AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"863"USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"864"UIF TEMP[5]\n"865"LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n"866"UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n"867"ENDIF\n"868869/*870if ((config & 7) >= 2) {871count (TEMP[2].y) = (config & 1) ? 4 : 1;872*/873"AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"874"USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n"875"UIF TEMP[5]\n"876"AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n"877"UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n"878879/*880do {881generated = buffer[0]@(stream_offset + 2 * sizeof(uint64_t));882emitted = buffer[0]@(stream_offset + 3 * sizeof(uint64_t));883if (generated != emitted) {884acc_result = 1;885result_remaining = 0;886break;887}888889stream_offset += sizeof(gfx10_sh_query_buffer_mem.stream[0]);890} while (--count);891*/892"BGNLOOP\n"893"UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n"894"LOAD TEMP[4].xyzw, BUFFER[0], TEMP[5].xxxx\n"895"USNE TEMP[5], TEMP[4].xyxy, TEMP[4].zwzw\n"896"UIF TEMP[5]\n"897"MOV TEMP[0].x, IMM[1].xxxx\n"898"MOV TEMP[1].y, IMM[0].xxxx\n"899"BRK\n"900"ENDIF\n"901902"UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n"903"USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n"904"UIF TEMP[5]\n"905"BRK\n"906"ENDIF\n"907"UADD TEMP[2].x, TEMP[2].xxxx, IMM[2].yyyy\n"908"ENDLOOP\n"909"ENDIF\n"910911/*912base_offset += sizeof(gfx10_sh_query_buffer_mem);913} // end outer loop914*/915"UADD TEMP[1].y, TEMP[1].yyyy, IMM[0].zzzz\n"916"ENDLOOP\n"917918/*919if (chain & 2) {920buffer[2][0] = acc_result;921buffer[2][1] = acc_missing;922} else {923*/924"AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n"925"UIF TEMP[5]\n"926"STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n"927"ELSE\n"928929/*930if ((config & 7) == 1) {931acc_result = acc_missing ? 0 : 1;932acc_missing = 0;933}934*/935"AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n"936"USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n"937"UIF TEMP[5]\n"938"UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n"939"MOV TEMP[0].y, IMM[0].xxxx\n"940"ENDIF\n"941942/*943if (!acc_missing) {944buffer[2][0] = acc_result;945if (config & 8)946buffer[2][1] = 0;947}948*/949"USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n"950"UIF TEMP[5]\n"951"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"952953"AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n"954"UIF TEMP[5]\n"955"STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n"956"ENDIF\n"957"ENDIF\n"958"ENDIF\n"959960"END\n";961962struct tgsi_token tokens[1024];963struct pipe_compute_state state = {};964965if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) {966assert(false);967return NULL;968}969970state.ir_type = PIPE_SHADER_IR_TGSI;971state.prog = tokens;972973return sctx->b.create_compute_state(&sctx->b, &state);974}975976977