Path: blob/21.2-virgl/src/gallium/drivers/r600/r600_shader.c
4570 views
/*1* Copyright 2010 Jerome Glisse <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* on the rights to use, copy, modify, merge, publish, distribute, sub7* license, and/or sell copies of the Software, and to permit persons to whom8* the Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL17* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,18* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR19* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE20* USE OR OTHER DEALINGS IN THE SOFTWARE.21*/22#include "r600_sq.h"23#include "r600_formats.h"24#include "r600_opcodes.h"25#include "r600_shader.h"26#include "r600_dump.h"27#include "r600d.h"28#include "sfn/sfn_nir.h"2930#include "sb/sb_public.h"3132#include "pipe/p_shader_tokens.h"33#include "tgsi/tgsi_info.h"34#include "tgsi/tgsi_parse.h"35#include "tgsi/tgsi_scan.h"36#include "tgsi/tgsi_dump.h"37#include "tgsi/tgsi_from_mesa.h"38#include "nir/tgsi_to_nir.h"39#include "nir/nir_to_tgsi_info.h"40#include "compiler/nir/nir.h"41#include "util/u_bitcast.h"42#include "util/u_memory.h"43#include "util/u_math.h"44#include <stdio.h>45#include <errno.h>4647/* CAYMAN notes48Why CAYMAN got loops for lots of instructions is explained here.4950-These 8xx t-slot only ops are implemented in all vector slots.51MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT52These 8xx t-slot only 
opcodes become vector ops, with all four53slots expecting the arguments on sources a and b. Result is54broadcast to all channels.55MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_6456These 8xx t-slot only opcodes become vector ops in the z, y, and57x slots.58EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_6459RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_6460SQRT_IEEE/_6461SIN/COS62The w slot may have an independent co-issued operation, or if the63result is required to be in the w slot, the opcode above may be64issued in the w slot as well.65The compiler must issue the source argument to slots z, y, and x66*/6768/* Contents of r0 on entry to various shaders6970VS - .x = VertexID71.y = RelVertexID (??)72.w = InstanceID7374GS - r0.xyw, r1.xyz = per-vertex offsets75r0.z = PrimitiveID7677TCS - .x = PatchID78.y = RelPatchID (??)79.z = InvocationID80.w = tess factor base.8182TES - .x = TessCoord.x83- .y = TessCoord.y84- .z = RelPatchID (??)85- .w = PrimitiveID8687PS - face_gpr.z = SampleMask88face_gpr.w = SampleID89*/90#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)91static int r600_shader_from_tgsi(struct r600_context *rctx,92struct r600_pipe_shader *pipeshader,93union r600_shader_key key);9495static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,96int size, unsigned comp_mask) {9798if (!size)99return;100101if (ps->num_arrays == ps->max_arrays) {102ps->max_arrays += 64;103ps->arrays = realloc(ps->arrays, ps->max_arrays *104sizeof(struct r600_shader_array));105}106107int n = ps->num_arrays;108++ps->num_arrays;109110ps->arrays[n].comp_mask = comp_mask;111ps->arrays[n].gpr_start = start_gpr;112ps->arrays[n].gpr_count = size;113}114115static void r600_dump_streamout(struct pipe_stream_output_info *so)116{117unsigned i;118119fprintf(stderr, "STREAMOUT\n");120for (i = 0; i < so->num_outputs; i++) {121unsigned mask = ((1 << so->output[i].num_components) - 1) <<122so->output[i].start_component;123fprintf(stderr, " %i: 
MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",124i,125so->output[i].stream,126so->output[i].output_buffer,127so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,128so->output[i].register_index,129mask & 1 ? "x" : "",130mask & 2 ? "y" : "",131mask & 4 ? "z" : "",132mask & 8 ? "w" : "",133so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");134}135}136137static int store_shader(struct pipe_context *ctx,138struct r600_pipe_shader *shader)139{140struct r600_context *rctx = (struct r600_context *)ctx;141uint32_t *ptr, i;142143if (shader->bo == NULL) {144shader->bo = (struct r600_resource*)145pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);146if (shader->bo == NULL) {147return -ENOMEM;148}149ptr = r600_buffer_map_sync_with_rings(150&rctx->b, shader->bo,151PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);152if (R600_BIG_ENDIAN) {153for (i = 0; i < shader->shader.bc.ndw; ++i) {154ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);155}156} else {157memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));158}159rctx->b.ws->buffer_unmap(rctx->b.ws, shader->bo->buf);160}161162return 0;163}164165extern const struct nir_shader_compiler_options r600_nir_options;166static int nshader = 0;167int r600_pipe_shader_create(struct pipe_context *ctx,168struct r600_pipe_shader *shader,169union r600_shader_key key)170{171struct r600_context *rctx = (struct r600_context *)ctx;172struct r600_pipe_shader_selector *sel = shader->selector;173int r;174struct r600_screen *rscreen = (struct r600_screen *)ctx->screen;175176int processor = sel->ir_type == PIPE_SHADER_IR_TGSI ?177tgsi_get_processor_type(sel->tokens):178pipe_shader_type_from_mesa(sel->nir->info.stage);179180bool dump = r600_can_dump_shader(&rctx->screen->b, processor);181unsigned use_sb = !(rctx->screen->b.debug_flags & (DBG_NO_SB | DBG_NIR)) ||182(rctx->screen->b.debug_flags & DBG_NIR_SB);183unsigned 
sb_disasm;184unsigned export_shader;185186shader->shader.bc.isa = rctx->isa;187188if (!(rscreen->b.debug_flags & DBG_NIR_PREFERRED)) {189assert(sel->ir_type == PIPE_SHADER_IR_TGSI);190r = r600_shader_from_tgsi(rctx, shader, key);191if (r) {192R600_ERR("translation from TGSI failed !\n");193goto error;194}195} else {196if (sel->ir_type == PIPE_SHADER_IR_TGSI) {197sel->nir = tgsi_to_nir(sel->tokens, ctx->screen, true);198const nir_shader_compiler_options *nir_options =199(const nir_shader_compiler_options *)200ctx->screen->get_compiler_options(ctx->screen,201PIPE_SHADER_IR_NIR,202shader->shader.processor_type);203/* Lower int64 ops because we have some r600 build-in shaders that use it */204if (nir_options->lower_int64_options) {205NIR_PASS_V(sel->nir, nir_lower_regs_to_ssa);206NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar, NULL, NULL);207NIR_PASS_V(sel->nir, nir_lower_int64);208NIR_PASS_V(sel->nir, nir_opt_vectorize, NULL, NULL);209}210NIR_PASS_V(sel->nir, nir_lower_flrp, ~0, false);211}212nir_tgsi_scan_shader(sel->nir, &sel->info, true);213214r = r600_shader_from_nir(rctx, shader, &key);215if (r) {216fprintf(stderr, "--Failed shader--------------------------------------------------\n");217218if (sel->ir_type == PIPE_SHADER_IR_TGSI) {219fprintf(stderr, "--TGSI--------------------------------------------------------\n");220tgsi_dump(sel->tokens, 0);221}222223if (rscreen->b.debug_flags & (DBG_NIR_PREFERRED)) {224fprintf(stderr, "--NIR --------------------------------------------------------\n");225nir_print_shader(sel->nir, stderr);226}227228R600_ERR("translation from NIR failed !\n");229goto error;230}231}232233if (dump) {234if (sel->ir_type == PIPE_SHADER_IR_TGSI) {235fprintf(stderr, "--TGSI--------------------------------------------------------\n");236tgsi_dump(sel->tokens, 0);237}238239if (sel->so.num_outputs) {240r600_dump_streamout(&sel->so);241}242}243244if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {245/* only disable for vertex shaders in tess 
paths */246if (key.vs.as_ls)247use_sb = 0;248}249use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);250use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);251use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);252253/* disable SB for shaders using doubles */254use_sb &= !shader->shader.uses_doubles;255256use_sb &= !shader->shader.uses_atomics;257use_sb &= !shader->shader.uses_images;258use_sb &= !shader->shader.uses_helper_invocation;259260/* Check if the bytecode has already been built. */261if (!shader->shader.bc.bytecode) {262r = r600_bytecode_build(&shader->shader.bc);263if (r) {264R600_ERR("building bytecode failed !\n");265goto error;266}267}268269sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);270if (dump && !sb_disasm) {271fprintf(stderr, "--------------------------------------------------------------\n");272r600_bytecode_disasm(&shader->shader.bc);273fprintf(stderr, "______________________________________________________________\n");274} else if ((dump && sb_disasm) || use_sb) {275r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,276dump, use_sb);277if (r) {278R600_ERR("r600_sb_bytecode_process failed !\n");279goto error;280}281}282283if (dump) {284FILE *f;285char fname[1024];286snprintf(fname, 1024, "shader_from_%s_%d.cpp",287(sel->ir_type == PIPE_SHADER_IR_TGSI ?288(rscreen->b.debug_flags & DBG_NIR_PREFERRED ? 
"tgsi-nir" : "tgsi")289: "nir"), nshader);290f = fopen(fname, "w");291print_shader_info(f, nshader++, &shader->shader);292print_shader_info(stderr, nshader++, &shader->shader);293print_pipe_info(stderr, &sel->info);294if (sel->ir_type == PIPE_SHADER_IR_TGSI) {295fprintf(f, "/****TGSI**********************************\n");296tgsi_dump_to_file(sel->tokens, 0, f);297}298299if (rscreen->b.debug_flags & DBG_NIR_PREFERRED){300fprintf(f, "/****NIR **********************************\n");301nir_print_shader(sel->nir, f);302}303fprintf(f, "******************************************/\n");304fclose(f);305}306307if (shader->gs_copy_shader) {308if (dump) {309// dump copy shader310r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,311&shader->gs_copy_shader->shader, dump, 0);312if (r)313goto error;314}315316if ((r = store_shader(ctx, shader->gs_copy_shader)))317goto error;318}319320/* Store the shader in a buffer. */321if ((r = store_shader(ctx, shader)))322goto error;323324/* Build state. 
*/325switch (shader->shader.processor_type) {326case PIPE_SHADER_TESS_CTRL:327evergreen_update_hs_state(ctx, shader);328break;329case PIPE_SHADER_TESS_EVAL:330if (key.tes.as_es)331evergreen_update_es_state(ctx, shader);332else333evergreen_update_vs_state(ctx, shader);334break;335case PIPE_SHADER_GEOMETRY:336if (rctx->b.chip_class >= EVERGREEN) {337evergreen_update_gs_state(ctx, shader);338evergreen_update_vs_state(ctx, shader->gs_copy_shader);339} else {340r600_update_gs_state(ctx, shader);341r600_update_vs_state(ctx, shader->gs_copy_shader);342}343break;344case PIPE_SHADER_VERTEX:345export_shader = key.vs.as_es;346if (rctx->b.chip_class >= EVERGREEN) {347if (key.vs.as_ls)348evergreen_update_ls_state(ctx, shader);349else if (key.vs.as_es)350evergreen_update_es_state(ctx, shader);351else352evergreen_update_vs_state(ctx, shader);353} else {354if (export_shader)355r600_update_es_state(ctx, shader);356else357r600_update_vs_state(ctx, shader);358}359break;360case PIPE_SHADER_FRAGMENT:361if (rctx->b.chip_class >= EVERGREEN) {362evergreen_update_ps_state(ctx, shader);363} else {364r600_update_ps_state(ctx, shader);365}366break;367case PIPE_SHADER_COMPUTE:368evergreen_update_ls_state(ctx, shader);369break;370default:371r = -EINVAL;372goto error;373}374return 0;375376error:377r600_pipe_shader_destroy(ctx, shader);378return r;379}380381void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)382{383r600_resource_reference(&shader->bo, NULL);384if (list_is_linked(&shader->shader.bc.cf))385r600_bytecode_clear(&shader->shader.bc);386r600_release_command_buffer(&shader->command_buffer);387}388389/*390* tgsi -> r600 shader391*/392struct r600_shader_tgsi_instruction;393394struct r600_shader_src {395unsigned sel;396unsigned swizzle[4];397unsigned neg;398unsigned abs;399unsigned rel;400unsigned kc_bank;401boolean kc_rel; /* true if cache bank is indexed */402uint32_t value[4];403};404405struct eg_interp {406boolean enabled;407unsigned 
ij_index;408};409410struct r600_shader_ctx {411struct tgsi_shader_info info;412struct tgsi_array_info *array_infos;413/* flag for each tgsi temp array if its been spilled or not */414bool *spilled_arrays;415struct tgsi_parse_context parse;416const struct tgsi_token *tokens;417unsigned type;418unsigned file_offset[TGSI_FILE_COUNT];419unsigned temp_reg;420const struct r600_shader_tgsi_instruction *inst_info;421struct r600_bytecode *bc;422struct r600_shader *shader;423struct r600_shader_src src[4];424uint32_t *literals;425uint32_t nliterals;426uint32_t max_driver_temp_used;427/* needed for evergreen interpolation */428struct eg_interp eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid429/* evergreen/cayman also store sample mask in face register */430int face_gpr;431/* sample id is .w component stored in fixed point position register */432int fixed_pt_position_gpr;433int colors_used;434boolean clip_vertex_write;435unsigned cv_output;436unsigned edgeflag_output;437int helper_invoc_reg;438int cs_block_size_reg;439int cs_grid_size_reg;440bool cs_block_size_loaded, cs_grid_size_loaded;441int fragcoord_input;442int next_ring_offset;443int gs_out_ring_offset;444int gs_next_vertex;445struct r600_shader *gs_for_vs;446int gs_export_gpr_tregs[4];447int gs_rotated_input[2];448const struct pipe_stream_output_info *gs_stream_output_info;449unsigned enabled_stream_buffers_mask;450unsigned tess_input_info; /* temp with tess input offsets */451unsigned tess_output_info; /* temp with tess input offsets */452unsigned thread_id_gpr; /* temp with thread id calculated for images */453};454455struct r600_shader_tgsi_instruction {456unsigned op;457int (*process)(struct r600_shader_ctx *ctx);458};459460static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);461static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], 
cm_shader_tgsi_instruction[];462static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);463static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason);464static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);465static int tgsi_else(struct r600_shader_ctx *ctx);466static int tgsi_endif(struct r600_shader_ctx *ctx);467static int tgsi_bgnloop(struct r600_shader_ctx *ctx);468static int tgsi_endloop(struct r600_shader_ctx *ctx);469static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);470static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,471unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,472unsigned int dst_reg);473static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,474const struct r600_shader_src *shader_src,475unsigned chan);476static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,477unsigned dst_reg, unsigned mask);478479static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)480{481if (ctx->bc->family == CHIP_HEMLOCK ||482ctx->bc->family == CHIP_CYPRESS ||483ctx->bc->family == CHIP_JUNIPER)484return false;485return true;486}487488static int tgsi_last_instruction(unsigned writemask)489{490int i, lasti = 0;491492for (i = 0; i < 4; i++) {493if (writemask & (1 << i)) {494lasti = i;495}496}497return lasti;498}499500static int tgsi_is_supported(struct r600_shader_ctx *ctx)501{502struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;503unsigned j;504505if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {506R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);507return -EINVAL;508}509#if 0510if (i->Instruction.Label) {511R600_ERR("label unsupported\n");512return -EINVAL;513}514#endif515for (j = 0; j < i->Instruction.NumSrcRegs; j++) {516if (i->Src[j].Register.Dimension) {517switch (i->Src[j].Register.File) {518case TGSI_FILE_CONSTANT:519case TGSI_FILE_HW_ATOMIC:520break;521case 
TGSI_FILE_INPUT:522if (ctx->type == PIPE_SHADER_GEOMETRY ||523ctx->type == PIPE_SHADER_TESS_CTRL ||524ctx->type == PIPE_SHADER_TESS_EVAL)525break;526FALLTHROUGH;527case TGSI_FILE_OUTPUT:528if (ctx->type == PIPE_SHADER_TESS_CTRL)529break;530FALLTHROUGH;531default:532R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,533i->Src[j].Register.File,534i->Src[j].Register.Dimension);535return -EINVAL;536}537}538}539for (j = 0; j < i->Instruction.NumDstRegs; j++) {540if (i->Dst[j].Register.Dimension) {541if (ctx->type == PIPE_SHADER_TESS_CTRL)542continue;543R600_ERR("unsupported dst (dimension)\n");544return -EINVAL;545}546}547return 0;548}549550int eg_get_interpolator_index(unsigned interpolate, unsigned location)551{552if (interpolate == TGSI_INTERPOLATE_COLOR ||553interpolate == TGSI_INTERPOLATE_LINEAR ||554interpolate == TGSI_INTERPOLATE_PERSPECTIVE)555{556int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;557int loc;558559switch(location) {560case TGSI_INTERPOLATE_LOC_CENTER:561loc = 1;562break;563case TGSI_INTERPOLATE_LOC_CENTROID:564loc = 2;565break;566case TGSI_INTERPOLATE_LOC_SAMPLE:567default:568loc = 0; break;569}570571return is_linear * 3 + loc;572}573574return -1;575}576577static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,578int input)579{580int i = eg_get_interpolator_index(581ctx->shader->input[input].interpolate,582ctx->shader->input[input].interpolate_location);583assert(i >= 0);584ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;585}586587static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)588{589int i, r;590struct r600_bytecode_alu alu;591int gpr = 0, base_chan = 0;592int ij_index = ctx->shader->input[input].ij_index;593594/* work out gpr and base_chan from index */595gpr = ij_index / 2;596base_chan = (2 * (ij_index % 2)) + 1;597598for (i = 0; i < 8; i++) {599memset(&alu, 0, sizeof(struct r600_bytecode_alu));600601if (i < 4)602alu.op = ALU_OP2_INTERP_ZW;603else604alu.op = 
ALU_OP2_INTERP_XY;605606if ((i > 1) && (i < 6)) {607alu.dst.sel = ctx->shader->input[input].gpr;608alu.dst.write = 1;609}610611alu.dst.chan = i % 4;612613alu.src[0].sel = gpr;614alu.src[0].chan = (base_chan - (i % 2));615616alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;617618alu.bank_swizzle_force = SQ_ALU_VEC_210;619if ((i % 4) == 3)620alu.last = 1;621r = r600_bytecode_add_alu(ctx->bc, &alu);622if (r)623return r;624}625return 0;626}627628static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)629{630int i, r;631struct r600_bytecode_alu alu;632633for (i = 0; i < 4; i++) {634memset(&alu, 0, sizeof(struct r600_bytecode_alu));635636alu.op = ALU_OP1_INTERP_LOAD_P0;637638alu.dst.sel = ctx->shader->input[input].gpr;639alu.dst.write = 1;640641alu.dst.chan = i;642643alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;644alu.src[0].chan = i;645646if (i == 3)647alu.last = 1;648r = r600_bytecode_add_alu(ctx->bc, &alu);649if (r)650return r;651}652return 0;653}654655/*656* Special export handling in shaders657*658* shader export ARRAY_BASE for EXPORT_POS:659* 60 is position660* 61 is misc vector661* 62, 63 are clip distance vectors662*663* The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:664* VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61665* USE_VTX_POINT_SIZE - point size in the X channel of export 61666* USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61667* USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61668* USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61669* USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually670* exclusive from render target index)671* VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors672*673*674* shader export ARRAY_BASE for EXPORT_PIXEL:675* 0-7 CB targets676* 61 computed Z vector677*678* The use of the values exported in the computed Z 
vector are controlled679* by DB_SHADER_CONTROL:680* Z_EXPORT_ENABLE - Z as a float in RED681* STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN682* COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA683* MASK_EXPORT_ENABLE - pixel sample mask in BLUE684* DB_SOURCE_FORMAT - export control restrictions685*686*/687688689/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */690static int r600_spi_sid(struct r600_shader_io * io)691{692int index, name = io->name;693694/* These params are handled differently, they don't need695* semantic indices, so we'll use 0 for them.696*/697if (name == TGSI_SEMANTIC_POSITION ||698name == TGSI_SEMANTIC_PSIZE ||699name == TGSI_SEMANTIC_EDGEFLAG ||700name == TGSI_SEMANTIC_FACE ||701name == TGSI_SEMANTIC_SAMPLEMASK)702index = 0;703else {704if (name == TGSI_SEMANTIC_GENERIC) {705/* For generic params simply use sid from tgsi */706index = 9 + io->sid;707} else if (name == TGSI_SEMANTIC_TEXCOORD) {708index = io->sid;709} else {710/* For non-generic params - pack name and sid into 8 bits */711index = 0x80 | (name<<3) | (io->sid);712}713714/* Make sure that all really used indices have nonzero value, so715* we can just compare it to 0 later instead of comparing the name716* with different values to detect special cases. 
*/717index++;718}719720return index;721};722723/* we need this to get a common lds index for vs/tcs/tes input/outputs */724int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)725{726switch (semantic_name) {727case TGSI_SEMANTIC_POSITION:728return 0;729case TGSI_SEMANTIC_PSIZE:730return 1;731case TGSI_SEMANTIC_CLIPDIST:732assert(index <= 1);733return 2 + index;734case TGSI_SEMANTIC_TEXCOORD:735return 4 + index;736case TGSI_SEMANTIC_GENERIC:737if (index <= 63-4)738return 4 + index;739else740/* same explanation as in the default statement,741* the only user hitting this is st/nine.742*/743return 0;744745/* patch indices are completely separate and thus start from 0 */746case TGSI_SEMANTIC_TESSOUTER:747return 0;748case TGSI_SEMANTIC_TESSINNER:749return 1;750case TGSI_SEMANTIC_PATCH:751return 2 + index;752753default:754/* Don't fail here. The result of this function is only used755* for LS, TCS, TES, and GS, where legacy GL semantics can't756* occur, but this function is called for all vertex shaders757* before it's known whether LS will be compiled or not.758*/759return 0;760}761}762763/* turn input into interpolate on EG */764static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)765{766int r = 0;767768if (ctx->shader->input[index].spi_sid) {769ctx->shader->input[index].lds_pos = ctx->shader->nlds++;770if (ctx->shader->input[index].interpolate > 0) {771evergreen_interp_assign_ij_index(ctx, index);772r = evergreen_interp_alu(ctx, index);773} else {774r = evergreen_interp_flat(ctx, index);775}776}777return r;778}779780static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)781{782struct r600_bytecode_alu alu;783int i, r;784int gpr_front = ctx->shader->input[front].gpr;785int gpr_back = ctx->shader->input[back].gpr;786787for (i = 0; i < 4; i++) {788memset(&alu, 0, sizeof(alu));789alu.op = ALU_OP3_CNDGT;790alu.is_op3 = 1;791alu.dst.write = 1;792alu.dst.sel = gpr_front;793alu.src[0].sel = 
ctx->face_gpr;794alu.src[1].sel = gpr_front;795alu.src[2].sel = gpr_back;796797alu.dst.chan = i;798alu.src[1].chan = i;799alu.src[2].chan = i;800alu.last = (i==3);801802if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))803return r;804}805806return 0;807}808809/* execute a single slot ALU calculation */810static int single_alu_op2(struct r600_shader_ctx *ctx, int op,811int dst_sel, int dst_chan,812int src0_sel, unsigned src0_chan_val,813int src1_sel, unsigned src1_chan_val)814{815struct r600_bytecode_alu alu;816int r, i;817818if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {819for (i = 0; i < 4; i++) {820memset(&alu, 0, sizeof(struct r600_bytecode_alu));821alu.op = op;822alu.src[0].sel = src0_sel;823if (src0_sel == V_SQ_ALU_SRC_LITERAL)824alu.src[0].value = src0_chan_val;825else826alu.src[0].chan = src0_chan_val;827alu.src[1].sel = src1_sel;828if (src1_sel == V_SQ_ALU_SRC_LITERAL)829alu.src[1].value = src1_chan_val;830else831alu.src[1].chan = src1_chan_val;832alu.dst.sel = dst_sel;833alu.dst.chan = i;834alu.dst.write = i == dst_chan;835alu.last = (i == 3);836r = r600_bytecode_add_alu(ctx->bc, &alu);837if (r)838return r;839}840return 0;841}842843memset(&alu, 0, sizeof(struct r600_bytecode_alu));844alu.op = op;845alu.src[0].sel = src0_sel;846if (src0_sel == V_SQ_ALU_SRC_LITERAL)847alu.src[0].value = src0_chan_val;848else849alu.src[0].chan = src0_chan_val;850alu.src[1].sel = src1_sel;851if (src1_sel == V_SQ_ALU_SRC_LITERAL)852alu.src[1].value = src1_chan_val;853else854alu.src[1].chan = src1_chan_val;855alu.dst.sel = dst_sel;856alu.dst.chan = dst_chan;857alu.dst.write = 1;858alu.last = 1;859r = r600_bytecode_add_alu(ctx->bc, &alu);860if (r)861return r;862return 0;863}864865/* execute a single slot ALU calculation */866static int single_alu_op3(struct r600_shader_ctx *ctx, int op,867int dst_sel, int dst_chan,868int src0_sel, unsigned src0_chan_val,869int src1_sel, unsigned src1_chan_val,870int src2_sel, unsigned src2_chan_val)871{872struct r600_bytecode_alu 
alu;873int r;874875/* validate this for other ops */876assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT);877memset(&alu, 0, sizeof(struct r600_bytecode_alu));878alu.op = op;879alu.src[0].sel = src0_sel;880if (src0_sel == V_SQ_ALU_SRC_LITERAL)881alu.src[0].value = src0_chan_val;882else883alu.src[0].chan = src0_chan_val;884alu.src[1].sel = src1_sel;885if (src1_sel == V_SQ_ALU_SRC_LITERAL)886alu.src[1].value = src1_chan_val;887else888alu.src[1].chan = src1_chan_val;889alu.src[2].sel = src2_sel;890if (src2_sel == V_SQ_ALU_SRC_LITERAL)891alu.src[2].value = src2_chan_val;892else893alu.src[2].chan = src2_chan_val;894alu.dst.sel = dst_sel;895alu.dst.chan = dst_chan;896alu.is_op3 = 1;897alu.last = 1;898r = r600_bytecode_add_alu(ctx->bc, &alu);899if (r)900return r;901return 0;902}903904/* put it in temp_reg.x */905static int get_lds_offset0(struct r600_shader_ctx *ctx,906int rel_patch_chan,907int temp_reg, bool is_patch_var)908{909int r;910911/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */912/* ADD913Dimension - patch0_offset (input_vals.z),914Non-dim - patch0_data_offset (input_vals.w)915*/916r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,917temp_reg, 0,918ctx->tess_output_info, 0,9190, rel_patch_chan,920ctx->tess_output_info, is_patch_var ? 3 : 2);921if (r)922return r;923return 0;924}925926static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)927{928return index > 0 ? 
ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;929}930931static int r600_get_temp(struct r600_shader_ctx *ctx)932{933return ctx->temp_reg + ctx->max_driver_temp_used++;934}935936static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)937{938int i;939i = ctx->shader->noutput++;940ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;941ctx->shader->output[i].sid = 0;942ctx->shader->output[i].gpr = 0;943ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;944ctx->shader->output[i].write_mask = 0x4;945ctx->shader->output[i].spi_sid = prim_id_sid;946947return 0;948}949950static int tgsi_barrier(struct r600_shader_ctx *ctx)951{952struct r600_bytecode_alu alu;953int r;954955memset(&alu, 0, sizeof(struct r600_bytecode_alu));956alu.op = ctx->inst_info->op;957alu.last = 1;958959r = r600_bytecode_add_alu(ctx->bc, &alu);960if (r)961return r;962return 0;963}964965static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)966{967// pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays968unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY];969unsigned narrays_left = n;970bool *spilled = ctx->spilled_arrays; // assumed calloc:ed971972*scratch_space_needed = 0;973while (*regno > 124 && narrays_left) {974unsigned i;975unsigned largest = 0;976unsigned largest_index = 0;977978for (i = 0; i < n; i++) {979unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;980if (!spilled[i] && size > largest) {981largest = size;982largest_index = i;983}984}985986spilled[largest_index] = true;987*regno -= largest;988*scratch_space_needed += largest;989990narrays_left --;991}992993if (narrays_left == 0) {994ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY);995}996}997998/* Take spilled temp arrays into account when translating tgsi register999* indexes into r600 gprs if spilled is false, or scratch array offset if1000* spilled is true 
*/1001static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled)1002{1003unsigned i;1004unsigned spilled_size = 0;10051006for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {1007if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {1008if (ctx->spilled_arrays[i]) {1009/* vec4 index into spilled scratch memory */1010*spilled = true;1011return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size;1012}1013else {1014/* regular GPR array */1015*spilled = false;1016return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];1017}1018}10191020if (tgsi_reg_index < ctx->array_infos[i].range.First)1021break;1022if (ctx->spilled_arrays[i]) {1023spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;1024}1025}10261027/* regular GPR index, minus the holes from spilled arrays */1028*spilled = false;10291030return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];1031}10321033/* look up spill area base offset and array size for a spilled temp array */1034static void get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index,1035unsigned *array_base, unsigned *array_size)1036{1037unsigned i;1038unsigned offset = 0;10391040for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {1041if (ctx->spilled_arrays[i]) {1042unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;10431044if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {1045*array_base = offset;1046*array_size = size - 1; /* hw counts from 1 */10471048return;1049}10501051offset += size;1052}1053}1054}10551056static int tgsi_declaration(struct r600_shader_ctx *ctx)1057{1058struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;1059int r, i, j, count = d->Range.Last - d->Range.First + 
1;10601061switch (d->Declaration.File) {1062case TGSI_FILE_INPUT:1063for (j = 0; j < count; j++) {1064i = ctx->shader->ninput + j;1065assert(i < ARRAY_SIZE(ctx->shader->input));1066ctx->shader->input[i].name = d->Semantic.Name;1067ctx->shader->input[i].sid = d->Semantic.Index + j;1068ctx->shader->input[i].interpolate = d->Interp.Interpolate;1069ctx->shader->input[i].interpolate_location = d->Interp.Location;1070ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;1071if (ctx->type == PIPE_SHADER_FRAGMENT) {1072ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);1073switch (ctx->shader->input[i].name) {1074case TGSI_SEMANTIC_FACE:1075if (ctx->face_gpr != -1)1076ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */1077else1078ctx->face_gpr = ctx->shader->input[i].gpr;1079break;1080case TGSI_SEMANTIC_COLOR:1081ctx->colors_used++;1082break;1083case TGSI_SEMANTIC_POSITION:1084ctx->fragcoord_input = i;1085break;1086case TGSI_SEMANTIC_PRIMID:1087/* set this for now */1088ctx->shader->gs_prim_id_input = true;1089ctx->shader->ps_prim_id_input = i;1090break;1091}1092if (ctx->bc->chip_class >= EVERGREEN) {1093if ((r = evergreen_interp_input(ctx, i)))1094return r;1095}1096} else if (ctx->type == PIPE_SHADER_GEOMETRY) {1097/* FIXME probably skip inputs if they aren't passed in the ring */1098ctx->shader->input[i].ring_offset = ctx->next_ring_offset;1099ctx->next_ring_offset += 16;1100if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)1101ctx->shader->gs_prim_id_input = true;1102}1103}1104ctx->shader->ninput += count;1105break;1106case TGSI_FILE_OUTPUT:1107for (j = 0; j < count; j++) {1108i = ctx->shader->noutput + j;1109assert(i < ARRAY_SIZE(ctx->shader->output));1110ctx->shader->output[i].name = d->Semantic.Name;1111ctx->shader->output[i].sid = d->Semantic.Index + j;1112ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + 
j;1113ctx->shader->output[i].interpolate = d->Interp.Interpolate;1114ctx->shader->output[i].write_mask = d->Declaration.UsageMask;1115if (ctx->type == PIPE_SHADER_VERTEX ||1116ctx->type == PIPE_SHADER_GEOMETRY ||1117ctx->type == PIPE_SHADER_TESS_EVAL) {1118ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);1119switch (d->Semantic.Name) {1120case TGSI_SEMANTIC_CLIPDIST:1121break;1122case TGSI_SEMANTIC_PSIZE:1123ctx->shader->vs_out_misc_write = 1;1124ctx->shader->vs_out_point_size = 1;1125break;1126case TGSI_SEMANTIC_EDGEFLAG:1127ctx->shader->vs_out_misc_write = 1;1128ctx->shader->vs_out_edgeflag = 1;1129ctx->edgeflag_output = i;1130break;1131case TGSI_SEMANTIC_VIEWPORT_INDEX:1132ctx->shader->vs_out_misc_write = 1;1133ctx->shader->vs_out_viewport = 1;1134break;1135case TGSI_SEMANTIC_LAYER:1136ctx->shader->vs_out_misc_write = 1;1137ctx->shader->vs_out_layer = 1;1138break;1139case TGSI_SEMANTIC_CLIPVERTEX:1140ctx->clip_vertex_write = TRUE;1141ctx->cv_output = i;1142break;1143}1144if (ctx->type == PIPE_SHADER_GEOMETRY) {1145ctx->gs_out_ring_offset += 16;1146}1147} else if (ctx->type == PIPE_SHADER_FRAGMENT) {1148switch (d->Semantic.Name) {1149case TGSI_SEMANTIC_COLOR:1150ctx->shader->nr_ps_max_color_exports++;1151break;1152}1153}1154}1155ctx->shader->noutput += count;1156break;1157case TGSI_FILE_TEMPORARY:1158if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {1159if (d->Array.ArrayID) {1160bool spilled;1161unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx,1162d->Range.First,1163&spilled);11641165if (!spilled) {1166r600_add_gpr_array(ctx->shader, idx,1167d->Range.Last - d->Range.First + 1, 0x0F);1168}1169}1170}1171break;11721173case TGSI_FILE_CONSTANT:1174case TGSI_FILE_SAMPLER:1175case TGSI_FILE_SAMPLER_VIEW:1176case TGSI_FILE_ADDRESS:1177case TGSI_FILE_BUFFER:1178case TGSI_FILE_IMAGE:1179case TGSI_FILE_MEMORY:1180break;11811182case TGSI_FILE_HW_ATOMIC:1183i = ctx->shader->nhwatomic_ranges;1184ctx->shader->atomics[i].start = 
d->Range.First;1185ctx->shader->atomics[i].end = d->Range.Last;1186ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;1187ctx->shader->atomics[i].array_id = d->Array.ArrayID;1188ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;1189ctx->shader->nhwatomic_ranges++;1190ctx->shader->nhwatomic += count;1191break;11921193case TGSI_FILE_SYSTEM_VALUE:1194if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||1195d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||1196d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {1197break; /* Already handled from allocate_system_value_inputs */1198} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {1199break;1200} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)1201break;1202else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)1203break;1204else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||1205d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {1206int param = r600_get_lds_unique_index(d->Semantic.Name, 0);1207int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;1208unsigned temp_reg = r600_get_temp(ctx);12091210r = get_lds_offset0(ctx, 2, temp_reg, true);1211if (r)1212return r;12131214r = single_alu_op2(ctx, ALU_OP2_ADD_INT,1215temp_reg, 0,1216temp_reg, 0,1217V_SQ_ALU_SRC_LITERAL, param * 16);1218if (r)1219return r;12201221do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);1222}1223else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {1224/* MOV r1.x, r0.x;1225MOV r1.y, r0.y;1226*/1227for (i = 0; i < 2; i++) {1228struct r600_bytecode_alu alu;1229memset(&alu, 0, sizeof(struct r600_bytecode_alu));1230alu.op = ALU_OP1_MOV;1231alu.src[0].sel = 0;1232alu.src[0].chan = 0 + i;1233alu.dst.sel = 1;1234alu.dst.chan = 0 + i;1235alu.dst.write = 1;1236alu.last = (i == 1) ? 
1 : 0;1237if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))1238return r;1239}1240/* ADD r1.z, 1.0f, -r0.x */1241struct r600_bytecode_alu alu;1242memset(&alu, 0, sizeof(struct r600_bytecode_alu));1243alu.op = ALU_OP2_ADD;1244alu.src[0].sel = V_SQ_ALU_SRC_1;1245alu.src[1].sel = 1;1246alu.src[1].chan = 0;1247alu.src[1].neg = 1;1248alu.dst.sel = 1;1249alu.dst.chan = 2;1250alu.dst.write = 1;1251alu.last = 1;1252if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))1253return r;12541255/* ADD r1.z, r1.z, -r1.y */1256alu.op = ALU_OP2_ADD;1257alu.src[0].sel = 1;1258alu.src[0].chan = 2;1259alu.src[1].sel = 1;1260alu.src[1].chan = 1;1261alu.src[1].neg = 1;1262alu.dst.sel = 1;1263alu.dst.chan = 2;1264alu.dst.write = 1;1265alu.last = 1;1266if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))1267return r;1268break;1269}1270break;1271default:1272R600_ERR("unsupported file %d declaration\n", d->Declaration.File);1273return -EINVAL;1274}1275return 0;1276}12771278static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)1279{1280struct tgsi_parse_context parse;1281struct {1282boolean enabled;1283int *reg;1284unsigned name, alternate_name;1285} inputs[2] = {1286{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */12871288{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */1289};1290int num_regs = 0;1291unsigned k, i;12921293if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {1294return 0;1295}12961297/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */1298while (!tgsi_parse_end_of_tokens(&parse)) {1299tgsi_parse_token(&parse);13001301if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {1302const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;1303if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||1304inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET 
||1305inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)1306{1307int interpolate, location, k;13081309if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {1310location = TGSI_INTERPOLATE_LOC_CENTER;1311} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {1312location = TGSI_INTERPOLATE_LOC_CENTER;1313/* Needs sample positions, currently those are always available */1314} else {1315location = TGSI_INTERPOLATE_LOC_CENTROID;1316}13171318interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];1319k = eg_get_interpolator_index(interpolate, location);1320if (k >= 0)1321ctx->eg_interpolators[k].enabled = true;1322}1323} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {1324struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;1325if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {1326for (k = 0; k < ARRAY_SIZE(inputs); k++) {1327if (d->Semantic.Name == inputs[k].name ||1328d->Semantic.Name == inputs[k].alternate_name) {1329inputs[k].enabled = true;1330}1331}1332}1333}1334}13351336tgsi_parse_free(&parse);13371338if (ctx->info.reads_samplemask &&1339(ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {1340inputs[1].enabled = true;1341}13421343if (ctx->bc->chip_class >= EVERGREEN) {1344int num_baryc = 0;1345/* assign gpr to each interpolator according to priority */1346for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {1347if (ctx->eg_interpolators[i].enabled) {1348ctx->eg_interpolators[i].ij_index = num_baryc;1349num_baryc++;1350}1351}1352num_baryc = (num_baryc + 1) >> 1;1353gpr_offset += num_baryc;1354}13551356for (i = 0; i < ARRAY_SIZE(inputs); i++) {1357boolean enabled = inputs[i].enabled;1358int *reg = inputs[i].reg;1359unsigned name = inputs[i].name;13601361if (enabled) {1362int gpr = gpr_offset + num_regs++;1363ctx->shader->nsys_inputs++;13641365// add to inputs, allocate a gpr1366k = ctx->shader->ninput++;1367ctx->shader->input[k].name = name;1368ctx->shader->input[k].sid = 
0;1369ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;1370ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;1371*reg = ctx->shader->input[k].gpr = gpr;1372}1373}13741375return gpr_offset + num_regs;1376}13771378/*1379* for evergreen we need to scan the shader to find the number of GPRs we need to1380* reserve for interpolation and system values1381*1382* we need to know if we are going to emit any sample or centroid inputs1383* if perspective and linear are required1384*/1385static int evergreen_gpr_count(struct r600_shader_ctx *ctx)1386{1387unsigned i;13881389memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));13901391/*1392* Could get this information from the shader info. But right now1393* we interpolate all declared inputs, whereas the shader info will1394* only contain the bits if the inputs are actually used, so it might1395* not be safe...1396*/1397for (i = 0; i < ctx->info.num_inputs; i++) {1398int k;1399/* skip position/face/mask/sampleid */1400if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||1401ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||1402ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||1403ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)1404continue;14051406k = eg_get_interpolator_index(1407ctx->info.input_interpolate[i],1408ctx->info.input_interpolate_loc[i]);1409if (k >= 0)1410ctx->eg_interpolators[k].enabled = TRUE;1411}14121413/* XXX PULL MODEL and LINE STIPPLE */14141415return allocate_system_value_inputs(ctx, 0);1416}14171418/* sample_id_sel == NULL means fetch for current sample */1419static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)1420{1421struct r600_bytecode_vtx vtx;1422int r, t1;14231424t1 = r600_get_temp(ctx);14251426memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));1427vtx.op = FETCH_OP_VFETCH;1428vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;1429vtx.fetch_type = 
SQ_VTX_FETCH_NO_INDEX_OFFSET;1430if (sample_id == NULL) {1431assert(ctx->fixed_pt_position_gpr != -1);14321433vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;1434vtx.src_sel_x = 3;1435}1436else {1437struct r600_bytecode_alu alu;14381439memset(&alu, 0, sizeof(struct r600_bytecode_alu));1440alu.op = ALU_OP1_MOV;1441r600_bytecode_src(&alu.src[0], sample_id, chan_sel);1442alu.dst.sel = t1;1443alu.dst.write = 1;1444alu.last = 1;1445r = r600_bytecode_add_alu(ctx->bc, &alu);1446if (r)1447return r;14481449vtx.src_gpr = t1;1450vtx.src_sel_x = 0;1451}1452vtx.mega_fetch_count = 16;1453vtx.dst_gpr = t1;1454vtx.dst_sel_x = 0;1455vtx.dst_sel_y = 1;1456vtx.dst_sel_z = 2;1457vtx.dst_sel_w = 3;1458vtx.data_format = FMT_32_32_32_32_FLOAT;1459vtx.num_format_all = 2;1460vtx.format_comp_all = 1;1461vtx.use_const_fields = 0;1462vtx.offset = 0;1463vtx.endian = r600_endian_swap(32);1464vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */14651466r = r600_bytecode_add_vtx(ctx->bc, &vtx);1467if (r)1468return r;14691470return t1;1471}14721473static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)1474{1475int r;1476struct r600_bytecode_alu alu;14771478/* do a vtx fetch with wqm set on the vtx fetch */1479memset(&alu, 0, sizeof(struct r600_bytecode_alu));1480alu.op = ALU_OP1_MOV;1481alu.dst.sel = ctx->helper_invoc_reg;1482alu.dst.chan = 0;1483alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;1484alu.src[0].value = 0xffffffff;1485alu.dst.write = 1;1486alu.last = 1;1487r = r600_bytecode_add_alu(ctx->bc, &alu);1488if (r)1489return r;14901491/* do a vtx fetch in VPM mode */1492struct r600_bytecode_vtx vtx;1493memset(&vtx, 0, sizeof(vtx));1494vtx.op = FETCH_OP_GET_BUFFER_RESINFO;1495vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;1496vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;1497vtx.src_gpr = 0;1498vtx.mega_fetch_count = 16; /* no idea here really... 
*/1499vtx.dst_gpr = ctx->helper_invoc_reg;1500vtx.dst_sel_x = 4;1501vtx.dst_sel_y = 7; /* SEL_Y */1502vtx.dst_sel_z = 7; /* SEL_Z */1503vtx.dst_sel_w = 7; /* SEL_W */1504vtx.data_format = FMT_32;1505if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))1506return r;1507ctx->bc->cf_last->vpm = 1;1508return 0;1509}15101511static int cm_load_helper_invocation(struct r600_shader_ctx *ctx)1512{1513int r;1514struct r600_bytecode_alu alu;15151516memset(&alu, 0, sizeof(struct r600_bytecode_alu));1517alu.op = ALU_OP1_MOV;1518alu.dst.sel = ctx->helper_invoc_reg;1519alu.dst.chan = 0;1520alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;1521alu.src[0].value = 0xffffffff;1522alu.dst.write = 1;1523alu.last = 1;1524r = r600_bytecode_add_alu(ctx->bc, &alu);1525if (r)1526return r;15271528memset(&alu, 0, sizeof(struct r600_bytecode_alu));1529alu.op = ALU_OP1_MOV;1530alu.dst.sel = ctx->helper_invoc_reg;1531alu.dst.chan = 0;1532alu.src[0].sel = V_SQ_ALU_SRC_0;1533alu.dst.write = 1;1534alu.last = 1;1535r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);1536if (r)1537return r;15381539return ctx->helper_invoc_reg;1540}15411542static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)1543{1544struct r600_bytecode_vtx vtx;1545int r, t1;15461547if (ctx->cs_block_size_loaded)1548return ctx->cs_block_size_reg;1549if (ctx->cs_grid_size_loaded)1550return ctx->cs_grid_size_reg;15511552t1 = load_block ? 
ctx->cs_block_size_reg : ctx->cs_grid_size_reg;1553struct r600_bytecode_alu alu;1554memset(&alu, 0, sizeof(struct r600_bytecode_alu));1555alu.op = ALU_OP1_MOV;1556alu.src[0].sel = V_SQ_ALU_SRC_0;1557alu.dst.sel = t1;1558alu.dst.write = 1;1559alu.last = 1;1560r = r600_bytecode_add_alu(ctx->bc, &alu);1561if (r)1562return r;15631564memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));1565vtx.op = FETCH_OP_VFETCH;1566vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;1567vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;1568vtx.src_gpr = t1;1569vtx.src_sel_x = 0;15701571vtx.mega_fetch_count = 16;1572vtx.dst_gpr = t1;1573vtx.dst_sel_x = 0;1574vtx.dst_sel_y = 1;1575vtx.dst_sel_z = 2;1576vtx.dst_sel_w = 7;1577vtx.data_format = FMT_32_32_32_32;1578vtx.num_format_all = 1;1579vtx.format_comp_all = 0;1580vtx.use_const_fields = 0;1581vtx.offset = load_block ? 0 : 16; // first element is size of buffer1582vtx.endian = r600_endian_swap(32);1583vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */15841585r = r600_bytecode_add_vtx(ctx->bc, &vtx);1586if (r)1587return r;15881589if (load_block)1590ctx->cs_block_size_loaded = true;1591else1592ctx->cs_grid_size_loaded = true;1593return t1;1594}15951596static void tgsi_src(struct r600_shader_ctx *ctx,1597const struct tgsi_full_src_register *tgsi_src,1598struct r600_shader_src *r600_src)1599{1600memset(r600_src, 0, sizeof(*r600_src));1601r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;1602r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;1603r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;1604r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;1605r600_src->neg = tgsi_src->Register.Negate;1606r600_src->abs = tgsi_src->Register.Absolute;16071608if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) {1609bool spilled;1610unsigned idx;16111612idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled);16131614if (spilled) {1615int reg = r600_get_temp(ctx);1616int r;16171618r600_src->sel = reg;16191620if (ctx->bc->chip_class < R700) 
{1621struct r600_bytecode_output cf;16221623memset(&cf, 0, sizeof(struct r600_bytecode_output));1624cf.op = CF_OP_MEM_SCRATCH;1625cf.elem_size = 3;1626cf.gpr = reg;1627cf.comp_mask = 0xF;1628cf.swizzle_x = 0;1629cf.swizzle_y = 1;1630cf.swizzle_z = 2;1631cf.swizzle_w = 3;1632cf.burst_count = 1;16331634get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,1635&cf.array_base, &cf.array_size);16361637if (tgsi_src->Register.Indirect) {1638cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;1639cf.index_gpr = ctx->bc->ar_reg;1640}1641else {1642cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ;1643cf.array_base += idx;1644cf.array_size = 0;1645}16461647r = r600_bytecode_add_output(ctx->bc, &cf);1648}1649else {1650struct r600_bytecode_vtx vtx;16511652if (r600_bytecode_get_need_wait_ack(ctx->bc)) {1653r600_bytecode_need_wait_ack(ctx->bc, false);1654r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);1655}16561657memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));1658vtx.op = FETCH_OP_READ_SCRATCH;1659vtx.dst_gpr = reg;1660vtx.uncached = 1; // Must bypass cache since prior spill written in same invocation1661vtx.elem_size = 3;1662vtx.data_format = FMT_32_32_32_32;1663vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT;1664vtx.dst_sel_x = tgsi_src->Register.SwizzleX;1665vtx.dst_sel_y = tgsi_src->Register.SwizzleY;1666vtx.dst_sel_z = tgsi_src->Register.SwizzleZ;1667vtx.dst_sel_w = tgsi_src->Register.SwizzleW;16681669get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,1670&vtx.array_base, &vtx.array_size);16711672if (tgsi_src->Register.Indirect) {1673vtx.indexed = 1;1674vtx.src_gpr = ctx->bc->ar_reg;1675}1676else {1677vtx.array_base += idx;1678vtx.array_size = 0;1679}16801681r = r600_bytecode_add_vtx(ctx->bc, &vtx);1682}16831684if (r)1685return;1686}1687else {1688if (tgsi_src->Register.Indirect)1689r600_src->rel = V_SQ_REL_RELATIVE;16901691r600_src->sel = idx;1692}16931694return;1695}16961697if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {1698int 
index;1699if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&1700(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&1701(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {17021703index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;1704r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel);1705if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)1706return;1707}1708index = tgsi_src->Register.Index;1709r600_src->sel = V_SQ_ALU_SRC_LITERAL;1710memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));1711} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {1712if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {1713r600_src->swizzle[0] = 2; // Z value1714r600_src->swizzle[1] = 2;1715r600_src->swizzle[2] = 2;1716r600_src->swizzle[3] = 2;1717r600_src->sel = ctx->face_gpr;1718} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {1719r600_src->swizzle[0] = 3; // W value1720r600_src->swizzle[1] = 3;1721r600_src->swizzle[2] = 3;1722r600_src->swizzle[3] = 3;1723r600_src->sel = ctx->fixed_pt_position_gpr;1724} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {1725r600_src->swizzle[0] = 0;1726r600_src->swizzle[1] = 1;1727r600_src->swizzle[2] = 4;1728r600_src->swizzle[3] = 4;1729r600_src->sel = load_sample_position(ctx, NULL, -1);1730} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {1731r600_src->swizzle[0] = 3;1732r600_src->swizzle[1] = 3;1733r600_src->swizzle[2] = 3;1734r600_src->swizzle[3] = 3;1735r600_src->sel = 0;1736} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {1737r600_src->swizzle[0] = 0;1738r600_src->swizzle[1] = 0;1739r600_src->swizzle[2] = 0;1740r600_src->swizzle[3] = 0;1741r600_src->sel = 0;1742} else if 
(ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {1743r600_src->sel = 0;1744} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {1745r600_src->sel = 1;1746} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {1747r600_src->swizzle[0] = 3;1748r600_src->swizzle[1] = 3;1749r600_src->swizzle[2] = 3;1750r600_src->swizzle[3] = 3;1751r600_src->sel = 1;1752} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {1753r600_src->swizzle[0] = 2;1754r600_src->swizzle[1] = 2;1755r600_src->swizzle[2] = 2;1756r600_src->swizzle[3] = 2;1757r600_src->sel = 0;1758} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {1759r600_src->sel = 1;1760} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {1761r600_src->sel = 3;1762} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {1763r600_src->sel = 2;1764} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {1765r600_src->sel = ctx->tess_input_info;1766r600_src->swizzle[0] = 2;1767r600_src->swizzle[1] = 2;1768r600_src->swizzle[2] = 2;1769r600_src->swizzle[3] = 2;1770} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {1771r600_src->sel = 0;1772r600_src->swizzle[0] = 0;1773r600_src->swizzle[1] = 0;1774r600_src->swizzle[2] = 0;1775r600_src->swizzle[3] = 0;1776} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {1777r600_src->sel = 0;1778r600_src->swizzle[0] = 3;1779r600_src->swizzle[1] = 3;1780r600_src->swizzle[2] = 3;1781r600_src->swizzle[3] = 3;1782} else 
if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {1783r600_src->sel = load_block_grid_size(ctx, false);1784} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {1785r600_src->sel = load_block_grid_size(ctx, true);1786} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {1787r600_src->sel = ctx->helper_invoc_reg;1788r600_src->swizzle[0] = 0;1789r600_src->swizzle[1] = 0;1790r600_src->swizzle[2] = 0;1791r600_src->swizzle[3] = 0;1792}1793} else {1794if (tgsi_src->Register.Indirect)1795r600_src->rel = V_SQ_REL_RELATIVE;1796r600_src->sel = tgsi_src->Register.Index;1797r600_src->sel += ctx->file_offset[tgsi_src->Register.File];1798}1799if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {1800if (tgsi_src->Register.Dimension) {1801r600_src->kc_bank = tgsi_src->Dimension.Index;1802if (tgsi_src->Dimension.Indirect) {1803r600_src->kc_rel = 1;1804}1805}1806}1807}18081809static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,1810unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,1811unsigned int dst_reg)1812{1813struct r600_bytecode_vtx vtx;1814unsigned int ar_reg;1815int r;18161817if (offset) {1818struct r600_bytecode_alu alu;18191820memset(&alu, 0, sizeof(alu));18211822alu.op = ALU_OP2_ADD_INT;1823alu.src[0].sel = ctx->bc->ar_reg;1824alu.src[0].chan = ar_chan;18251826alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;1827alu.src[1].value = offset;18281829alu.dst.sel = dst_reg;1830alu.dst.chan = ar_chan;1831alu.dst.write = 1;1832alu.last = 1;18331834if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))1835return r;18361837ar_reg = dst_reg;1838} else {1839ar_reg = ctx->bc->ar_reg;1840}18411842memset(&vtx, 0, sizeof(vtx));1843vtx.buffer_id = cb_idx;1844vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;1845vtx.src_gpr = ar_reg;1846vtx.src_sel_x = ar_chan;1847vtx.mega_fetch_count = 16;1848vtx.dst_gpr = 
dst_reg;1849vtx.dst_sel_x = 0; /* SEL_X */1850vtx.dst_sel_y = 1; /* SEL_Y */1851vtx.dst_sel_z = 2; /* SEL_Z */1852vtx.dst_sel_w = 3; /* SEL_W */1853vtx.data_format = FMT_32_32_32_32_FLOAT;1854vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */1855vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */1856vtx.endian = r600_endian_swap(32);1857vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;18581859if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))1860return r;18611862return 0;1863}18641865static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)1866{1867struct r600_bytecode_vtx vtx;1868int r;1869unsigned index = src->Register.Index;1870unsigned vtx_id = src->Dimension.Index;1871int offset_reg = ctx->gs_rotated_input[vtx_id / 3];1872int offset_chan = vtx_id % 3;1873int t2 = 0;18741875/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,1876* R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */18771878if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)1879offset_chan = 3;18801881if (src->Dimension.Indirect || src->Register.Indirect)1882t2 = r600_get_temp(ctx);18831884if (src->Dimension.Indirect) {1885int treg[3];1886struct r600_bytecode_alu alu;1887int r, i;1888unsigned addr_reg;1889addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);1890if (src->DimIndirect.Index > 0) {1891r = single_alu_op2(ctx, ALU_OP1_MOV,1892ctx->bc->ar_reg, 0,1893addr_reg, 0,18940, 0);1895if (r)1896return r;1897}1898/*1899we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.1900at least this is what fglrx seems to do. */1901for (i = 0; i < 3; i++) {1902treg[i] = r600_get_temp(ctx);1903}1904r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);19051906for (i = 0; i < 3; i++) {1907memset(&alu, 0, sizeof(struct r600_bytecode_alu));1908alu.op = ALU_OP1_MOV;1909alu.src[0].sel = ctx->gs_rotated_input[0];1910alu.src[0].chan = i == 2 ? 
3 : i;1911alu.dst.sel = treg[i];1912alu.dst.chan = 0;1913alu.dst.write = 1;1914alu.last = 1;1915r = r600_bytecode_add_alu(ctx->bc, &alu);1916if (r)1917return r;1918}1919memset(&alu, 0, sizeof(struct r600_bytecode_alu));1920alu.op = ALU_OP1_MOV;1921alu.src[0].sel = treg[0];1922alu.src[0].rel = 1;1923alu.dst.sel = t2;1924alu.dst.write = 1;1925alu.last = 1;1926r = r600_bytecode_add_alu(ctx->bc, &alu);1927if (r)1928return r;1929offset_reg = t2;1930offset_chan = 0;1931}19321933if (src->Register.Indirect) {1934int addr_reg;1935unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];19361937addr_reg = get_address_file_reg(ctx, src->Indirect.Index);19381939/* pull the value from index_reg */1940r = single_alu_op2(ctx, ALU_OP2_ADD_INT,1941t2, 1,1942addr_reg, 0,1943V_SQ_ALU_SRC_LITERAL, first);1944if (r)1945return r;1946r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,1947t2, 0,1948t2, 1,1949V_SQ_ALU_SRC_LITERAL, 4,1950offset_reg, offset_chan);1951if (r)1952return r;1953offset_reg = t2;1954offset_chan = 0;1955index = src->Register.Index - first;1956}19571958memset(&vtx, 0, sizeof(vtx));1959vtx.buffer_id = R600_GS_RING_CONST_BUFFER;1960vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;1961vtx.src_gpr = offset_reg;1962vtx.src_sel_x = offset_chan;1963vtx.offset = index * 16; /*bytes*/1964vtx.mega_fetch_count = 16;1965vtx.dst_gpr = dst_reg;1966vtx.dst_sel_x = 0; /* SEL_X */1967vtx.dst_sel_y = 1; /* SEL_Y */1968vtx.dst_sel_z = 2; /* SEL_Z */1969vtx.dst_sel_w = 3; /* SEL_W */1970if (ctx->bc->chip_class >= EVERGREEN) {1971vtx.use_const_fields = 1;1972} else {1973vtx.data_format = FMT_32_32_32_32_FLOAT;1974}19751976if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))1977return r;19781979return 0;1980}19811982static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)1983{1984struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;1985unsigned i;19861987for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {1988struct tgsi_full_src_register *src = 
&inst->Src[i];19891990if (src->Register.File == TGSI_FILE_INPUT) {1991if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {1992/* primitive id is in R0.z */1993ctx->src[i].sel = 0;1994ctx->src[i].swizzle[0] = 2;1995}1996}1997if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {1998int treg = r600_get_temp(ctx);19992000fetch_gs_input(ctx, src, treg);2001ctx->src[i].sel = treg;2002ctx->src[i].rel = 0;2003}2004}2005return 0;2006}200720082009/* Tessellation shaders pass outputs to the next shader using LDS.2010*2011* LS outputs = TCS(HS) inputs2012* TCS(HS) outputs = TES(DS) inputs2013*2014* The LDS layout is:2015* - TCS inputs for patch 02016* - TCS inputs for patch 12017* - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)2018* - ...2019* - TCS outputs for patch 0 = get_tcs_out_patch0_offset2020* - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset2021* - TCS outputs for patch 12022* - Per-patch TCS outputs for patch 12023* - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)2024* - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)2025* - ...2026*2027* All three shaders VS(LS), TCS, TES share the same LDS space.2028*/2029/* this will return with the dw address in temp_reg.x */2030static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,2031const struct tgsi_full_dst_register *dst,2032const struct tgsi_full_src_register *src,2033int stride_bytes_reg, int stride_bytes_chan)2034{2035struct tgsi_full_dst_register reg;2036ubyte *name, *index, *array_first;2037int r;2038int param;2039struct tgsi_shader_info *info = &ctx->info;2040/* Set the register description. The address computation is the same2041* for sources and destinations. 
*/2042if (src) {2043reg.Register.File = src->Register.File;2044reg.Register.Index = src->Register.Index;2045reg.Register.Indirect = src->Register.Indirect;2046reg.Register.Dimension = src->Register.Dimension;2047reg.Indirect = src->Indirect;2048reg.Dimension = src->Dimension;2049reg.DimIndirect = src->DimIndirect;2050} else2051reg = *dst;20522053/* If the register is 2-dimensional (e.g. an array of vertices2054* in a primitive), calculate the base address of the vertex. */2055if (reg.Register.Dimension) {2056int sel, chan;2057if (reg.Dimension.Indirect) {2058unsigned addr_reg;2059assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);20602061addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);2062/* pull the value from index_reg */2063sel = addr_reg;2064chan = 0;2065} else {2066sel = V_SQ_ALU_SRC_LITERAL;2067chan = reg.Dimension.Index;2068}20692070r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,2071temp_reg, 0,2072stride_bytes_reg, stride_bytes_chan,2073sel, chan,2074temp_reg, 0);2075if (r)2076return r;2077}20782079if (reg.Register.File == TGSI_FILE_INPUT) {2080name = info->input_semantic_name;2081index = info->input_semantic_index;2082array_first = info->input_array_first;2083} else if (reg.Register.File == TGSI_FILE_OUTPUT) {2084name = info->output_semantic_name;2085index = info->output_semantic_index;2086array_first = info->output_array_first;2087} else {2088assert(0);2089return -1;2090}2091if (reg.Register.Indirect) {2092int addr_reg;2093int first;2094/* Add the relative address of the element. 
*/2095if (reg.Indirect.ArrayID)2096first = array_first[reg.Indirect.ArrayID];2097else2098first = reg.Register.Index;20992100addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);21012102/* pull the value from index_reg */2103r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,2104temp_reg, 0,2105V_SQ_ALU_SRC_LITERAL, 16,2106addr_reg, 0,2107temp_reg, 0);2108if (r)2109return r;21102111param = r600_get_lds_unique_index(name[first],2112index[first]);21132114} else {2115param = r600_get_lds_unique_index(name[reg.Register.Index],2116index[reg.Register.Index]);2117}21182119/* add to base_addr - passed in temp_reg.x */2120if (param) {2121r = single_alu_op2(ctx, ALU_OP2_ADD_INT,2122temp_reg, 0,2123temp_reg, 0,2124V_SQ_ALU_SRC_LITERAL, param * 16);2125if (r)2126return r;21272128}2129return 0;2130}21312132static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,2133unsigned dst_reg, unsigned mask)2134{2135struct r600_bytecode_alu alu;2136int r, i, lasti;21372138if ((ctx->bc->cf_last->ndw>>1) >= 0x60)2139ctx->bc->force_add_cf = 1;21402141lasti = tgsi_last_instruction(mask);2142for (i = 1; i <= lasti; i++) {2143if (!(mask & (1 << i)))2144continue;21452146r = single_alu_op2(ctx, ALU_OP2_ADD_INT,2147temp_reg, i,2148temp_reg, 0,2149V_SQ_ALU_SRC_LITERAL, 4 * i);2150if (r)2151return r;2152}2153for (i = 0; i <= lasti; i++) {2154if (!(mask & (1 << i)))2155continue;21562157/* emit an LDS_READ_RET */2158memset(&alu, 0, sizeof(alu));2159alu.op = LDS_OP1_LDS_READ_RET;2160alu.src[0].sel = temp_reg;2161alu.src[0].chan = i;2162alu.src[1].sel = V_SQ_ALU_SRC_0;2163alu.src[2].sel = V_SQ_ALU_SRC_0;2164alu.dst.chan = 0;2165alu.is_lds_idx_op = true;2166alu.last = 1;2167r = r600_bytecode_add_alu(ctx->bc, &alu);2168if (r)2169return r;2170}2171for (i = 0; i <= lasti; i++) {2172if (!(mask & (1 << i)))2173continue;21742175/* then read from LDS_OQ_A_POP */2176memset(&alu, 0, sizeof(alu));21772178alu.op = ALU_OP1_MOV;2179alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;2180alu.src[0].chan 
= 0;2181alu.dst.sel = dst_reg;2182alu.dst.chan = i;2183alu.dst.write = 1;2184alu.last = 1;2185r = r600_bytecode_add_alu(ctx->bc, &alu);2186if (r)2187return r;2188}2189return 0;2190}21912192static int fetch_mask(struct tgsi_src_register *reg)2193{2194int mask = 0;2195mask |= 1 << reg->SwizzleX;2196mask |= 1 << reg->SwizzleY;2197mask |= 1 << reg->SwizzleZ;2198mask |= 1 << reg->SwizzleW;2199return mask;2200}22012202static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)2203{2204int r;2205unsigned temp_reg = r600_get_temp(ctx);22062207r = get_lds_offset0(ctx, 2, temp_reg,2208src->Register.Dimension ? false : true);2209if (r)2210return r;22112212/* the base address is now in temp.x */2213r = r600_get_byte_address(ctx, temp_reg,2214NULL, src, ctx->tess_output_info, 1);2215if (r)2216return r;22172218r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));2219if (r)2220return r;2221return 0;2222}22232224static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)2225{2226int r;2227unsigned temp_reg = r600_get_temp(ctx);22282229/* t.x = ips * r0.y */2230r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,2231temp_reg, 0,2232ctx->tess_input_info, 0,22330, 1);22342235if (r)2236return r;22372238/* the base address is now in temp.x */2239r = r600_get_byte_address(ctx, temp_reg,2240NULL, src, ctx->tess_input_info, 1);2241if (r)2242return r;22432244r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));2245if (r)2246return r;2247return 0;2248}22492250static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)2251{2252int r;2253unsigned temp_reg = r600_get_temp(ctx);22542255r = get_lds_offset0(ctx, 1, temp_reg,2256src->Register.Dimension ? 
false : true);2257if (r)2258return r;2259/* the base address is now in temp.x */2260r = r600_get_byte_address(ctx, temp_reg,2261NULL, src,2262ctx->tess_output_info, 1);2263if (r)2264return r;22652266r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));2267if (r)2268return r;2269return 0;2270}22712272static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)2273{2274struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;2275unsigned i;22762277for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {2278struct tgsi_full_src_register *src = &inst->Src[i];22792280if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {2281int treg = r600_get_temp(ctx);2282fetch_tes_input(ctx, src, treg);2283ctx->src[i].sel = treg;2284ctx->src[i].rel = 0;2285}2286if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {2287int treg = r600_get_temp(ctx);2288fetch_tcs_input(ctx, src, treg);2289ctx->src[i].sel = treg;2290ctx->src[i].rel = 0;2291}2292if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {2293int treg = r600_get_temp(ctx);2294fetch_tcs_output(ctx, src, treg);2295ctx->src[i].sel = treg;2296ctx->src[i].rel = 0;2297}2298}2299return 0;2300}23012302static int tgsi_split_constant(struct r600_shader_ctx *ctx)2303{2304struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;2305struct r600_bytecode_alu alu;2306int i, j, k, nconst, r;23072308for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {2309if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {2310nconst++;2311}2312tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);2313}2314for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {2315if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {2316continue;2317}23182319if (ctx->src[i].rel) {2320int chan = inst->Src[i].Indirect.Swizzle;2321int treg = r600_get_temp(ctx);2322if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, 
ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))2323return r;23242325ctx->src[i].kc_bank = 0;2326ctx->src[i].kc_rel = 0;2327ctx->src[i].sel = treg;2328ctx->src[i].rel = 0;2329j--;2330} else if (j > 0) {2331int treg = r600_get_temp(ctx);2332for (k = 0; k < 4; k++) {2333memset(&alu, 0, sizeof(struct r600_bytecode_alu));2334alu.op = ALU_OP1_MOV;2335alu.src[0].sel = ctx->src[i].sel;2336alu.src[0].chan = k;2337alu.src[0].rel = ctx->src[i].rel;2338alu.src[0].kc_bank = ctx->src[i].kc_bank;2339alu.src[0].kc_rel = ctx->src[i].kc_rel;2340alu.dst.sel = treg;2341alu.dst.chan = k;2342alu.dst.write = 1;2343if (k == 3)2344alu.last = 1;2345r = r600_bytecode_add_alu(ctx->bc, &alu);2346if (r)2347return r;2348}2349ctx->src[i].sel = treg;2350ctx->src[i].rel =0;2351j--;2352}2353}2354return 0;2355}23562357/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */2358static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)2359{2360struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;2361struct r600_bytecode_alu alu;2362int i, j, k, nliteral, r;23632364for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {2365if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {2366nliteral++;2367}2368}2369for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {2370if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {2371int treg = r600_get_temp(ctx);2372for (k = 0; k < 4; k++) {2373memset(&alu, 0, sizeof(struct r600_bytecode_alu));2374alu.op = ALU_OP1_MOV;2375alu.src[0].sel = ctx->src[i].sel;2376alu.src[0].chan = k;2377alu.src[0].value = ctx->src[i].value[k];2378alu.dst.sel = treg;2379alu.dst.chan = k;2380alu.dst.write = 1;2381if (k == 3)2382alu.last = 1;2383r = r600_bytecode_add_alu(ctx->bc, &alu);2384if (r)2385return r;2386}2387ctx->src[i].sel = treg;2388j--;2389}2390}2391return 0;2392}23932394static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)2395{2396int i, r, count = 
ctx->shader->ninput;23972398for (i = 0; i < count; i++) {2399if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {2400r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);2401if (r)2402return r;2403}2404}2405return 0;2406}24072408static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,2409int stream, unsigned *stream_item_size UNUSED)2410{2411unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];2412unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];2413int j, r;2414unsigned i;24152416/* Sanity checking. */2417if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {2418R600_ERR("Too many stream outputs: %d\n", so->num_outputs);2419r = -EINVAL;2420goto out_err;2421}2422for (i = 0; i < so->num_outputs; i++) {2423if (so->output[i].output_buffer >= 4) {2424R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",2425so->output[i].output_buffer);2426r = -EINVAL;2427goto out_err;2428}2429}24302431/* Initialize locations where the outputs are stored. */2432for (i = 0; i < so->num_outputs; i++) {24332434so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;2435start_comp[i] = so->output[i].start_component;2436/* Lower outputs with dst_offset < start_component.2437*2438* We can only output 4D vectors with a write mask, e.g. we can2439* only output the W component at offset 3, etc. If we want2440* to store Y, Z, or W at buffer offset 0, we need to use MOV2441* to move it to X and output X. 
*/2442if (so->output[i].dst_offset < so->output[i].start_component) {2443unsigned tmp = r600_get_temp(ctx);24442445for (j = 0; j < so->output[i].num_components; j++) {2446struct r600_bytecode_alu alu;2447memset(&alu, 0, sizeof(struct r600_bytecode_alu));2448alu.op = ALU_OP1_MOV;2449alu.src[0].sel = so_gpr[i];2450alu.src[0].chan = so->output[i].start_component + j;24512452alu.dst.sel = tmp;2453alu.dst.chan = j;2454alu.dst.write = 1;2455if (j == so->output[i].num_components - 1)2456alu.last = 1;2457r = r600_bytecode_add_alu(ctx->bc, &alu);2458if (r)2459return r;2460}2461start_comp[i] = 0;2462so_gpr[i] = tmp;2463}2464}24652466/* Write outputs to buffers. */2467for (i = 0; i < so->num_outputs; i++) {2468struct r600_bytecode_output output;24692470if (stream != -1 && stream != so->output[i].stream)2471continue;24722473memset(&output, 0, sizeof(struct r600_bytecode_output));2474output.gpr = so_gpr[i];2475output.elem_size = so->output[i].num_components - 1;2476if (output.elem_size == 2)2477output.elem_size = 3; // 3 not supported, write 4 with junk at end2478output.array_base = so->output[i].dst_offset - start_comp[i];2479output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;2480output.burst_count = 1;2481/* array_size is an upper limit for the burst_count2482* with MEM_STREAM instructions */2483output.array_size = 0xFFF;2484output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];24852486if (ctx->bc->chip_class >= EVERGREEN) {2487switch (so->output[i].output_buffer) {2488case 0:2489output.op = CF_OP_MEM_STREAM0_BUF0;2490break;2491case 1:2492output.op = CF_OP_MEM_STREAM0_BUF1;2493break;2494case 2:2495output.op = CF_OP_MEM_STREAM0_BUF2;2496break;2497case 3:2498output.op = CF_OP_MEM_STREAM0_BUF3;2499break;2500}2501output.op += so->output[i].stream * 4;2502assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);2503ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;2504} else 
{2505switch (so->output[i].output_buffer) {2506case 0:2507output.op = CF_OP_MEM_STREAM0;2508break;2509case 1:2510output.op = CF_OP_MEM_STREAM1;2511break;2512case 2:2513output.op = CF_OP_MEM_STREAM2;2514break;2515case 3:2516output.op = CF_OP_MEM_STREAM3;2517break;2518}2519ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;2520}2521r = r600_bytecode_add_output(ctx->bc, &output);2522if (r)2523goto out_err;2524}2525return 0;2526out_err:2527return r;2528}25292530static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)2531{2532struct r600_bytecode_alu alu;2533unsigned reg;25342535if (!ctx->shader->vs_out_edgeflag)2536return;25372538reg = ctx->shader->output[ctx->edgeflag_output].gpr;25392540/* clamp(x, 0, 1) */2541memset(&alu, 0, sizeof(alu));2542alu.op = ALU_OP1_MOV;2543alu.src[0].sel = reg;2544alu.dst.sel = reg;2545alu.dst.write = 1;2546alu.dst.clamp = 1;2547alu.last = 1;2548r600_bytecode_add_alu(ctx->bc, &alu);25492550memset(&alu, 0, sizeof(alu));2551alu.op = ALU_OP1_FLT_TO_INT;2552alu.src[0].sel = reg;2553alu.dst.sel = reg;2554alu.dst.write = 1;2555alu.last = 1;2556r600_bytecode_add_alu(ctx->bc, &alu);2557}25582559int generate_gs_copy_shader(struct r600_context *rctx,2560struct r600_pipe_shader *gs,2561struct pipe_stream_output_info *so)2562{2563struct r600_shader_ctx ctx = {};2564struct r600_shader *gs_shader = &gs->shader;2565struct r600_pipe_shader *cshader;2566unsigned ocnt = gs_shader->noutput;2567struct r600_bytecode_alu alu;2568struct r600_bytecode_vtx vtx;2569struct r600_bytecode_output output;2570struct r600_bytecode_cf *cf_jump, *cf_pop,2571*last_exp_pos = NULL, *last_exp_param = NULL;2572int next_clip_pos = 61, next_param = 0;2573unsigned i, j;2574int ring;2575bool only_ring_0 = true;2576cshader = calloc(1, sizeof(struct r600_pipe_shader));2577if (!cshader)2578return 0;25792580memcpy(cshader->shader.output, gs_shader->output, ocnt *2581sizeof(struct r600_shader_io));25822583cshader->shader.noutput = ocnt;25842585ctx.shader = 
&cshader->shader;2586ctx.bc = &ctx.shader->bc;2587ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;25882589r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,2590rctx->screen->has_compressed_msaa_texturing);25912592ctx.bc->isa = rctx->isa;25932594cf_jump = NULL;2595memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));25962597/* R0.x = R0.x & 0x3fffffff */2598memset(&alu, 0, sizeof(alu));2599alu.op = ALU_OP2_AND_INT;2600alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;2601alu.src[1].value = 0x3fffffff;2602alu.dst.write = 1;2603r600_bytecode_add_alu(ctx.bc, &alu);26042605/* R0.y = R0.x >> 30 */2606memset(&alu, 0, sizeof(alu));2607alu.op = ALU_OP2_LSHR_INT;2608alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;2609alu.src[1].value = 0x1e;2610alu.dst.chan = 1;2611alu.dst.write = 1;2612alu.last = 1;2613r600_bytecode_add_alu(ctx.bc, &alu);26142615/* fetch vertex data from GSVS ring */2616for (i = 0; i < ocnt; ++i) {2617struct r600_shader_io *out = &ctx.shader->output[i];26182619out->gpr = i + 1;2620out->ring_offset = i * 16;26212622memset(&vtx, 0, sizeof(vtx));2623vtx.op = FETCH_OP_VFETCH;2624vtx.buffer_id = R600_GS_RING_CONST_BUFFER;2625vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;2626vtx.mega_fetch_count = 16;2627vtx.offset = out->ring_offset;2628vtx.dst_gpr = out->gpr;2629vtx.src_gpr = 0;2630vtx.dst_sel_x = 0;2631vtx.dst_sel_y = 1;2632vtx.dst_sel_z = 2;2633vtx.dst_sel_w = 3;2634if (rctx->b.chip_class >= EVERGREEN) {2635vtx.use_const_fields = 1;2636} else {2637vtx.data_format = FMT_32_32_32_32_FLOAT;2638}26392640r600_bytecode_add_vtx(ctx.bc, &vtx);2641}2642ctx.temp_reg = i + 1;2643for (ring = 3; ring >= 0; --ring) {2644bool enabled = false;2645for (i = 0; i < so->num_outputs; i++) {2646if (so->output[i].stream == ring) {2647enabled = true;2648if (ring > 0)2649only_ring_0 = false;2650break;2651}2652}2653if (ring != 0 && !enabled) {2654cshader->shader.ring_item_sizes[ring] = 0;2655continue;2656}26572658if (cf_jump) {2659// Patch up jump 
label2660r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);2661cf_pop = ctx.bc->cf_last;26622663cf_jump->cf_addr = cf_pop->id + 2;2664cf_jump->pop_count = 1;2665cf_pop->cf_addr = cf_pop->id + 2;2666cf_pop->pop_count = 1;2667}26682669/* PRED_SETE_INT __, R0.y, ring */2670memset(&alu, 0, sizeof(alu));2671alu.op = ALU_OP2_PRED_SETE_INT;2672alu.src[0].chan = 1;2673alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;2674alu.src[1].value = ring;2675alu.execute_mask = 1;2676alu.update_pred = 1;2677alu.last = 1;2678r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);26792680r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);2681cf_jump = ctx.bc->cf_last;26822683if (enabled)2684emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);2685cshader->shader.ring_item_sizes[ring] = ocnt * 16;2686}26872688/* bc adds nops - copy it */2689if (ctx.bc->chip_class == R600) {2690memset(&alu, 0, sizeof(struct r600_bytecode_alu));2691alu.op = ALU_OP0_NOP;2692alu.last = 1;2693r600_bytecode_add_alu(ctx.bc, &alu);26942695r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);2696}26972698/* export vertex data */2699/* XXX factor out common code with r600_shader_from_tgsi ? 
*/2700for (i = 0; i < ocnt; ++i) {2701struct r600_shader_io *out = &ctx.shader->output[i];2702bool instream0 = true;2703if (out->name == TGSI_SEMANTIC_CLIPVERTEX)2704continue;27052706for (j = 0; j < so->num_outputs; j++) {2707if (so->output[j].register_index == i) {2708if (so->output[j].stream == 0)2709break;2710if (so->output[j].stream > 0)2711instream0 = false;2712}2713}2714if (!instream0)2715continue;2716memset(&output, 0, sizeof(output));2717output.gpr = out->gpr;2718output.elem_size = 3;2719output.swizzle_x = 0;2720output.swizzle_y = 1;2721output.swizzle_z = 2;2722output.swizzle_w = 3;2723output.burst_count = 1;2724output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;2725output.op = CF_OP_EXPORT;2726switch (out->name) {2727case TGSI_SEMANTIC_POSITION:2728output.array_base = 60;2729output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;2730break;27312732case TGSI_SEMANTIC_PSIZE:2733output.array_base = 61;2734if (next_clip_pos == 61)2735next_clip_pos = 62;2736output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;2737output.swizzle_y = 7;2738output.swizzle_z = 7;2739output.swizzle_w = 7;2740ctx.shader->vs_out_misc_write = 1;2741ctx.shader->vs_out_point_size = 1;2742break;2743case TGSI_SEMANTIC_LAYER:2744if (out->spi_sid) {2745/* duplicate it as PARAM to pass to the pixel shader */2746output.array_base = next_param++;2747r600_bytecode_add_output(ctx.bc, &output);2748last_exp_param = ctx.bc->cf_last;2749}2750output.array_base = 61;2751if (next_clip_pos == 61)2752next_clip_pos = 62;2753output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;2754output.swizzle_x = 7;2755output.swizzle_y = 7;2756output.swizzle_z = 0;2757output.swizzle_w = 7;2758ctx.shader->vs_out_misc_write = 1;2759ctx.shader->vs_out_layer = 1;2760break;2761case TGSI_SEMANTIC_VIEWPORT_INDEX:2762if (out->spi_sid) {2763/* duplicate it as PARAM to pass to the pixel shader */2764output.array_base = next_param++;2765r600_bytecode_add_output(ctx.bc, &output);2766last_exp_param = 
ctx.bc->cf_last;2767}2768output.array_base = 61;2769if (next_clip_pos == 61)2770next_clip_pos = 62;2771output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;2772ctx.shader->vs_out_misc_write = 1;2773ctx.shader->vs_out_viewport = 1;2774output.swizzle_x = 7;2775output.swizzle_y = 7;2776output.swizzle_z = 7;2777output.swizzle_w = 0;2778break;2779case TGSI_SEMANTIC_CLIPDIST:2780/* spi_sid is 0 for clipdistance outputs that were generated2781* for clipvertex - we don't need to pass them to PS */2782ctx.shader->clip_dist_write = gs->shader.clip_dist_write;2783ctx.shader->cull_dist_write = gs->shader.cull_dist_write;2784ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;2785if (out->spi_sid) {2786/* duplicate it as PARAM to pass to the pixel shader */2787output.array_base = next_param++;2788r600_bytecode_add_output(ctx.bc, &output);2789last_exp_param = ctx.bc->cf_last;2790}2791output.array_base = next_clip_pos++;2792output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;2793break;2794case TGSI_SEMANTIC_FOG:2795output.swizzle_y = 4; /* 0 */2796output.swizzle_z = 4; /* 0 */2797output.swizzle_w = 5; /* 1 */2798break;2799default:2800output.array_base = next_param++;2801break;2802}2803r600_bytecode_add_output(ctx.bc, &output);2804if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)2805last_exp_param = ctx.bc->cf_last;2806else2807last_exp_pos = ctx.bc->cf_last;2808}28092810if (!last_exp_pos) {2811memset(&output, 0, sizeof(output));2812output.gpr = 0;2813output.elem_size = 3;2814output.swizzle_x = 7;2815output.swizzle_y = 7;2816output.swizzle_z = 7;2817output.swizzle_w = 7;2818output.burst_count = 1;2819output.type = 2;2820output.op = CF_OP_EXPORT;2821output.array_base = 60;2822output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;2823r600_bytecode_add_output(ctx.bc, &output);2824last_exp_pos = ctx.bc->cf_last;2825}28262827if (!last_exp_param) {2828memset(&output, 0, sizeof(output));2829output.gpr = 0;2830output.elem_size = 3;2831output.swizzle_x = 
7;2832output.swizzle_y = 7;2833output.swizzle_z = 7;2834output.swizzle_w = 7;2835output.burst_count = 1;2836output.type = 2;2837output.op = CF_OP_EXPORT;2838output.array_base = next_param++;2839output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;2840r600_bytecode_add_output(ctx.bc, &output);2841last_exp_param = ctx.bc->cf_last;2842}28432844last_exp_pos->op = CF_OP_EXPORT_DONE;2845last_exp_param->op = CF_OP_EXPORT_DONE;28462847r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);2848cf_pop = ctx.bc->cf_last;28492850cf_jump->cf_addr = cf_pop->id + 2;2851cf_jump->pop_count = 1;2852cf_pop->cf_addr = cf_pop->id + 2;2853cf_pop->pop_count = 1;28542855if (ctx.bc->chip_class == CAYMAN)2856cm_bytecode_add_cf_end(ctx.bc);2857else {2858r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);2859ctx.bc->cf_last->end_of_program = 1;2860}28612862gs->gs_copy_shader = cshader;2863cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;28642865ctx.bc->nstack = 1;28662867return r600_bytecode_build(ctx.bc);2868}28692870static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)2871{2872if (ind) {2873struct r600_bytecode_alu alu;2874int r;28752876memset(&alu, 0, sizeof(struct r600_bytecode_alu));2877alu.op = ALU_OP2_ADD_INT;2878alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];2879alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;2880alu.src[1].value = ctx->gs_out_ring_offset >> 4;2881alu.dst.sel = ctx->gs_export_gpr_tregs[idx];2882alu.dst.write = 1;2883alu.last = 1;2884r = r600_bytecode_add_alu(ctx->bc, &alu);2885if (r)2886return r;2887}2888return 0;2889}28902891static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)2892{2893struct r600_bytecode_output output;2894int ring_offset;2895unsigned i, k;2896int effective_stream = stream == -1 ? 
0 : stream;2897int idx = 0;28982899for (i = 0; i < ctx->shader->noutput; i++) {2900if (ctx->gs_for_vs) {2901/* for ES we need to lookup corresponding ring offset expected by GS2902* (map this output to GS input by name and sid) */2903/* FIXME precompute offsets */2904ring_offset = -1;2905for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {2906struct r600_shader_io *in = &ctx->gs_for_vs->input[k];2907struct r600_shader_io *out = &ctx->shader->output[i];2908if (in->name == out->name && in->sid == out->sid)2909ring_offset = in->ring_offset;2910}29112912if (ring_offset == -1)2913continue;2914} else {2915ring_offset = idx * 16;2916idx++;2917}29182919if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)2920continue;2921/* next_ring_offset after parsing input decls contains total size of2922* single vertex data, gs_next_vertex - current vertex index */2923if (!ind)2924ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;29252926memset(&output, 0, sizeof(struct r600_bytecode_output));2927output.gpr = ctx->shader->output[i].gpr;2928output.elem_size = 3;2929output.comp_mask = 0xF;2930output.burst_count = 1;29312932if (ind)2933output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;2934else2935output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;29362937switch (stream) {2938default:2939case 0:2940output.op = CF_OP_MEM_RING; break;2941case 1:2942output.op = CF_OP_MEM_RING1; break;2943case 2:2944output.op = CF_OP_MEM_RING2; break;2945case 3:2946output.op = CF_OP_MEM_RING3; break;2947}29482949if (ind) {2950output.array_base = ring_offset >> 2; /* in dwords */2951output.array_size = 0xfff;2952output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];2953} else2954output.array_base = ring_offset >> 2; /* in dwords */2955r600_bytecode_add_output(ctx->bc, &output);2956}29572958++ctx->gs_next_vertex;2959return 0;2960}296129622963static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)2964{2965int r;2966struct r600_bytecode_vtx vtx;2967int 
temp_val = ctx->temp_reg;2968/* need to store the TCS output somewhere */2969r = single_alu_op2(ctx, ALU_OP1_MOV,2970temp_val, 0,2971V_SQ_ALU_SRC_LITERAL, 0,29720, 0);2973if (r)2974return r;29752976/* used by VS/TCS */2977if (ctx->tess_input_info) {2978/* fetch tcs input values into resv space */2979memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));2980vtx.op = FETCH_OP_VFETCH;2981vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;2982vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;2983vtx.mega_fetch_count = 16;2984vtx.data_format = FMT_32_32_32_32;2985vtx.num_format_all = 2;2986vtx.format_comp_all = 1;2987vtx.use_const_fields = 0;2988vtx.endian = r600_endian_swap(32);2989vtx.srf_mode_all = 1;2990vtx.offset = 0;2991vtx.dst_gpr = ctx->tess_input_info;2992vtx.dst_sel_x = 0;2993vtx.dst_sel_y = 1;2994vtx.dst_sel_z = 2;2995vtx.dst_sel_w = 3;2996vtx.src_gpr = temp_val;2997vtx.src_sel_x = 0;29982999r = r600_bytecode_add_vtx(ctx->bc, &vtx);3000if (r)3001return r;3002}30033004/* used by TCS/TES */3005if (ctx->tess_output_info) {3006/* fetch tcs output values into resv space */3007memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));3008vtx.op = FETCH_OP_VFETCH;3009vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;3010vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;3011vtx.mega_fetch_count = 16;3012vtx.data_format = FMT_32_32_32_32;3013vtx.num_format_all = 2;3014vtx.format_comp_all = 1;3015vtx.use_const_fields = 0;3016vtx.endian = r600_endian_swap(32);3017vtx.srf_mode_all = 1;3018vtx.offset = 16;3019vtx.dst_gpr = ctx->tess_output_info;3020vtx.dst_sel_x = 0;3021vtx.dst_sel_y = 1;3022vtx.dst_sel_z = 2;3023vtx.dst_sel_w = 3;3024vtx.src_gpr = temp_val;3025vtx.src_sel_x = 0;30263027r = r600_bytecode_add_vtx(ctx->bc, &vtx);3028if (r)3029return r;3030}3031return 0;3032}30333034static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)3035{3036int j, r;3037int temp_reg;3038unsigned i;30393040/* fetch tcs input values into input_vals */3041ctx->tess_input_info = 
r600_get_temp(ctx);3042ctx->tess_output_info = 0;3043r = r600_fetch_tess_io_info(ctx);3044if (r)3045return r;30463047temp_reg = r600_get_temp(ctx);3048/* dst reg contains LDS address stride * idx */3049/* MUL vertexID, vertex_dw_stride */3050r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,3051temp_reg, 0,3052ctx->tess_input_info, 1,30530, 1); /* rel id in r0.y? */3054if (r)3055return r;30563057for (i = 0; i < ctx->shader->noutput; i++) {3058struct r600_bytecode_alu alu;3059int param = r600_get_lds_unique_index(ctx->shader->output[i].name,3060ctx->shader->output[i].sid);30613062if (param) {3063r = single_alu_op2(ctx, ALU_OP2_ADD_INT,3064temp_reg, 1,3065temp_reg, 0,3066V_SQ_ALU_SRC_LITERAL, param * 16);3067if (r)3068return r;3069}30703071r = single_alu_op2(ctx, ALU_OP2_ADD_INT,3072temp_reg, 2,3073temp_reg, param ? 1 : 0,3074V_SQ_ALU_SRC_LITERAL, 8);3075if (r)3076return r;307730783079for (j = 0; j < 2; j++) {3080int chan = (j == 1) ? 2 : (param ? 1 : 0);3081memset(&alu, 0, sizeof(struct r600_bytecode_alu));3082alu.op = LDS_OP3_LDS_WRITE_REL;3083alu.src[0].sel = temp_reg;3084alu.src[0].chan = chan;3085alu.src[1].sel = ctx->shader->output[i].gpr;3086alu.src[1].chan = j * 2;3087alu.src[2].sel = ctx->shader->output[i].gpr;3088alu.src[2].chan = (j * 2) + 1;3089alu.last = 1;3090alu.dst.chan = 0;3091alu.lds_idx = 1;3092alu.is_lds_idx_op = true;3093r = r600_bytecode_add_alu(ctx->bc, &alu);3094if (r)3095return r;3096}3097}3098return 0;3099}31003101static int r600_store_tcs_output(struct r600_shader_ctx *ctx)3102{3103struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;3104const struct tgsi_full_dst_register *dst = &inst->Dst[0];3105int i, r, lasti;3106int temp_reg = r600_get_temp(ctx);3107struct r600_bytecode_alu alu;3108unsigned write_mask = dst->Register.WriteMask;31093110if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)3111return 0;31123113r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? 
false : true);3114if (r)3115return r;31163117/* the base address is now in temp.x */3118r = r600_get_byte_address(ctx, temp_reg,3119&inst->Dst[0], NULL, ctx->tess_output_info, 1);3120if (r)3121return r;31223123/* LDS write */3124lasti = tgsi_last_instruction(write_mask);3125for (i = 1; i <= lasti; i++) {31263127if (!(write_mask & (1 << i)))3128continue;3129r = single_alu_op2(ctx, ALU_OP2_ADD_INT,3130temp_reg, i,3131temp_reg, 0,3132V_SQ_ALU_SRC_LITERAL, 4 * i);3133if (r)3134return r;3135}31363137for (i = 0; i <= lasti; i++) {3138if (!(write_mask & (1 << i)))3139continue;31403141if ((i == 0 && ((write_mask & 3) == 3)) ||3142(i == 2 && ((write_mask & 0xc) == 0xc))) {3143memset(&alu, 0, sizeof(struct r600_bytecode_alu));3144alu.op = LDS_OP3_LDS_WRITE_REL;3145alu.src[0].sel = temp_reg;3146alu.src[0].chan = i;31473148alu.src[1].sel = dst->Register.Index;3149alu.src[1].sel += ctx->file_offset[dst->Register.File];3150alu.src[1].chan = i;31513152alu.src[2].sel = dst->Register.Index;3153alu.src[2].sel += ctx->file_offset[dst->Register.File];3154alu.src[2].chan = i + 1;3155alu.lds_idx = 1;3156alu.dst.chan = 0;3157alu.last = 1;3158alu.is_lds_idx_op = true;3159r = r600_bytecode_add_alu(ctx->bc, &alu);3160if (r)3161return r;3162i += 1;3163continue;3164}3165memset(&alu, 0, sizeof(struct r600_bytecode_alu));3166alu.op = LDS_OP2_LDS_WRITE;3167alu.src[0].sel = temp_reg;3168alu.src[0].chan = i;31693170alu.src[1].sel = dst->Register.Index;3171alu.src[1].sel += ctx->file_offset[dst->Register.File];3172alu.src[1].chan = i;31733174alu.src[2].sel = V_SQ_ALU_SRC_0;3175alu.dst.chan = 0;3176alu.last = 1;3177alu.is_lds_idx_op = true;3178r = r600_bytecode_add_alu(ctx->bc, &alu);3179if (r)3180return r;3181}3182return 0;3183}31843185static int r600_tess_factor_read(struct r600_shader_ctx *ctx,3186int output_idx, int nc)3187{3188int param;3189unsigned temp_reg = r600_get_temp(ctx);3190unsigned name = ctx->shader->output[output_idx].name;3191int dreg = ctx->shader->output[output_idx].gpr;3192int 
r;31933194param = r600_get_lds_unique_index(name, 0);3195r = get_lds_offset0(ctx, 1, temp_reg, true);3196if (r)3197return r;31983199if (param) {3200r = single_alu_op2(ctx, ALU_OP2_ADD_INT,3201temp_reg, 0,3202temp_reg, 0,3203V_SQ_ALU_SRC_LITERAL, param * 16);3204if (r)3205return r;3206}32073208do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));3209return 0;3210}32113212static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)3213{3214int stride, outer_comps, inner_comps;3215int tessinner_idx = -1, tessouter_idx = -1;3216int i, r;3217unsigned j;3218int temp_reg = r600_get_temp(ctx);3219int treg[3] = {-1, -1, -1};3220struct r600_bytecode_alu alu;3221struct r600_bytecode_cf *cf_jump, *cf_pop;32223223/* only execute factor emission for invocation 0 */3224/* PRED_SETE_INT __, R0.x, 0 */3225memset(&alu, 0, sizeof(alu));3226alu.op = ALU_OP2_PRED_SETE_INT;3227alu.src[0].chan = 2;3228alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;3229alu.execute_mask = 1;3230alu.update_pred = 1;3231alu.last = 1;3232r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);32333234r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);3235cf_jump = ctx->bc->cf_last;32363237treg[0] = r600_get_temp(ctx);3238switch (ctx->shader->tcs_prim_mode) {3239case PIPE_PRIM_LINES:3240stride = 8; /* 2 dwords, 1 vec2 store */3241outer_comps = 2;3242inner_comps = 0;3243break;3244case PIPE_PRIM_TRIANGLES:3245stride = 16; /* 4 dwords, 1 vec4 store */3246outer_comps = 3;3247inner_comps = 1;3248treg[1] = r600_get_temp(ctx);3249break;3250case PIPE_PRIM_QUADS:3251stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */3252outer_comps = 4;3253inner_comps = 2;3254treg[1] = r600_get_temp(ctx);3255treg[2] = r600_get_temp(ctx);3256break;3257default:3258assert(0);3259return -1;3260}32613262/* R0 is InvocationID, RelPatchID, PatchID, tf_base */3263/* TF_WRITE takes index in R.x, value in R.y */3264for (j = 0; j < ctx->shader->noutput; j++) {3265if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)3266tessinner_idx = 
j;3267if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)3268tessouter_idx = j;3269}32703271if (tessouter_idx == -1)3272return -1;32733274if (tessinner_idx == -1 && inner_comps)3275return -1;32763277if (tessouter_idx != -1) {3278r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);3279if (r)3280return r;3281}32823283if (tessinner_idx != -1) {3284r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);3285if (r)3286return r;3287}32883289/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */3290/* r.x = relpatchid(r0.y) * tf_stride */32913292/* multiply incoming r0.y * stride - t.x = r0.y * stride */3293/* add incoming r0.w to it: t.x = t.x + r0.w */3294r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,3295temp_reg, 0,32960, 1,3297V_SQ_ALU_SRC_LITERAL, stride,32980, 3);3299if (r)3300return r;33013302for (i = 0; i < outer_comps + inner_comps; i++) {3303int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;3304int out_comp = i >= outer_comps ? i - outer_comps : i;33053306if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {3307if (out_comp == 1)3308out_comp = 0;3309else if (out_comp == 0)3310out_comp = 1;3311}33123313r = single_alu_op2(ctx, ALU_OP2_ADD_INT,3314treg[i / 2], (2 * (i % 2)),3315temp_reg, 0,3316V_SQ_ALU_SRC_LITERAL, 4 * i);3317if (r)3318return r;3319r = single_alu_op2(ctx, ALU_OP1_MOV,3320treg[i / 2], 1 + (2 * (i%2)),3321ctx->shader->output[out_idx].gpr, out_comp,33220, 0);3323if (r)3324return r;3325}3326for (i = 0; i < outer_comps + inner_comps; i++) {3327struct r600_bytecode_gds gds;33283329memset(&gds, 0, sizeof(struct r600_bytecode_gds));3330gds.src_gpr = treg[i / 2];3331gds.src_sel_x = 2 * (i % 2);3332gds.src_sel_y = 1 + (2 * (i % 2));3333gds.src_sel_z = 4;3334gds.dst_sel_x = 7;3335gds.dst_sel_y = 7;3336gds.dst_sel_z = 7;3337gds.dst_sel_w = 7;3338gds.op = FETCH_OP_TF_WRITE;3339r = r600_bytecode_add_gds(ctx->bc, &gds);3340if (r)3341return r;3342}33433344// Patch up jump label3345r600_bytecode_add_cfinst(ctx->bc, 
CF_OP_POP);3346cf_pop = ctx->bc->cf_last;33473348cf_jump->cf_addr = cf_pop->id + 2;3349cf_jump->pop_count = 1;3350cf_pop->cf_addr = cf_pop->id + 2;3351cf_pop->pop_count = 1;33523353return 0;3354}33553356/*3357* We have to work out the thread ID for load and atomic3358* operations, which store the returned value to an index3359* in an intermediate buffer.3360* The index is calculated by taking the thread id,3361* calculated from the MBCNT instructions.3362* Then the shader engine ID is multiplied by 256,3363* and the wave id is added.3364* Then the result is multipled by 64 and thread id is3365* added.3366*/3367static int load_thread_id_gpr(struct r600_shader_ctx *ctx)3368{3369struct r600_bytecode_alu alu;3370int r;33713372memset(&alu, 0, sizeof(struct r600_bytecode_alu));3373alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;3374alu.dst.sel = ctx->temp_reg;3375alu.dst.chan = 0;3376alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;3377alu.src[0].value = 0xffffffff;3378alu.dst.write = 1;3379r = r600_bytecode_add_alu(ctx->bc, &alu);3380if (r)3381return r;33823383memset(&alu, 0, sizeof(struct r600_bytecode_alu));3384alu.op = ALU_OP1_MBCNT_32HI_INT;3385alu.dst.sel = ctx->temp_reg;3386alu.dst.chan = 1;3387alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;3388alu.src[0].value = 0xffffffff;3389alu.dst.write = 1;3390r = r600_bytecode_add_alu(ctx->bc, &alu);3391if (r)3392return r;33933394memset(&alu, 0, sizeof(struct r600_bytecode_alu));3395alu.op = ALU_OP3_MULADD_UINT24;3396alu.dst.sel = ctx->temp_reg;3397alu.dst.chan = 2;3398alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;3399alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;3400alu.src[1].value = 256;3401alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;3402alu.dst.write = 1;3403alu.is_op3 = 1;3404alu.last = 1;3405r = r600_bytecode_add_alu(ctx->bc, &alu);3406if (r)3407return r;34083409r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,3410ctx->thread_id_gpr, 1,3411ctx->temp_reg, 2,3412V_SQ_ALU_SRC_LITERAL, 0x40,3413ctx->temp_reg, 0);3414if (r)3415return r;3416return 
0;3417}34183419static int r600_shader_from_tgsi(struct r600_context *rctx,3420struct r600_pipe_shader *pipeshader,3421union r600_shader_key key)3422{3423struct r600_screen *rscreen = rctx->screen;3424struct r600_shader *shader = &pipeshader->shader;3425struct tgsi_token *tokens = pipeshader->selector->tokens;3426struct pipe_stream_output_info so = pipeshader->selector->so;3427struct tgsi_full_immediate *immediate;3428struct r600_shader_ctx ctx;3429struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];3430unsigned output_done, noutput;3431unsigned opcode;3432int j, k, r = 0;3433unsigned i;3434int next_param_base = 0, next_clip_base;3435int max_color_exports = MAX2(key.ps.nr_cbufs, 1);3436bool indirect_gprs;3437bool ring_outputs = false;3438bool lds_outputs = false;3439bool lds_inputs = false;3440bool pos_emitted = false;34413442ctx.bc = &shader->bc;3443ctx.shader = shader;34443445r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,3446rscreen->has_compressed_msaa_texturing);3447ctx.tokens = tokens;3448tgsi_scan_shader(tokens, &ctx.info);3449shader->indirect_files = ctx.info.indirect_files;34503451int narrays = ctx.info.array_max[TGSI_FILE_TEMPORARY];3452ctx.array_infos = calloc(narrays, sizeof(*ctx.array_infos));3453ctx.spilled_arrays = calloc(narrays, sizeof(bool));3454tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, narrays, ctx.array_infos);34553456shader->uses_helper_invocation = false;3457shader->uses_doubles = ctx.info.uses_doubles;3458shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];3459shader->nsys_inputs = 0;34603461shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||3462ctx.info.file_count[TGSI_FILE_BUFFER] > 0;3463indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));3464tgsi_parse_init(&ctx.parse, tokens);3465ctx.type = ctx.info.processor;3466shader->processor_type = ctx.type;3467ctx.bc->type = shader->processor_type;34683469switch (ctx.type) {3470case 
PIPE_SHADER_VERTEX:3471shader->vs_as_gs_a = key.vs.as_gs_a;3472shader->vs_as_es = key.vs.as_es;3473shader->vs_as_ls = key.vs.as_ls;3474shader->atomic_base = key.vs.first_atomic_counter;3475if (shader->vs_as_es)3476ring_outputs = true;3477if (shader->vs_as_ls)3478lds_outputs = true;3479break;3480case PIPE_SHADER_GEOMETRY:3481ring_outputs = true;3482shader->atomic_base = key.gs.first_atomic_counter;3483shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;3484break;3485case PIPE_SHADER_TESS_CTRL:3486shader->tcs_prim_mode = key.tcs.prim_mode;3487shader->atomic_base = key.tcs.first_atomic_counter;3488lds_outputs = true;3489lds_inputs = true;3490break;3491case PIPE_SHADER_TESS_EVAL:3492shader->tes_as_es = key.tes.as_es;3493shader->atomic_base = key.tes.first_atomic_counter;3494lds_inputs = true;3495if (shader->tes_as_es)3496ring_outputs = true;3497break;3498case PIPE_SHADER_FRAGMENT:3499shader->two_side = key.ps.color_two_side;3500shader->atomic_base = key.ps.first_atomic_counter;3501shader->rat_base = key.ps.nr_cbufs;3502shader->image_size_const_offset = key.ps.image_size_const_offset;3503break;3504case PIPE_SHADER_COMPUTE:3505shader->rat_base = 0;3506shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER];3507break;3508default:3509break;3510}35113512if (shader->vs_as_es || shader->tes_as_es) {3513ctx.gs_for_vs = &rctx->gs_shader->current->shader;3514} else {3515ctx.gs_for_vs = NULL;3516}35173518ctx.next_ring_offset = 0;3519ctx.gs_out_ring_offset = 0;3520ctx.gs_next_vertex = 0;3521ctx.gs_stream_output_info = &so;35223523ctx.thread_id_gpr = -1;3524ctx.face_gpr = -1;3525ctx.fixed_pt_position_gpr = -1;3526ctx.fragcoord_input = -1;3527ctx.colors_used = 0;3528ctx.clip_vertex_write = 0;35293530ctx.helper_invoc_reg = -1;3531ctx.cs_block_size_reg = -1;3532ctx.cs_grid_size_reg = -1;3533ctx.cs_block_size_loaded = false;3534ctx.cs_grid_size_loaded = false;35353536shader->nr_ps_color_exports = 0;3537shader->nr_ps_max_color_exports = 0;353835393540/* register 
allocations */3541/* Values [0,127] correspond to GPR[0..127].3542* Values [128,159] correspond to constant buffer bank 03543* Values [160,191] correspond to constant buffer bank 13544* Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)3545* Values [256,287] correspond to constant buffer bank 2 (EG)3546* Values [288,319] correspond to constant buffer bank 3 (EG)3547* Other special values are shown in the list below.3548* 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)3549* 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)3550* 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)3551* 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)3552* 248 SQ_ALU_SRC_0: special constant 0.0.3553* 249 SQ_ALU_SRC_1: special constant 1.0 float.3554* 250 SQ_ALU_SRC_1_INT: special constant 1 integer.3555* 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.3556* 252 SQ_ALU_SRC_0_5: special constant 0.5 float.3557* 253 SQ_ALU_SRC_LITERAL: literal constant.3558* 254 SQ_ALU_SRC_PV: previous vector result.3559* 255 SQ_ALU_SRC_PS: previous scalar result.3560*/3561for (i = 0; i < TGSI_FILE_COUNT; i++) {3562ctx.file_offset[i] = 0;3563}35643565if (ctx.type == PIPE_SHADER_VERTEX) {35663567ctx.file_offset[TGSI_FILE_INPUT] = 1;3568if (ctx.info.num_inputs)3569r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);3570}3571if (ctx.type == PIPE_SHADER_FRAGMENT) {3572if (ctx.bc->chip_class >= EVERGREEN)3573ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);3574else3575ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);35763577for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {3578if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) {3579ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++;3580shader->uses_helper_invocation = true;3581}3582}3583}3584if (ctx.type == PIPE_SHADER_GEOMETRY) {3585/* FIXME 1 would be enough 
in some cases (3 or less input vertices) */3586ctx.file_offset[TGSI_FILE_INPUT] = 2;3587}3588if (ctx.type == PIPE_SHADER_TESS_CTRL)3589ctx.file_offset[TGSI_FILE_INPUT] = 1;3590if (ctx.type == PIPE_SHADER_TESS_EVAL) {3591bool add_tesscoord = false, add_tess_inout = false;3592ctx.file_offset[TGSI_FILE_INPUT] = 1;3593for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {3594/* if we have tesscoord save one reg */3595if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)3596add_tesscoord = true;3597if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||3598ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)3599add_tess_inout = true;3600}3601if (add_tesscoord || add_tess_inout)3602ctx.file_offset[TGSI_FILE_INPUT]++;3603if (add_tess_inout)3604ctx.file_offset[TGSI_FILE_INPUT]+=2;3605}3606if (ctx.type == PIPE_SHADER_COMPUTE) {3607ctx.file_offset[TGSI_FILE_INPUT] = 2;3608for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {3609if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)3610ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;3611if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)3612ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;3613}3614}36153616ctx.file_offset[TGSI_FILE_OUTPUT] =3617ctx.file_offset[TGSI_FILE_INPUT] +3618ctx.info.file_max[TGSI_FILE_INPUT] + 1;3619ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +3620ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;36213622/* Outside the GPR range. This will be translated to one of the3623* kcache banks later. 
*/3624ctx.file_offset[TGSI_FILE_CONSTANT] = 512;3625ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;36263627pipeshader->scratch_space_needed = 0;3628int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] +3629ctx.info.file_max[TGSI_FILE_TEMPORARY];3630if (regno > 124) {3631choose_spill_arrays(&ctx, ®no, &pipeshader->scratch_space_needed);3632shader->indirect_files = ctx.info.indirect_files;3633}3634shader->needs_scratch_space = pipeshader->scratch_space_needed != 0;36353636ctx.bc->ar_reg = ++regno;3637ctx.bc->index_reg[0] = ++regno;3638ctx.bc->index_reg[1] = ++regno;36393640if (ctx.type == PIPE_SHADER_TESS_CTRL) {3641ctx.tess_input_info = ++regno;3642ctx.tess_output_info = ++regno;3643} else if (ctx.type == PIPE_SHADER_TESS_EVAL) {3644ctx.tess_input_info = ++regno;3645ctx.tess_output_info = ++regno;3646} else if (ctx.type == PIPE_SHADER_GEOMETRY) {3647ctx.gs_export_gpr_tregs[0] = ++regno;3648ctx.gs_export_gpr_tregs[1] = ++regno;3649ctx.gs_export_gpr_tregs[2] = ++regno;3650ctx.gs_export_gpr_tregs[3] = ++regno;3651if (ctx.shader->gs_tri_strip_adj_fix) {3652ctx.gs_rotated_input[0] = ++regno;3653ctx.gs_rotated_input[1] = ++regno;3654} else {3655ctx.gs_rotated_input[0] = 0;3656ctx.gs_rotated_input[1] = 1;3657}3658}36593660if (shader->uses_images) {3661ctx.thread_id_gpr = ++regno;3662}3663ctx.temp_reg = ++regno;36643665shader->max_arrays = 0;3666shader->num_arrays = 0;3667if (indirect_gprs) {36683669if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {3670r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],3671ctx.file_offset[TGSI_FILE_OUTPUT] -3672ctx.file_offset[TGSI_FILE_INPUT],36730x0F);3674}3675if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {3676r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],3677ctx.file_offset[TGSI_FILE_TEMPORARY] -3678ctx.file_offset[TGSI_FILE_OUTPUT],36790x0F);3680}3681}36823683ctx.nliterals = 0;3684ctx.literals = NULL;3685ctx.max_driver_temp_used = 0;36863687shader->fs_write_all = 
ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&3688ctx.info.colors_written == 1;3689shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];3690shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];36913692if (ctx.type == PIPE_SHADER_VERTEX ||3693ctx.type == PIPE_SHADER_GEOMETRY ||3694ctx.type == PIPE_SHADER_TESS_EVAL) {3695shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +3696ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;3697shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;3698shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];3699}37003701if (shader->vs_as_gs_a)3702vs_add_primid_output(&ctx, key.vs.prim_id_out);37033704if (ctx.thread_id_gpr != -1) {3705r = load_thread_id_gpr(&ctx);3706if (r)3707return r;3708}37093710if (ctx.type == PIPE_SHADER_TESS_EVAL)3711r600_fetch_tess_io_info(&ctx);37123713while (!tgsi_parse_end_of_tokens(&ctx.parse)) {3714tgsi_parse_token(&ctx.parse);3715switch (ctx.parse.FullToken.Token.Type) {3716case TGSI_TOKEN_TYPE_IMMEDIATE:3717immediate = &ctx.parse.FullToken.FullImmediate;3718ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);3719if(ctx.literals == NULL) {3720r = -ENOMEM;3721goto out_err;3722}3723ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;3724ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;3725ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;3726ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;3727ctx.nliterals++;3728break;3729case TGSI_TOKEN_TYPE_DECLARATION:3730r = tgsi_declaration(&ctx);3731if (r)3732goto out_err;3733break;3734case TGSI_TOKEN_TYPE_INSTRUCTION:3735case TGSI_TOKEN_TYPE_PROPERTY:3736break;3737default:3738R600_ERR("unsupported token type %d\n", 
ctx.parse.FullToken.Token.Type);3739r = -EINVAL;3740goto out_err;3741}3742}37433744shader->ring_item_sizes[0] = ctx.next_ring_offset;3745shader->ring_item_sizes[1] = 0;3746shader->ring_item_sizes[2] = 0;3747shader->ring_item_sizes[3] = 0;37483749/* Process two side if needed */3750if (shader->two_side && ctx.colors_used) {3751int i, count = ctx.shader->ninput;3752unsigned next_lds_loc = ctx.shader->nlds;37533754/* additional inputs will be allocated right after the existing inputs,3755* we won't need them after the color selection, so we don't need to3756* reserve these gprs for the rest of the shader code and to adjust3757* output offsets etc. */3758int gpr = ctx.file_offset[TGSI_FILE_INPUT] +3759ctx.info.file_max[TGSI_FILE_INPUT] + 1;37603761/* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */3762if (ctx.face_gpr == -1) {3763i = ctx.shader->ninput++;3764ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;3765ctx.shader->input[i].spi_sid = 0;3766ctx.shader->input[i].gpr = gpr++;3767ctx.face_gpr = ctx.shader->input[i].gpr;3768}37693770for (i = 0; i < count; i++) {3771if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {3772int ni = ctx.shader->ninput++;3773memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));3774ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;3775ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);3776ctx.shader->input[ni].gpr = gpr++;3777// TGSI to LLVM needs to know the lds position of inputs.3778// Non LLVM path computes it later (in process_twoside_color)3779ctx.shader->input[ni].lds_pos = next_lds_loc++;3780ctx.shader->input[i].back_color_input = ni;3781if (ctx.bc->chip_class >= EVERGREEN) {3782if ((r = evergreen_interp_input(&ctx, ni)))3783return r;3784}3785}3786}3787}37883789if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)3790shader->nr_ps_max_color_exports = 8;37913792if (ctx.shader->uses_helper_invocation) {3793if (ctx.bc->chip_class 
== CAYMAN)3794r = cm_load_helper_invocation(&ctx);3795else3796r = eg_load_helper_invocation(&ctx);3797if (r)3798return r;3799}38003801/*3802* XXX this relies on fixed_pt_position_gpr only being present when3803* this shader should be executed per sample. Should be the case for now...3804*/3805if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) {3806/*3807* Fix up sample mask. The hw always gives us coverage mask for3808* the pixel. However, for per-sample shading, we need the3809* coverage for the shader invocation only.3810* Also, with disabled msaa, only the first bit should be set3811* (luckily the same fixup works for both problems).3812* For now, we can only do it if we know this shader is always3813* executed per sample (due to usage of bits in the shader3814* forcing per-sample execution).3815* If the fb is not multisampled, we'd do unnecessary work but3816* it should still be correct.3817* It will however do nothing for sample shading according3818* to MinSampleShading.3819*/3820struct r600_bytecode_alu alu;3821int tmp = r600_get_temp(&ctx);3822assert(ctx.face_gpr != -1);3823memset(&alu, 0, sizeof(struct r600_bytecode_alu));38243825alu.op = ALU_OP2_LSHL_INT;3826alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;3827alu.src[0].value = 0x1;3828alu.src[1].sel = ctx.fixed_pt_position_gpr;3829alu.src[1].chan = 3;3830alu.dst.sel = tmp;3831alu.dst.chan = 0;3832alu.dst.write = 1;3833alu.last = 1;3834if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))3835return r;38363837memset(&alu, 0, sizeof(struct r600_bytecode_alu));3838alu.op = ALU_OP2_AND_INT;3839alu.src[0].sel = tmp;3840alu.src[1].sel = ctx.face_gpr;3841alu.src[1].chan = 2;3842alu.dst.sel = ctx.face_gpr;3843alu.dst.chan = 2;3844alu.dst.write = 1;3845alu.last = 1;3846if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))3847return r;3848}38493850if (ctx.fragcoord_input >= 0) {3851if (ctx.bc->chip_class == CAYMAN) {3852for (j = 0 ; j < 4; j++) {3853struct r600_bytecode_alu alu;3854memset(&alu, 0, sizeof(struct 
r600_bytecode_alu));3855alu.op = ALU_OP1_RECIP_IEEE;3856alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;3857alu.src[0].chan = 3;38583859alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;3860alu.dst.chan = j;3861alu.dst.write = (j == 3);3862alu.last = (j == 3);3863if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))3864return r;3865}3866} else {3867struct r600_bytecode_alu alu;3868memset(&alu, 0, sizeof(struct r600_bytecode_alu));3869alu.op = ALU_OP1_RECIP_IEEE;3870alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;3871alu.src[0].chan = 3;38723873alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;3874alu.dst.chan = 3;3875alu.dst.write = 1;3876alu.last = 1;3877if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))3878return r;3879}3880}38813882if (ctx.type == PIPE_SHADER_GEOMETRY) {3883struct r600_bytecode_alu alu;3884int r;38853886/* GS thread with no output workaround - emit a cut at start of GS */3887if (ctx.bc->chip_class == R600)3888r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);38893890for (j = 0; j < 4; j++) {3891memset(&alu, 0, sizeof(struct r600_bytecode_alu));3892alu.op = ALU_OP1_MOV;3893alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;3894alu.src[0].value = 0;3895alu.dst.sel = ctx.gs_export_gpr_tregs[j];3896alu.dst.write = 1;3897alu.last = 1;3898r = r600_bytecode_add_alu(ctx.bc, &alu);3899if (r)3900return r;3901}39023903if (ctx.shader->gs_tri_strip_adj_fix) {3904r = single_alu_op2(&ctx, ALU_OP2_AND_INT,3905ctx.gs_rotated_input[0], 2,39060, 2,3907V_SQ_ALU_SRC_LITERAL, 1);3908if (r)3909return r;39103911for (i = 0; i < 6; i++) {3912int rotated = (i + 4) % 6;3913int offset_reg = i / 3;3914int offset_chan = i % 3;3915int rotated_offset_reg = rotated / 3;3916int rotated_offset_chan = rotated % 3;39173918if (offset_reg == 0 && offset_chan == 2)3919offset_chan = 3;3920if (rotated_offset_reg == 0 && rotated_offset_chan == 2)3921rotated_offset_chan = 3;39223923r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,3924ctx.gs_rotated_input[offset_reg], 
offset_chan,3925ctx.gs_rotated_input[0], 2,3926offset_reg, offset_chan,3927rotated_offset_reg, rotated_offset_chan);3928if (r)3929return r;3930}3931}3932}39333934if (ctx.type == PIPE_SHADER_TESS_CTRL)3935r600_fetch_tess_io_info(&ctx);39363937if (shader->two_side && ctx.colors_used) {3938if ((r = process_twoside_color_inputs(&ctx)))3939return r;3940}39413942tgsi_parse_init(&ctx.parse, tokens);3943while (!tgsi_parse_end_of_tokens(&ctx.parse)) {3944tgsi_parse_token(&ctx.parse);3945switch (ctx.parse.FullToken.Token.Type) {3946case TGSI_TOKEN_TYPE_INSTRUCTION:3947r = tgsi_is_supported(&ctx);3948if (r)3949goto out_err;3950ctx.max_driver_temp_used = 0;3951/* reserve first tmp for everyone */3952r600_get_temp(&ctx);39533954opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;3955if ((r = tgsi_split_constant(&ctx)))3956goto out_err;3957if ((r = tgsi_split_literal_constant(&ctx)))3958goto out_err;3959if (ctx.type == PIPE_SHADER_GEOMETRY) {3960if ((r = tgsi_split_gs_inputs(&ctx)))3961goto out_err;3962} else if (lds_inputs) {3963if ((r = tgsi_split_lds_inputs(&ctx)))3964goto out_err;3965}3966if (ctx.bc->chip_class == CAYMAN)3967ctx.inst_info = &cm_shader_tgsi_instruction[opcode];3968else if (ctx.bc->chip_class >= EVERGREEN)3969ctx.inst_info = &eg_shader_tgsi_instruction[opcode];3970else3971ctx.inst_info = &r600_shader_tgsi_instruction[opcode];39723973ctx.bc->precise |= ctx.parse.FullToken.FullInstruction.Instruction.Precise;39743975r = ctx.inst_info->process(&ctx);3976if (r)3977goto out_err;39783979if (ctx.type == PIPE_SHADER_TESS_CTRL) {3980r = r600_store_tcs_output(&ctx);3981if (r)3982goto out_err;3983}3984break;3985default:3986break;3987}3988}39893990/* Reset the temporary register counter. 
*/3991ctx.max_driver_temp_used = 0;39923993noutput = shader->noutput;39943995if (!ring_outputs && ctx.clip_vertex_write) {3996unsigned clipdist_temp[2];39973998clipdist_temp[0] = r600_get_temp(&ctx);3999clipdist_temp[1] = r600_get_temp(&ctx);40004001/* need to convert a clipvertex write into clipdistance writes and not export4002the clip vertex anymore */40034004memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));4005shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;4006shader->output[noutput].gpr = clipdist_temp[0];4007noutput++;4008shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;4009shader->output[noutput].gpr = clipdist_temp[1];4010noutput++;40114012/* reset spi_sid for clipvertex output to avoid confusing spi */4013shader->output[ctx.cv_output].spi_sid = 0;40144015shader->clip_dist_write = 0xFF;4016shader->cc_dist_mask = 0xFF;40174018for (i = 0; i < 8; i++) {4019int oreg = i >> 2;4020int ochan = i & 3;40214022for (j = 0; j < 4; j++) {4023struct r600_bytecode_alu alu;4024memset(&alu, 0, sizeof(struct r600_bytecode_alu));4025alu.op = ALU_OP2_DOT4;4026alu.src[0].sel = shader->output[ctx.cv_output].gpr;4027alu.src[0].chan = j;40284029alu.src[1].sel = 512 + i;4030alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;4031alu.src[1].chan = j;40324033alu.dst.sel = clipdist_temp[oreg];4034alu.dst.chan = j;4035alu.dst.write = (j == ochan);4036if (j == 3)4037alu.last = 1;4038r = r600_bytecode_add_alu(ctx.bc, &alu);4039if (r)4040return r;4041}4042}4043}40444045/* Add stream outputs. 
*/4046if (so.num_outputs) {4047bool emit = false;4048if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)4049emit = true;4050if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)4051emit = true;4052if (emit)4053emit_streamout(&ctx, &so, -1, NULL);4054}4055pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;4056convert_edgeflag_to_int(&ctx);40574058if (ctx.type == PIPE_SHADER_TESS_CTRL)4059r600_emit_tess_factor(&ctx);40604061if (lds_outputs) {4062if (ctx.type == PIPE_SHADER_VERTEX) {4063if (ctx.shader->noutput)4064emit_lds_vs_writes(&ctx);4065}4066} else if (ring_outputs) {4067if (shader->vs_as_es || shader->tes_as_es) {4068ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);4069ctx.gs_export_gpr_tregs[1] = -1;4070ctx.gs_export_gpr_tregs[2] = -1;4071ctx.gs_export_gpr_tregs[3] = -1;40724073emit_gs_ring_writes(&ctx, &so, -1, FALSE);4074}4075} else {4076/* Export output */4077next_clip_base = shader->vs_out_misc_write ? 62 : 61;40784079for (i = 0, j = 0; i < noutput; i++, j++) {4080memset(&output[j], 0, sizeof(struct r600_bytecode_output));4081output[j].gpr = shader->output[i].gpr;4082output[j].elem_size = 3;4083output[j].swizzle_x = 0;4084output[j].swizzle_y = 1;4085output[j].swizzle_z = 2;4086output[j].swizzle_w = 3;4087output[j].burst_count = 1;4088output[j].type = 0xffffffff;4089output[j].op = CF_OP_EXPORT;4090switch (ctx.type) {4091case PIPE_SHADER_VERTEX:4092case PIPE_SHADER_TESS_EVAL:4093switch (shader->output[i].name) {4094case TGSI_SEMANTIC_POSITION:4095output[j].array_base = 60;4096output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;4097pos_emitted = true;4098break;40994100case TGSI_SEMANTIC_PSIZE:4101output[j].array_base = 61;4102output[j].swizzle_y = 7;4103output[j].swizzle_z = 7;4104output[j].swizzle_w = 7;4105output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;4106pos_emitted = true;4107break;4108case TGSI_SEMANTIC_EDGEFLAG:4109output[j].array_base = 61;4110output[j].swizzle_x = 7;4111output[j].swizzle_y = 
0;4112output[j].swizzle_z = 7;4113output[j].swizzle_w = 7;4114output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;4115pos_emitted = true;4116break;4117case TGSI_SEMANTIC_LAYER:4118/* spi_sid is 0 for outputs that are4119* not consumed by PS */4120if (shader->output[i].spi_sid) {4121output[j].array_base = next_param_base++;4122output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;4123j++;4124memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));4125}4126output[j].array_base = 61;4127output[j].swizzle_x = 7;4128output[j].swizzle_y = 7;4129output[j].swizzle_z = 0;4130output[j].swizzle_w = 7;4131output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;4132pos_emitted = true;4133break;4134case TGSI_SEMANTIC_VIEWPORT_INDEX:4135/* spi_sid is 0 for outputs that are4136* not consumed by PS */4137if (shader->output[i].spi_sid) {4138output[j].array_base = next_param_base++;4139output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;4140j++;4141memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));4142}4143output[j].array_base = 61;4144output[j].swizzle_x = 7;4145output[j].swizzle_y = 7;4146output[j].swizzle_z = 7;4147output[j].swizzle_w = 0;4148output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;4149pos_emitted = true;4150break;4151case TGSI_SEMANTIC_CLIPVERTEX:4152j--;4153break;4154case TGSI_SEMANTIC_CLIPDIST:4155output[j].array_base = next_clip_base++;4156output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;4157pos_emitted = true;4158/* spi_sid is 0 for clipdistance outputs that were generated4159* for clipvertex - we don't need to pass them to PS */4160if (shader->output[i].spi_sid) {4161j++;4162/* duplicate it as PARAM to pass to the pixel shader */4163memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));4164output[j].array_base = next_param_base++;4165output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;4166}4167break;4168case TGSI_SEMANTIC_FOG:4169output[j].swizzle_y = 4; /* 0 
*/4170output[j].swizzle_z = 4; /* 0 */4171output[j].swizzle_w = 5; /* 1 */4172break;4173case TGSI_SEMANTIC_PRIMID:4174output[j].swizzle_x = 2;4175output[j].swizzle_y = 4; /* 0 */4176output[j].swizzle_z = 4; /* 0 */4177output[j].swizzle_w = 4; /* 0 */4178break;4179}41804181break;4182case PIPE_SHADER_FRAGMENT:4183if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {4184/* never export more colors than the number of CBs */4185if (shader->output[i].sid >= max_color_exports) {4186/* skip export */4187j--;4188continue;4189}4190output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;4191output[j].array_base = shader->output[i].sid;4192output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;4193shader->nr_ps_color_exports++;4194shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4));41954196/* If the i-th target format is set, all previous target formats must4197* be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well.4198*/4199if (shader->output[i].sid > 0)4200for (unsigned x = 0; x < shader->output[i].sid; x++)4201shader->ps_color_export_mask |= (1 << (x*4));42024203if (shader->output[i].sid > shader->ps_export_highest)4204shader->ps_export_highest = shader->output[i].sid;4205if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {4206for (k = 1; k < max_color_exports; k++) {4207j++;4208memset(&output[j], 0, sizeof(struct r600_bytecode_output));4209output[j].gpr = shader->output[i].gpr;4210output[j].elem_size = 3;4211output[j].swizzle_x = 0;4212output[j].swizzle_y = 1;4213output[j].swizzle_z = 2;4214output[j].swizzle_w = key.ps.alpha_to_one ? 
5 : 3;4215output[j].burst_count = 1;4216output[j].array_base = k;4217output[j].op = CF_OP_EXPORT;4218output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;4219shader->nr_ps_color_exports++;4220if (k > shader->ps_export_highest)4221shader->ps_export_highest = k;4222shader->ps_color_export_mask |= (0xf << (j * 4));4223}4224}4225} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {4226output[j].array_base = 61;4227output[j].swizzle_x = 2;4228output[j].swizzle_y = 7;4229output[j].swizzle_z = output[j].swizzle_w = 7;4230output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;4231} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {4232output[j].array_base = 61;4233output[j].swizzle_x = 7;4234output[j].swizzle_y = 1;4235output[j].swizzle_z = output[j].swizzle_w = 7;4236output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;4237} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {4238output[j].array_base = 61;4239output[j].swizzle_x = 7;4240output[j].swizzle_y = 7;4241output[j].swizzle_z = 0;4242output[j].swizzle_w = 7;4243output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;4244} else {4245R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);4246r = -EINVAL;4247goto out_err;4248}4249break;4250case PIPE_SHADER_TESS_CTRL:4251break;4252default:4253R600_ERR("unsupported processor type %d\n", ctx.type);4254r = -EINVAL;4255goto out_err;4256}42574258if (output[j].type == 0xffffffff) {4259output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;4260output[j].array_base = next_param_base++;4261}4262}42634264/* add fake position export */4265if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {4266memset(&output[j], 0, sizeof(struct r600_bytecode_output));4267output[j].gpr = 0;4268output[j].elem_size = 3;4269output[j].swizzle_x = 7;4270output[j].swizzle_y = 7;4271output[j].swizzle_z = 7;4272output[j].swizzle_w = 7;4273output[j].burst_count = 1;4274output[j].type 
= V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;4275output[j].array_base = 60;4276output[j].op = CF_OP_EXPORT;4277j++;4278}42794280/* add fake param output for vertex shader if no param is exported */4281if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {4282memset(&output[j], 0, sizeof(struct r600_bytecode_output));4283output[j].gpr = 0;4284output[j].elem_size = 3;4285output[j].swizzle_x = 7;4286output[j].swizzle_y = 7;4287output[j].swizzle_z = 7;4288output[j].swizzle_w = 7;4289output[j].burst_count = 1;4290output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;4291output[j].array_base = 0;4292output[j].op = CF_OP_EXPORT;4293j++;4294}42954296/* add fake pixel export */4297if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {4298memset(&output[j], 0, sizeof(struct r600_bytecode_output));4299output[j].gpr = 0;4300output[j].elem_size = 3;4301output[j].swizzle_x = 7;4302output[j].swizzle_y = 7;4303output[j].swizzle_z = 7;4304output[j].swizzle_w = 7;4305output[j].burst_count = 1;4306output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;4307output[j].array_base = 0;4308output[j].op = CF_OP_EXPORT;4309j++;4310shader->nr_ps_color_exports++;4311shader->ps_color_export_mask = 0xf;4312}43134314noutput = j;43154316/* set export done on last export of each type */4317for (k = noutput - 1, output_done = 0; k >= 0; k--) {4318if (!(output_done & (1 << output[k].type))) {4319output_done |= (1 << output[k].type);4320output[k].op = CF_OP_EXPORT_DONE;4321}4322}4323/* add output to bytecode */4324for (i = 0; i < noutput; i++) {4325r = r600_bytecode_add_output(ctx.bc, &output[i]);4326if (r)4327goto out_err;4328}4329}43304331/* add program end */4332if (ctx.bc->chip_class == CAYMAN)4333cm_bytecode_add_cf_end(ctx.bc);4334else {4335const struct cf_op_info *last = NULL;43364337if (ctx.bc->cf_last)4338last = r600_isa_cf(ctx.bc->cf_last->op);43394340/* alu clause instructions don't have EOP bit, so add NOP */4341if 
(!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)4342r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);43434344ctx.bc->cf_last->end_of_program = 1;4345}43464347/* check GPR limit - we have 124 = 128 - 44348* (4 are reserved as alu clause temporary registers) */4349if (ctx.bc->ngpr > 124) {4350R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);4351r = -ENOMEM;4352goto out_err;4353}43544355if (ctx.type == PIPE_SHADER_GEOMETRY) {4356if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))4357return r;4358}43594360free(ctx.spilled_arrays);4361free(ctx.array_infos);4362free(ctx.literals);4363tgsi_parse_free(&ctx.parse);4364return 0;4365out_err:4366free(ctx.spilled_arrays);4367free(ctx.array_infos);4368free(ctx.literals);4369tgsi_parse_free(&ctx.parse);4370return r;4371}43724373static int tgsi_unsupported(struct r600_shader_ctx *ctx)4374{4375const unsigned tgsi_opcode =4376ctx->parse.FullToken.FullInstruction.Instruction.Opcode;4377R600_ERR("%s tgsi opcode unsupported\n",4378tgsi_get_opcode_name(tgsi_opcode));4379return -EINVAL;4380}43814382static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)4383{4384return 0;4385}43864387static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,4388const struct r600_shader_src *shader_src,4389unsigned chan)4390{4391bc_src->sel = shader_src->sel;4392bc_src->chan = shader_src->swizzle[chan];4393bc_src->neg = shader_src->neg;4394bc_src->abs = shader_src->abs;4395bc_src->rel = shader_src->rel;4396bc_src->value = shader_src->value[bc_src->chan];4397bc_src->kc_bank = shader_src->kc_bank;4398bc_src->kc_rel = shader_src->kc_rel;4399}44004401static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)4402{4403bc_src->abs = 1;4404bc_src->neg = 0;4405}44064407static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)4408{4409bc_src->neg = !bc_src->neg;4410}44114412static void tgsi_dst(struct r600_shader_ctx *ctx,4413const 
struct tgsi_full_dst_register *tgsi_dst,4414unsigned swizzle,4415struct r600_bytecode_alu_dst *r600_dst)4416{4417struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;44184419if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) {4420bool spilled;4421unsigned idx;44224423idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_dst->Register.Index, &spilled);44244425if (spilled) {4426struct r600_bytecode_output cf;4427int reg = 0;4428int r;4429bool add_pending_output = true;44304431memset(&cf, 0, sizeof(struct r600_bytecode_output));4432get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index,4433&cf.array_base, &cf.array_size);44344435/* If no component has spilled, reserve a register and add the spill code4436* ctx->bc->n_pending_outputs is cleared after each instruction group */4437if (ctx->bc->n_pending_outputs == 0) {4438reg = r600_get_temp(ctx);4439} else {4440/* If we are already spilling and the output address is the same like4441* before then just reuse the same slot */4442struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1];4443if ((cf.array_base + idx == tmpl->array_base) ||4444(cf.array_base == tmpl->array_base &&4445tmpl->index_gpr == ctx->bc->ar_reg &&4446tgsi_dst->Register.Indirect)) {4447reg = ctx->bc->pending_outputs[0].gpr;4448add_pending_output = false;4449} else {4450reg = r600_get_temp(ctx);4451}4452}44534454r600_dst->sel = reg;4455r600_dst->chan = swizzle;4456r600_dst->write = 1;4457if (inst->Instruction.Saturate) {4458r600_dst->clamp = 1;4459}44604461/* Add new outputs as pending */4462if (add_pending_output) {4463cf.op = CF_OP_MEM_SCRATCH;4464cf.elem_size = 3;4465cf.gpr = reg;4466cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;4467cf.mark = 1;4468cf.comp_mask = inst->Dst[0].Register.WriteMask;4469cf.swizzle_x = 0;4470cf.swizzle_y = 1;4471cf.swizzle_z = 2;4472cf.swizzle_w = 3;4473cf.burst_count = 1;44744475if (tgsi_dst->Register.Indirect) {4476if (ctx->bc->chip_class < R700)4477cf.type 
= V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;4478else4479cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;4480cf.index_gpr = ctx->bc->ar_reg;4481}4482else {4483cf.array_base += idx;4484cf.array_size = 0;4485}44864487r = r600_bytecode_add_pending_output(ctx->bc, &cf);4488if (r)4489return;44904491if (ctx->bc->chip_class >= R700)4492r600_bytecode_need_wait_ack(ctx->bc, true);4493}4494return;4495}4496else {4497r600_dst->sel = idx;4498}4499}4500else {4501r600_dst->sel = tgsi_dst->Register.Index;4502r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];4503}4504r600_dst->chan = swizzle;4505r600_dst->write = 1;4506if (inst->Instruction.Saturate) {4507r600_dst->clamp = 1;4508}4509if (ctx->type == PIPE_SHADER_TESS_CTRL) {4510if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {4511return;4512}4513}4514if (tgsi_dst->Register.Indirect)4515r600_dst->rel = V_SQ_REL_RELATIVE;45164517}45184519static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)4520{4521struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;4522unsigned write_mask = inst->Dst[0].Register.WriteMask;4523struct r600_bytecode_alu alu;4524int i, j, r, lasti = tgsi_last_instruction(write_mask);4525int use_tmp = 0;4526int swizzle_x = inst->Src[0].Register.SwizzleX;45274528if (singledest) {4529switch (write_mask) {4530case 0x1:4531if (swizzle_x == 2) {4532write_mask = 0xc;4533use_tmp = 3;4534} else4535write_mask = 0x3;4536break;4537case 0x2:4538if (swizzle_x == 2) {4539write_mask = 0xc;4540use_tmp = 3;4541} else {4542write_mask = 0x3;4543use_tmp = 1;4544}4545break;4546case 0x4:4547if (swizzle_x == 0) {4548write_mask = 0x3;4549use_tmp = 1;4550} else4551write_mask = 0xc;4552break;4553case 0x8:4554if (swizzle_x == 0) {4555write_mask = 0x3;4556use_tmp = 1;4557} else {4558write_mask = 0xc;4559use_tmp = 3;4560}4561break;4562}4563}45644565lasti = tgsi_last_instruction(write_mask);4566for (i = 0; i <= lasti; i++) 
{45674568if (!(write_mask & (1 << i)))4569continue;45704571memset(&alu, 0, sizeof(struct r600_bytecode_alu));45724573if (singledest) {4574if (use_tmp || dest_temp) {4575alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;4576alu.dst.chan = i;4577alu.dst.write = 1;4578} else {4579tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);4580}4581if (i == 1 || i == 3)4582alu.dst.write = 0;4583} else4584tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);45854586alu.op = op_override ? op_override : ctx->inst_info->op;4587if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {4588r600_bytecode_src(&alu.src[0], &ctx->src[0], i);4589} else if (!swap) {4590for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {4591r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));4592}4593} else {4594r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));4595r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));4596}45974598/* handle some special cases */4599if (i == 1 || i == 3) {4600switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {4601case TGSI_OPCODE_DABS:4602r600_bytecode_src_set_abs(&alu.src[0]);4603break;4604default:4605break;4606}4607}4608if (i == lasti) {4609alu.last = 1;4610}4611r = r600_bytecode_add_alu(ctx->bc, &alu);4612if (r)4613return r;4614}46154616if (use_tmp) {4617write_mask = inst->Dst[0].Register.WriteMask;46184619lasti = tgsi_last_instruction(write_mask);4620/* move result from temp to dst */4621for (i = 0; i <= lasti; i++) {4622if (!(write_mask & (1 << i)))4623continue;46244625memset(&alu, 0, sizeof(struct r600_bytecode_alu));4626alu.op = ALU_OP1_MOV;46274628if (dest_temp) {4629alu.dst.sel = dest_temp;4630alu.dst.chan = i;4631alu.dst.write = 1;4632} else4633tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);4634alu.src[0].sel = ctx->temp_reg;4635alu.src[0].chan = use_tmp - 1;4636alu.last = (i == lasti);46374638r = r600_bytecode_add_alu(ctx->bc, &alu);4639if (r)4640return r;4641}4642}4643return 0;4644}46454646static int tgsi_op2_64(struct 
r600_shader_ctx *ctx)4647{4648struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;4649unsigned write_mask = inst->Dst[0].Register.WriteMask;4650/* confirm writemasking */4651if ((write_mask & 0x3) != 0x3 &&4652(write_mask & 0xc) != 0xc) {4653fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);4654return -1;4655}4656return tgsi_op2_64_params(ctx, false, false, 0, 0);4657}46584659static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)4660{4661return tgsi_op2_64_params(ctx, true, false, 0, 0);4662}46634664static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)4665{4666return tgsi_op2_64_params(ctx, true, true, 0, 0);4667}46684669static int tgsi_op3_64(struct r600_shader_ctx *ctx)4670{4671struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;4672struct r600_bytecode_alu alu;4673int i, j, r;4674int lasti = 3;4675int tmp = r600_get_temp(ctx);46764677for (i = 0; i < lasti + 1; i++) {46784679memset(&alu, 0, sizeof(struct r600_bytecode_alu));4680alu.op = ctx->inst_info->op;4681for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {4682r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 
0 : 1);4683}46844685if (inst->Dst[0].Register.WriteMask & (1 << i))4686tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);4687else4688alu.dst.sel = tmp;46894690alu.dst.chan = i;4691alu.is_op3 = 1;4692if (i == lasti) {4693alu.last = 1;4694}4695r = r600_bytecode_add_alu(ctx->bc, &alu);4696if (r)4697return r;4698}4699return 0;4700}47014702static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)4703{4704struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;4705struct r600_bytecode_alu alu;4706unsigned write_mask = inst->Dst[0].Register.WriteMask;4707int i, j, r, lasti = tgsi_last_instruction(write_mask);4708/* use temp register if trans_only and more than one dst component */4709int use_tmp = trans_only && (write_mask ^ (1 << lasti));4710unsigned op = ctx->inst_info->op;47114712if (op == ALU_OP2_MUL_IEEE &&4713ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])4714op = ALU_OP2_MUL;47154716/* nir_to_tgsi lowers nir_op_isub to UADD + negate, since r600 doesn't support4717* source modifiers with integer ops we switch back to SUB_INT */4718bool src1_neg = ctx->src[1].neg;4719if (op == ALU_OP2_ADD_INT && src1_neg) {4720src1_neg = false;4721op = ALU_OP2_SUB_INT;4722}47234724for (i = 0; i <= lasti; i++) {4725if (!(write_mask & (1 << i)))4726continue;47274728memset(&alu, 0, sizeof(struct r600_bytecode_alu));4729if (use_tmp) {4730alu.dst.sel = ctx->temp_reg;4731alu.dst.chan = i;4732alu.dst.write = 1;4733} else4734tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);47354736alu.op = op;4737if (!swap) {4738for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {4739r600_bytecode_src(&alu.src[j], &ctx->src[j], i);4740}4741alu.src[1].neg = src1_neg;4742} else {4743r600_bytecode_src(&alu.src[0], &ctx->src[1], i);4744r600_bytecode_src(&alu.src[1], &ctx->src[0], i);4745}4746if (i == lasti || trans_only) {4747alu.last = 1;4748}4749r = r600_bytecode_add_alu(ctx->bc, &alu);4750if (r)4751return r;4752}47534754if (use_tmp) {4755/* move result from temp to dst */4756for (i = 
0; i <= lasti; i++) {4757if (!(write_mask & (1 << i)))4758continue;47594760memset(&alu, 0, sizeof(struct r600_bytecode_alu));4761alu.op = ALU_OP1_MOV;4762tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);4763alu.src[0].sel = ctx->temp_reg;4764alu.src[0].chan = i;4765alu.last = (i == lasti);47664767r = r600_bytecode_add_alu(ctx->bc, &alu);4768if (r)4769return r;4770}4771}4772return 0;4773}47744775static int tgsi_op2(struct r600_shader_ctx *ctx)4776{4777return tgsi_op2_s(ctx, 0, 0);4778}47794780static int tgsi_op2_swap(struct r600_shader_ctx *ctx)4781{4782return tgsi_op2_s(ctx, 1, 0);4783}47844785static int tgsi_op2_trans(struct r600_shader_ctx *ctx)4786{4787return tgsi_op2_s(ctx, 0, 1);4788}47894790static int tgsi_ineg(struct r600_shader_ctx *ctx)4791{4792struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;4793struct r600_bytecode_alu alu;4794int i, r;4795int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);47964797for (i = 0; i < lasti + 1; i++) {47984799if (!(inst->Dst[0].Register.WriteMask & (1 << i)))4800continue;4801memset(&alu, 0, sizeof(struct r600_bytecode_alu));4802alu.op = ctx->inst_info->op;48034804alu.src[0].sel = V_SQ_ALU_SRC_0;48054806r600_bytecode_src(&alu.src[1], &ctx->src[0], i);48074808tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);48094810if (i == lasti) {4811alu.last = 1;4812}4813r = r600_bytecode_add_alu(ctx->bc, &alu);4814if (r)4815return r;4816}4817return 0;48184819}48204821static int tgsi_dneg(struct r600_shader_ctx *ctx)4822{4823struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;4824struct r600_bytecode_alu alu;4825int i, r;4826int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);48274828for (i = 0; i < lasti + 1; i++) {48294830if (!(inst->Dst[0].Register.WriteMask & (1 << i)))4831continue;4832memset(&alu, 0, sizeof(struct r600_bytecode_alu));4833alu.op = ALU_OP1_MOV;48344835r600_bytecode_src(&alu.src[0], &ctx->src[0], i);48364837if (i == 1 || i == 
3)4838r600_bytecode_src_toggle_neg(&alu.src[0]);4839tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);48404841if (i == lasti) {4842alu.last = 1;4843}4844r = r600_bytecode_add_alu(ctx->bc, &alu);4845if (r)4846return r;4847}4848return 0;48494850}48514852static int tgsi_dfracexp(struct r600_shader_ctx *ctx)4853{4854struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;4855struct r600_bytecode_alu alu;4856unsigned write_mask = inst->Dst[0].Register.WriteMask;4857int i, j, r;48584859for (i = 0; i <= 3; i++) {4860memset(&alu, 0, sizeof(struct r600_bytecode_alu));4861alu.op = ctx->inst_info->op;48624863alu.dst.sel = ctx->temp_reg;4864alu.dst.chan = i;4865alu.dst.write = 1;4866for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {4867r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));4868}48694870if (i == 3)4871alu.last = 1;48724873r = r600_bytecode_add_alu(ctx->bc, &alu);4874if (r)4875return r;4876}48774878/* Replicate significand result across channels. */4879for (i = 0; i <= 3; i++) {4880if (!(write_mask & (1 << i)))4881continue;48824883memset(&alu, 0, sizeof(struct r600_bytecode_alu));4884alu.op = ALU_OP1_MOV;4885alu.src[0].chan = (i & 1) + 2;4886alu.src[0].sel = ctx->temp_reg;48874888tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);4889alu.dst.write = 1;4890alu.last = 1;4891r = r600_bytecode_add_alu(ctx->bc, &alu);4892if (r)4893return r;4894}48954896for (i = 0; i <= 3; i++) {4897if (inst->Dst[1].Register.WriteMask & (1 << i)) {4898/* MOV third channels to writemask dst1 */4899memset(&alu, 0, sizeof(struct r600_bytecode_alu));4900alu.op = ALU_OP1_MOV;4901alu.src[0].chan = 1;4902alu.src[0].sel = ctx->temp_reg;49034904tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);4905alu.last = 1;4906r = r600_bytecode_add_alu(ctx->bc, &alu);4907if (r)4908return r;4909break;4910}4911}4912return 0;4913}491449154916static int egcm_int_to_double(struct r600_shader_ctx *ctx)4917{4918struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;4919struct 
r600_bytecode_alu alu;4920int i, c, r;4921int write_mask = inst->Dst[0].Register.WriteMask;4922int temp_reg = r600_get_temp(ctx);49234924assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||4925inst->Instruction.Opcode == TGSI_OPCODE_U2D);49264927for (c = 0; c < 2; c++) {4928int dchan = c * 2;4929if (write_mask & (0x3 << dchan)) {4930/* split into 24-bit int and 8-bit int */4931memset(&alu, 0, sizeof(struct r600_bytecode_alu));4932alu.op = ALU_OP2_AND_INT;4933alu.dst.sel = temp_reg;4934alu.dst.chan = dchan;4935r600_bytecode_src(&alu.src[0], &ctx->src[0], c);4936alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;4937alu.src[1].value = 0xffffff00;4938alu.dst.write = 1;4939r = r600_bytecode_add_alu(ctx->bc, &alu);4940if (r)4941return r;49424943memset(&alu, 0, sizeof(struct r600_bytecode_alu));4944alu.op = ALU_OP2_AND_INT;4945alu.dst.sel = temp_reg;4946alu.dst.chan = dchan + 1;4947r600_bytecode_src(&alu.src[0], &ctx->src[0], c);4948alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;4949alu.src[1].value = 0xff;4950alu.dst.write = 1;4951alu.last = 1;4952r = r600_bytecode_add_alu(ctx->bc, &alu);4953if (r)4954return r;4955}4956}49574958for (c = 0; c < 2; c++) {4959int dchan = c * 2;4960if (write_mask & (0x3 << dchan)) {4961for (i = dchan; i <= dchan + 1; i++) {4962memset(&alu, 0, sizeof(struct r600_bytecode_alu));4963alu.op = i == dchan ? 
ctx->inst_info->op : ALU_OP1_UINT_TO_FLT;49644965alu.src[0].sel = temp_reg;4966alu.src[0].chan = i;4967alu.dst.sel = temp_reg;4968alu.dst.chan = i;4969alu.dst.write = 1;4970if (ctx->bc->chip_class == CAYMAN)4971alu.last = i == dchan + 1;4972else4973alu.last = 1; /* trans only ops on evergreen */49744975r = r600_bytecode_add_alu(ctx->bc, &alu);4976if (r)4977return r;4978}4979}4980}49814982for (c = 0; c < 2; c++) {4983int dchan = c * 2;4984if (write_mask & (0x3 << dchan)) {4985for (i = 0; i < 4; i++) {4986memset(&alu, 0, sizeof(struct r600_bytecode_alu));4987alu.op = ALU_OP1_FLT32_TO_FLT64;49884989alu.src[0].chan = dchan + (i / 2);4990if (i == 0 || i == 2)4991alu.src[0].sel = temp_reg;4992else {4993alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;4994alu.src[0].value = 0x0;4995}4996alu.dst.sel = ctx->temp_reg;4997alu.dst.chan = i;4998alu.last = i == 3;4999alu.dst.write = 1;50005001r = r600_bytecode_add_alu(ctx->bc, &alu);5002if (r)5003return r;5004}50055006for (i = 0; i <= 1; i++) {5007memset(&alu, 0, sizeof(struct r600_bytecode_alu));5008alu.op = ALU_OP2_ADD_64;50095010alu.src[0].chan = fp64_switch(i);5011alu.src[0].sel = ctx->temp_reg;50125013alu.src[1].chan = fp64_switch(i + 2);5014alu.src[1].sel = ctx->temp_reg;5015tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst);5016alu.last = i == 1;50175018r = r600_bytecode_add_alu(ctx->bc, &alu);5019if (r)5020return r;5021}5022}5023}50245025return 0;5026}50275028static int egcm_double_to_int(struct r600_shader_ctx *ctx)5029{5030struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5031struct r600_bytecode_alu alu;5032int i, r;5033int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);5034int treg = r600_get_temp(ctx);5035assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||5036inst->Instruction.Opcode == TGSI_OPCODE_D2U);50375038/* do a 64->32 into a temp register */5039r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);5040if (r)5041return r;50425043for (i = 0; i <= lasti; i++) 
{5044if (!(inst->Dst[0].Register.WriteMask & (1 << i)))5045continue;5046memset(&alu, 0, sizeof(struct r600_bytecode_alu));5047alu.op = ctx->inst_info->op;50485049alu.src[0].chan = i;5050alu.src[0].sel = treg;5051tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);5052alu.last = (i == lasti);50535054r = r600_bytecode_add_alu(ctx->bc, &alu);5055if (r)5056return r;5057}50585059return 0;5060}50615062static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,5063unsigned op,5064int dst_reg,5065struct r600_shader_src *src,5066bool abs)5067{5068struct r600_bytecode_alu alu;5069const int last_slot = 3;5070int r;50715072/* these have to write the result to X/Y by the looks of it */5073for (int i = 0 ; i < last_slot; i++) {5074memset(&alu, 0, sizeof(struct r600_bytecode_alu));5075alu.op = op;50765077r600_bytecode_src(&alu.src[0], src, 1);5078r600_bytecode_src(&alu.src[1], src, 0);50795080if (abs)5081r600_bytecode_src_set_abs(&alu.src[1]);50825083alu.dst.sel = dst_reg;5084alu.dst.chan = i;5085alu.dst.write = (i == 0 || i == 1);50865087if (bc->chip_class != CAYMAN || i == last_slot - 1)5088alu.last = 1;5089r = r600_bytecode_add_alu(bc, &alu);5090if (r)5091return r;5092}50935094return 0;5095}50965097static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)5098{5099struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5100int i, r;5101struct r600_bytecode_alu alu;5102int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);5103int t1 = ctx->temp_reg;51045105/* should only be one src regs */5106assert(inst->Instruction.NumSrcRegs == 1);51075108/* only support one double at a time */5109assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||5110inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);51115112r = cayman_emit_unary_double_raw(5113ctx->bc, ctx->inst_info->op, t1,5114&ctx->src[0],5115ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||5116ctx->parse.FullToken.FullInstruction.Instruction.Opcode == 
TGSI_OPCODE_DSQRT);5117if (r)5118return r;51195120for (i = 0 ; i <= lasti; i++) {5121if (!(inst->Dst[0].Register.WriteMask & (1 << i)))5122continue;5123memset(&alu, 0, sizeof(struct r600_bytecode_alu));5124alu.op = ALU_OP1_MOV;5125alu.src[0].sel = t1;5126alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;5127tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);5128alu.dst.write = 1;5129if (i == lasti)5130alu.last = 1;5131r = r600_bytecode_add_alu(ctx->bc, &alu);5132if (r)5133return r;5134}5135return 0;5136}51375138static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)5139{5140struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5141int i, j, r;5142struct r600_bytecode_alu alu;5143int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;51445145for (i = 0 ; i < last_slot; i++) {5146memset(&alu, 0, sizeof(struct r600_bytecode_alu));5147alu.op = ctx->inst_info->op;5148for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {5149r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);51505151/* RSQ should take the absolute value of src */5152if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {5153r600_bytecode_src_set_abs(&alu.src[j]);5154}5155}5156tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);5157alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;51585159if (i == last_slot - 1)5160alu.last = 1;5161r = r600_bytecode_add_alu(ctx->bc, &alu);5162if (r)5163return r;5164}5165return 0;5166}51675168static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)5169{5170struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5171int i, j, k, r;5172struct r600_bytecode_alu alu;5173int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);5174int t1 = ctx->temp_reg;51755176for (k = 0; k <= lasti; k++) {5177if (!(inst->Dst[0].Register.WriteMask & (1 << k)))5178continue;51795180for (i = 0 ; i < 4; i++) {5181memset(&alu, 0, sizeof(struct r600_bytecode_alu));5182alu.op = ctx->inst_info->op;5183for (j = 0; j < inst->Instruction.NumSrcRegs; j++) 
{5184r600_bytecode_src(&alu.src[j], &ctx->src[j], k);5185}5186alu.dst.sel = t1;5187alu.dst.chan = i;5188alu.dst.write = (i == k);5189if (i == 3)5190alu.last = 1;5191r = r600_bytecode_add_alu(ctx->bc, &alu);5192if (r)5193return r;5194}5195}51965197for (i = 0 ; i <= lasti; i++) {5198if (!(inst->Dst[0].Register.WriteMask & (1 << i)))5199continue;5200memset(&alu, 0, sizeof(struct r600_bytecode_alu));5201alu.op = ALU_OP1_MOV;5202alu.src[0].sel = t1;5203alu.src[0].chan = i;5204tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);5205alu.dst.write = 1;5206if (i == lasti)5207alu.last = 1;5208r = r600_bytecode_add_alu(ctx->bc, &alu);5209if (r)5210return r;5211}52125213return 0;5214}521552165217static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)5218{5219struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5220int i, j, k, r;5221struct r600_bytecode_alu alu;5222int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);5223int t1 = ctx->temp_reg;52245225/* t1 would get overwritten below if we actually tried to5226* multiply two pairs of doubles at a time. */5227assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||5228inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);52295230k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;52315232for (i = 0; i < 4; i++) {5233memset(&alu, 0, sizeof(struct r600_bytecode_alu));5234alu.op = ctx->inst_info->op;5235for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {5236r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 
0 : 1));5237}5238alu.dst.sel = t1;5239alu.dst.chan = i;5240alu.dst.write = 1;5241if (i == 3)5242alu.last = 1;5243r = r600_bytecode_add_alu(ctx->bc, &alu);5244if (r)5245return r;5246}52475248for (i = 0; i <= lasti; i++) {5249if (!(inst->Dst[0].Register.WriteMask & (1 << i)))5250continue;5251memset(&alu, 0, sizeof(struct r600_bytecode_alu));5252alu.op = ALU_OP1_MOV;5253alu.src[0].sel = t1;5254alu.src[0].chan = i;5255tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);5256alu.dst.write = 1;5257if (i == lasti)5258alu.last = 1;5259r = r600_bytecode_add_alu(ctx->bc, &alu);5260if (r)5261return r;5262}52635264return 0;5265}52665267/*5268* Emit RECIP_64 + MUL_64 to implement division.5269*/5270static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)5271{5272struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5273int r;5274struct r600_bytecode_alu alu;5275int t1 = ctx->temp_reg;5276int k;52775278/* Only support one double at a time. This is the same constraint as5279* in DMUL lowering. */5280assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||5281inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);52825283k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;52845285r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);5286if (r)5287return r;52885289for (int i = 0; i < 4; i++) {5290memset(&alu, 0, sizeof(struct r600_bytecode_alu));5291alu.op = ALU_OP2_MUL_64;52925293r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));52945295alu.src[1].sel = t1;5296alu.src[1].chan = (i == 3) ? 
0 : 1;52975298alu.dst.sel = t1;5299alu.dst.chan = i;5300alu.dst.write = 1;5301if (i == 3)5302alu.last = 1;5303r = r600_bytecode_add_alu(ctx->bc, &alu);5304if (r)5305return r;5306}53075308for (int i = 0; i < 2; i++) {5309memset(&alu, 0, sizeof(struct r600_bytecode_alu));5310alu.op = ALU_OP1_MOV;5311alu.src[0].sel = t1;5312alu.src[0].chan = i;5313tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);5314alu.dst.write = 1;5315if (i == 1)5316alu.last = 1;5317r = r600_bytecode_add_alu(ctx->bc, &alu);5318if (r)5319return r;5320}5321return 0;5322}53235324/*5325* r600 - trunc to -PI..PI range5326* r700 - normalize by dividing by 2PI5327* see fdo bug 279015328*/5329static int tgsi_setup_trig(struct r600_shader_ctx *ctx)5330{5331int r;5332struct r600_bytecode_alu alu;53335334memset(&alu, 0, sizeof(struct r600_bytecode_alu));5335alu.op = ALU_OP3_MULADD;5336alu.is_op3 = 1;53375338alu.dst.chan = 0;5339alu.dst.sel = ctx->temp_reg;5340alu.dst.write = 1;53415342r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);53435344alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;5345alu.src[1].chan = 0;5346alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);5347alu.src[2].sel = V_SQ_ALU_SRC_0_5;5348alu.src[2].chan = 0;5349alu.last = 1;5350r = r600_bytecode_add_alu(ctx->bc, &alu);5351if (r)5352return r;53535354memset(&alu, 0, sizeof(struct r600_bytecode_alu));5355alu.op = ALU_OP1_FRACT;53565357alu.dst.chan = 0;5358alu.dst.sel = ctx->temp_reg;5359alu.dst.write = 1;53605361alu.src[0].sel = ctx->temp_reg;5362alu.src[0].chan = 0;5363alu.last = 1;5364r = r600_bytecode_add_alu(ctx->bc, &alu);5365if (r)5366return r;53675368memset(&alu, 0, sizeof(struct r600_bytecode_alu));5369alu.op = ALU_OP3_MULADD;5370alu.is_op3 = 1;53715372alu.dst.chan = 0;5373alu.dst.sel = ctx->temp_reg;5374alu.dst.write = 1;53755376alu.src[0].sel = ctx->temp_reg;5377alu.src[0].chan = 0;53785379alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;5380alu.src[1].chan = 0;5381alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;5382alu.src[2].chan = 0;53835384if (ctx->bc->chip_class 
== R600) {5385alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);5386alu.src[2].value = u_bitcast_f2u(-M_PI);5387} else {5388alu.src[1].sel = V_SQ_ALU_SRC_1;5389alu.src[2].sel = V_SQ_ALU_SRC_0_5;5390alu.src[2].neg = 1;5391}53925393alu.last = 1;5394r = r600_bytecode_add_alu(ctx->bc, &alu);5395if (r)5396return r;5397return 0;5398}53995400static int cayman_trig(struct r600_shader_ctx *ctx)5401{5402struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5403struct r600_bytecode_alu alu;5404int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;5405int i, r;54065407r = tgsi_setup_trig(ctx);5408if (r)5409return r;541054115412for (i = 0; i < last_slot; i++) {5413memset(&alu, 0, sizeof(struct r600_bytecode_alu));5414alu.op = ctx->inst_info->op;5415alu.dst.chan = i;54165417tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);5418alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;54195420alu.src[0].sel = ctx->temp_reg;5421alu.src[0].chan = 0;5422if (i == last_slot - 1)5423alu.last = 1;5424r = r600_bytecode_add_alu(ctx->bc, &alu);5425if (r)5426return r;5427}5428return 0;5429}54305431static int tgsi_trig(struct r600_shader_ctx *ctx)5432{5433struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5434struct r600_bytecode_alu alu;5435int i, r;5436int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);54375438r = tgsi_setup_trig(ctx);5439if (r)5440return r;54415442memset(&alu, 0, sizeof(struct r600_bytecode_alu));5443alu.op = ctx->inst_info->op;5444alu.dst.chan = 0;5445alu.dst.sel = ctx->temp_reg;5446alu.dst.write = 1;54475448alu.src[0].sel = ctx->temp_reg;5449alu.src[0].chan = 0;5450alu.last = 1;5451r = r600_bytecode_add_alu(ctx->bc, &alu);5452if (r)5453return r;54545455/* replicate result */5456for (i = 0; i < lasti + 1; i++) {5457if (!(inst->Dst[0].Register.WriteMask & (1 << i)))5458continue;54595460memset(&alu, 0, sizeof(struct r600_bytecode_alu));5461alu.op = ALU_OP1_MOV;54625463alu.src[0].sel = 
ctx->temp_reg;5464tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);5465if (i == lasti)5466alu.last = 1;5467r = r600_bytecode_add_alu(ctx->bc, &alu);5468if (r)5469return r;5470}5471return 0;5472}54735474static int tgsi_kill(struct r600_shader_ctx *ctx)5475{5476const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5477struct r600_bytecode_alu alu;5478int i, r;54795480for (i = 0; i < 4; i++) {5481memset(&alu, 0, sizeof(struct r600_bytecode_alu));5482alu.op = ctx->inst_info->op;54835484alu.dst.chan = i;54855486alu.src[0].sel = V_SQ_ALU_SRC_0;54875488if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {5489alu.src[1].sel = V_SQ_ALU_SRC_1;5490alu.src[1].neg = 1;5491} else {5492r600_bytecode_src(&alu.src[1], &ctx->src[0], i);5493}5494if (i == 3) {5495alu.last = 1;5496}5497r = r600_bytecode_add_alu(ctx->bc, &alu);5498if (r)5499return r;5500}55015502/* kill must be last in ALU */5503ctx->bc->force_add_cf = 1;5504ctx->shader->uses_kill = TRUE;5505return 0;5506}55075508static int tgsi_lit(struct r600_shader_ctx *ctx)5509{5510struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5511struct r600_bytecode_alu alu;5512int r;55135514/* tmp.x = max(src.y, 0.0) */5515memset(&alu, 0, sizeof(struct r600_bytecode_alu));5516alu.op = ALU_OP2_MAX;5517r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);5518alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/5519alu.src[1].chan = 1;55205521alu.dst.sel = ctx->temp_reg;5522alu.dst.chan = 0;5523alu.dst.write = 1;55245525alu.last = 1;5526r = r600_bytecode_add_alu(ctx->bc, &alu);5527if (r)5528return r;55295530if (inst->Dst[0].Register.WriteMask & (1 << 2))5531{5532int chan;5533int sel;5534unsigned i;55355536if (ctx->bc->chip_class == CAYMAN) {5537for (i = 0; i < 3; i++) {5538/* tmp.z = log(tmp.x) */5539memset(&alu, 0, sizeof(struct r600_bytecode_alu));5540alu.op = ALU_OP1_LOG_CLAMPED;5541alu.src[0].sel = ctx->temp_reg;5542alu.src[0].chan = 0;5543alu.dst.sel = ctx->temp_reg;5544alu.dst.chan = i;5545if (i == 2) 
{5546alu.dst.write = 1;5547alu.last = 1;5548} else5549alu.dst.write = 0;55505551r = r600_bytecode_add_alu(ctx->bc, &alu);5552if (r)5553return r;5554}5555} else {5556/* tmp.z = log(tmp.x) */5557memset(&alu, 0, sizeof(struct r600_bytecode_alu));5558alu.op = ALU_OP1_LOG_CLAMPED;5559alu.src[0].sel = ctx->temp_reg;5560alu.src[0].chan = 0;5561alu.dst.sel = ctx->temp_reg;5562alu.dst.chan = 2;5563alu.dst.write = 1;5564alu.last = 1;5565r = r600_bytecode_add_alu(ctx->bc, &alu);5566if (r)5567return r;5568}55695570chan = alu.dst.chan;5571sel = alu.dst.sel;55725573/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */5574memset(&alu, 0, sizeof(struct r600_bytecode_alu));5575alu.op = ALU_OP3_MUL_LIT;5576alu.src[0].sel = sel;5577alu.src[0].chan = chan;5578r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);5579r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);5580alu.dst.sel = ctx->temp_reg;5581alu.dst.chan = 0;5582alu.dst.write = 1;5583alu.is_op3 = 1;5584alu.last = 1;5585r = r600_bytecode_add_alu(ctx->bc, &alu);5586if (r)5587return r;55885589if (ctx->bc->chip_class == CAYMAN) {5590for (i = 0; i < 3; i++) {5591/* dst.z = exp(tmp.x) */5592memset(&alu, 0, sizeof(struct r600_bytecode_alu));5593alu.op = ALU_OP1_EXP_IEEE;5594alu.src[0].sel = ctx->temp_reg;5595alu.src[0].chan = 0;5596tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);5597if (i == 2) {5598alu.dst.write = 1;5599alu.last = 1;5600} else5601alu.dst.write = 0;5602r = r600_bytecode_add_alu(ctx->bc, &alu);5603if (r)5604return r;5605}5606} else {5607/* dst.z = exp(tmp.x) */5608memset(&alu, 0, sizeof(struct r600_bytecode_alu));5609alu.op = ALU_OP1_EXP_IEEE;5610alu.src[0].sel = ctx->temp_reg;5611alu.src[0].chan = 0;5612tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);5613alu.last = 1;5614r = r600_bytecode_add_alu(ctx->bc, &alu);5615if (r)5616return r;5617}5618}56195620/* dst.x, <- 1.0 */5621memset(&alu, 0, sizeof(struct r600_bytecode_alu));5622alu.op = ALU_OP1_MOV;5623alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/5624alu.src[0].chan = 0;5625tgsi_dst(ctx, 
&inst->Dst[0], 0, &alu.dst);5626alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;5627r = r600_bytecode_add_alu(ctx->bc, &alu);5628if (r)5629return r;56305631/* dst.y = max(src.x, 0.0) */5632memset(&alu, 0, sizeof(struct r600_bytecode_alu));5633alu.op = ALU_OP2_MAX;5634r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);5635alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/5636alu.src[1].chan = 0;5637tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);5638alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;5639r = r600_bytecode_add_alu(ctx->bc, &alu);5640if (r)5641return r;56425643/* dst.w, <- 1.0 */5644memset(&alu, 0, sizeof(struct r600_bytecode_alu));5645alu.op = ALU_OP1_MOV;5646alu.src[0].sel = V_SQ_ALU_SRC_1;5647alu.src[0].chan = 0;5648tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);5649alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;5650alu.last = 1;5651r = r600_bytecode_add_alu(ctx->bc, &alu);5652if (r)5653return r;56545655return 0;5656}56575658static int tgsi_rsq(struct r600_shader_ctx *ctx)5659{5660struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5661struct r600_bytecode_alu alu;5662int i, r;56635664memset(&alu, 0, sizeof(struct r600_bytecode_alu));56655666alu.op = ALU_OP1_RECIPSQRT_IEEE;56675668for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {5669r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);5670r600_bytecode_src_set_abs(&alu.src[i]);5671}5672alu.dst.sel = ctx->temp_reg;5673alu.dst.write = 1;5674alu.last = 1;5675r = r600_bytecode_add_alu(ctx->bc, &alu);5676if (r)5677return r;5678/* replicate result */5679return tgsi_helper_tempx_replicate(ctx);5680}56815682static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)5683{5684struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5685struct r600_bytecode_alu alu;5686int i, r;56875688for (i = 0; i < 4; i++) {5689memset(&alu, 0, sizeof(struct r600_bytecode_alu));5690alu.src[0].sel = ctx->temp_reg;5691alu.op = ALU_OP1_MOV;5692alu.dst.chan = 
i;5693tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);5694alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;5695if (i == 3)5696alu.last = 1;5697r = r600_bytecode_add_alu(ctx->bc, &alu);5698if (r)5699return r;5700}5701return 0;5702}57035704static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)5705{5706struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5707struct r600_bytecode_alu alu;5708int i, r;57095710memset(&alu, 0, sizeof(struct r600_bytecode_alu));5711alu.op = ctx->inst_info->op;5712for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {5713r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);5714}5715alu.dst.sel = ctx->temp_reg;5716alu.dst.write = 1;5717alu.last = 1;5718r = r600_bytecode_add_alu(ctx->bc, &alu);5719if (r)5720return r;5721/* replicate result */5722return tgsi_helper_tempx_replicate(ctx);5723}57245725static int cayman_pow(struct r600_shader_ctx *ctx)5726{5727struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5728int i, r;5729struct r600_bytecode_alu alu;5730int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 
4 : 3;57315732for (i = 0; i < 3; i++) {5733memset(&alu, 0, sizeof(struct r600_bytecode_alu));5734alu.op = ALU_OP1_LOG_IEEE;5735r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);5736alu.dst.sel = ctx->temp_reg;5737alu.dst.chan = i;5738alu.dst.write = 1;5739if (i == 2)5740alu.last = 1;5741r = r600_bytecode_add_alu(ctx->bc, &alu);5742if (r)5743return r;5744}57455746/* b * LOG2(a) */5747memset(&alu, 0, sizeof(struct r600_bytecode_alu));5748alu.op = ALU_OP2_MUL;5749r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);5750alu.src[1].sel = ctx->temp_reg;5751alu.dst.sel = ctx->temp_reg;5752alu.dst.write = 1;5753alu.last = 1;5754r = r600_bytecode_add_alu(ctx->bc, &alu);5755if (r)5756return r;57575758for (i = 0; i < last_slot; i++) {5759/* POW(a,b) = EXP2(b * LOG2(a))*/5760memset(&alu, 0, sizeof(struct r600_bytecode_alu));5761alu.op = ALU_OP1_EXP_IEEE;5762alu.src[0].sel = ctx->temp_reg;57635764tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);5765alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;5766if (i == last_slot - 1)5767alu.last = 1;5768r = r600_bytecode_add_alu(ctx->bc, &alu);5769if (r)5770return r;5771}5772return 0;5773}57745775static int tgsi_pow(struct r600_shader_ctx *ctx)5776{5777struct r600_bytecode_alu alu;5778int r;57795780/* LOG2(a) */5781memset(&alu, 0, sizeof(struct r600_bytecode_alu));5782alu.op = ALU_OP1_LOG_IEEE;5783r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);5784alu.dst.sel = ctx->temp_reg;5785alu.dst.write = 1;5786alu.last = 1;5787r = r600_bytecode_add_alu(ctx->bc, &alu);5788if (r)5789return r;5790/* b * LOG2(a) */5791memset(&alu, 0, sizeof(struct r600_bytecode_alu));5792alu.op = ALU_OP2_MUL;5793r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);5794alu.src[1].sel = ctx->temp_reg;5795alu.dst.sel = ctx->temp_reg;5796alu.dst.write = 1;5797alu.last = 1;5798r = r600_bytecode_add_alu(ctx->bc, &alu);5799if (r)5800return r;5801/* POW(a,b) = EXP2(b * LOG2(a))*/5802memset(&alu, 0, sizeof(struct r600_bytecode_alu));5803alu.op = ALU_OP1_EXP_IEEE;5804alu.src[0].sel = 
ctx->temp_reg;5805alu.dst.sel = ctx->temp_reg;5806alu.dst.write = 1;5807alu.last = 1;5808r = r600_bytecode_add_alu(ctx->bc, &alu);5809if (r)5810return r;5811return tgsi_helper_tempx_replicate(ctx);5812}58135814static int emit_mul_int_op(struct r600_bytecode *bc,5815struct r600_bytecode_alu *alu_src)5816{5817struct r600_bytecode_alu alu;5818int i, r;5819alu = *alu_src;5820if (bc->chip_class == CAYMAN) {5821for (i = 0; i < 4; i++) {5822alu.dst.chan = i;5823alu.dst.write = (i == alu_src->dst.chan);5824alu.last = (i == 3);58255826r = r600_bytecode_add_alu(bc, &alu);5827if (r)5828return r;5829}5830} else {5831alu.last = 1;5832r = r600_bytecode_add_alu(bc, &alu);5833if (r)5834return r;5835}5836return 0;5837}58385839static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)5840{5841struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;5842struct r600_bytecode_alu alu;5843int i, r, j;5844unsigned write_mask = inst->Dst[0].Register.WriteMask;5845int lasti = tgsi_last_instruction(write_mask);5846int tmp0 = ctx->temp_reg;5847int tmp1 = r600_get_temp(ctx);5848int tmp2 = r600_get_temp(ctx);5849int tmp3 = r600_get_temp(ctx);5850int tmp4 = 0;58515852/* Use additional temp if dst register and src register are the same */5853if (inst->Src[0].Register.Index == inst->Dst[0].Register.Index ||5854inst->Src[1].Register.Index == inst->Dst[0].Register.Index) {5855tmp4 = r600_get_temp(ctx);5856}58575858/* Unsigned path:5859*5860* we need to represent src1 as src2*q + r, where q - quotient, r - remainder5861*5862* 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error5863* 2. tmp0.z = lo (tmp0.x * src2)5864* 3. tmp0.w = -tmp0.z5865* 4. tmp0.y = hi (tmp0.x * src2)5866* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))5867* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error5868* 7. tmp1.x = tmp0.x - tmp0.w5869* 8. tmp1.y = tmp0.x + tmp0.w5870* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)5871* 10. 
tmp0.z = hi(tmp0.x * src1) = q5872* 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r5873*5874* 12. tmp0.w = src1 - tmp0.y = r5875* 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)5876* 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)5877*5878* if DIV5879*5880* 15. tmp1.z = tmp0.z + 1 = q + 15881* 16. tmp1.w = tmp0.z - 1 = q - 15882*5883* else MOD5884*5885* 15. tmp1.z = tmp0.w - src2 = r - src25886* 16. tmp1.w = tmp0.w + src2 = r + src25887*5888* endif5889*5890* 17. tmp1.x = tmp1.x & tmp1.y5891*5892* DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z5893* MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z5894*5895* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z5896* 20. dst = src2==0 ? MAX_UINT : tmp0.z5897*5898* Signed path:5899*5900* Same as unsigned, using abs values of the operands,5901* and fixing the sign of the result in the end.5902*/59035904for (i = 0; i < 4; i++) {5905if (!(write_mask & (1<<i)))5906continue;59075908if (signed_op) {59095910/* tmp2.x = -src0 */5911memset(&alu, 0, sizeof(struct r600_bytecode_alu));5912alu.op = ALU_OP2_SUB_INT;59135914alu.dst.sel = tmp2;5915alu.dst.chan = 0;5916alu.dst.write = 1;59175918alu.src[0].sel = V_SQ_ALU_SRC_0;59195920r600_bytecode_src(&alu.src[1], &ctx->src[0], i);59215922alu.last = 1;5923if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))5924return r;59255926/* tmp2.y = -src1 */5927memset(&alu, 0, sizeof(struct r600_bytecode_alu));5928alu.op = ALU_OP2_SUB_INT;59295930alu.dst.sel = tmp2;5931alu.dst.chan = 1;5932alu.dst.write = 1;59335934alu.src[0].sel = V_SQ_ALU_SRC_0;59355936r600_bytecode_src(&alu.src[1], &ctx->src[1], i);59375938alu.last = 1;5939if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))5940return r;59415942/* tmp2.z sign bit is set if src0 and src2 signs are different */5943/* it will be a sign of the quotient */5944if (!mod) {59455946memset(&alu, 0, sizeof(struct r600_bytecode_alu));5947alu.op = ALU_OP2_XOR_INT;59485949alu.dst.sel = tmp2;5950alu.dst.chan = 2;5951alu.dst.write = 
1;59525953r600_bytecode_src(&alu.src[0], &ctx->src[0], i);5954r600_bytecode_src(&alu.src[1], &ctx->src[1], i);59555956alu.last = 1;5957if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))5958return r;5959}59605961/* tmp2.x = |src0| */5962memset(&alu, 0, sizeof(struct r600_bytecode_alu));5963alu.op = ALU_OP3_CNDGE_INT;5964alu.is_op3 = 1;59655966alu.dst.sel = tmp2;5967alu.dst.chan = 0;5968alu.dst.write = 1;59695970r600_bytecode_src(&alu.src[0], &ctx->src[0], i);5971r600_bytecode_src(&alu.src[1], &ctx->src[0], i);5972alu.src[2].sel = tmp2;5973alu.src[2].chan = 0;59745975alu.last = 1;5976if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))5977return r;59785979/* tmp2.y = |src1| */5980memset(&alu, 0, sizeof(struct r600_bytecode_alu));5981alu.op = ALU_OP3_CNDGE_INT;5982alu.is_op3 = 1;59835984alu.dst.sel = tmp2;5985alu.dst.chan = 1;5986alu.dst.write = 1;59875988r600_bytecode_src(&alu.src[0], &ctx->src[1], i);5989r600_bytecode_src(&alu.src[1], &ctx->src[1], i);5990alu.src[2].sel = tmp2;5991alu.src[2].chan = 1;59925993alu.last = 1;5994if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))5995return r;59965997}59985999/* 1. 
tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */6000if (ctx->bc->chip_class == CAYMAN) {6001/* tmp3.x = u2f(src2) */6002memset(&alu, 0, sizeof(struct r600_bytecode_alu));6003alu.op = ALU_OP1_UINT_TO_FLT;60046005alu.dst.sel = tmp3;6006alu.dst.chan = 0;6007alu.dst.write = 1;60086009if (signed_op) {6010alu.src[0].sel = tmp2;6011alu.src[0].chan = 1;6012} else {6013r600_bytecode_src(&alu.src[0], &ctx->src[1], i);6014}60156016alu.last = 1;6017if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6018return r;60196020/* tmp0.x = recip(tmp3.x) */6021for (j = 0 ; j < 3; j++) {6022memset(&alu, 0, sizeof(struct r600_bytecode_alu));6023alu.op = ALU_OP1_RECIP_IEEE;60246025alu.dst.sel = tmp0;6026alu.dst.chan = j;6027alu.dst.write = (j == 0);60286029alu.src[0].sel = tmp3;6030alu.src[0].chan = 0;60316032if (j == 2)6033alu.last = 1;6034if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6035return r;6036}60376038memset(&alu, 0, sizeof(struct r600_bytecode_alu));6039alu.op = ALU_OP2_MUL;60406041alu.src[0].sel = tmp0;6042alu.src[0].chan = 0;60436044alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;6045alu.src[1].value = 0x4f800000;60466047alu.dst.sel = tmp3;6048alu.dst.write = 1;6049alu.last = 1;6050r = r600_bytecode_add_alu(ctx->bc, &alu);6051if (r)6052return r;60536054memset(&alu, 0, sizeof(struct r600_bytecode_alu));6055alu.op = ALU_OP1_FLT_TO_UINT;60566057alu.dst.sel = tmp0;6058alu.dst.chan = 0;6059alu.dst.write = 1;60606061alu.src[0].sel = tmp3;6062alu.src[0].chan = 0;60636064alu.last = 1;6065if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6066return r;60676068} else {6069memset(&alu, 0, sizeof(struct r600_bytecode_alu));6070alu.op = ALU_OP1_RECIP_UINT;60716072alu.dst.sel = tmp0;6073alu.dst.chan = 0;6074alu.dst.write = 1;60756076if (signed_op) {6077alu.src[0].sel = tmp2;6078alu.src[0].chan = 1;6079} else {6080r600_bytecode_src(&alu.src[0], &ctx->src[1], i);6081}60826083alu.last = 1;6084if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6085return r;6086}60876088/* 2. 
tmp0.z = lo (tmp0.x * src2) */6089memset(&alu, 0, sizeof(struct r600_bytecode_alu));6090alu.op = ALU_OP2_MULLO_UINT;60916092alu.dst.sel = tmp0;6093alu.dst.chan = 2;6094alu.dst.write = 1;60956096alu.src[0].sel = tmp0;6097alu.src[0].chan = 0;6098if (signed_op) {6099alu.src[1].sel = tmp2;6100alu.src[1].chan = 1;6101} else {6102r600_bytecode_src(&alu.src[1], &ctx->src[1], i);6103}61046105if ((r = emit_mul_int_op(ctx->bc, &alu)))6106return r;61076108/* 3. tmp0.w = -tmp0.z */6109memset(&alu, 0, sizeof(struct r600_bytecode_alu));6110alu.op = ALU_OP2_SUB_INT;61116112alu.dst.sel = tmp0;6113alu.dst.chan = 3;6114alu.dst.write = 1;61156116alu.src[0].sel = V_SQ_ALU_SRC_0;6117alu.src[1].sel = tmp0;6118alu.src[1].chan = 2;61196120alu.last = 1;6121if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6122return r;61236124/* 4. tmp0.y = hi (tmp0.x * src2) */6125memset(&alu, 0, sizeof(struct r600_bytecode_alu));6126alu.op = ALU_OP2_MULHI_UINT;61276128alu.dst.sel = tmp0;6129alu.dst.chan = 1;6130alu.dst.write = 1;61316132alu.src[0].sel = tmp0;6133alu.src[0].chan = 0;61346135if (signed_op) {6136alu.src[1].sel = tmp2;6137alu.src[1].chan = 1;6138} else {6139r600_bytecode_src(&alu.src[1], &ctx->src[1], i);6140}61416142if ((r = emit_mul_int_op(ctx->bc, &alu)))6143return r;61446145/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */6146memset(&alu, 0, sizeof(struct r600_bytecode_alu));6147alu.op = ALU_OP3_CNDE_INT;6148alu.is_op3 = 1;61496150alu.dst.sel = tmp0;6151alu.dst.chan = 2;6152alu.dst.write = 1;61536154alu.src[0].sel = tmp0;6155alu.src[0].chan = 1;6156alu.src[1].sel = tmp0;6157alu.src[1].chan = 3;6158alu.src[2].sel = tmp0;6159alu.src[2].chan = 2;61606161alu.last = 1;6162if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6163return r;61646165/* 6. 
tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */6166memset(&alu, 0, sizeof(struct r600_bytecode_alu));6167alu.op = ALU_OP2_MULHI_UINT;61686169alu.dst.sel = tmp0;6170alu.dst.chan = 3;6171alu.dst.write = 1;61726173alu.src[0].sel = tmp0;6174alu.src[0].chan = 2;61756176alu.src[1].sel = tmp0;6177alu.src[1].chan = 0;61786179if ((r = emit_mul_int_op(ctx->bc, &alu)))6180return r;61816182/* 7. tmp1.x = tmp0.x - tmp0.w */6183memset(&alu, 0, sizeof(struct r600_bytecode_alu));6184alu.op = ALU_OP2_SUB_INT;61856186alu.dst.sel = tmp1;6187alu.dst.chan = 0;6188alu.dst.write = 1;61896190alu.src[0].sel = tmp0;6191alu.src[0].chan = 0;6192alu.src[1].sel = tmp0;6193alu.src[1].chan = 3;61946195alu.last = 1;6196if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6197return r;61986199/* 8. tmp1.y = tmp0.x + tmp0.w */6200memset(&alu, 0, sizeof(struct r600_bytecode_alu));6201alu.op = ALU_OP2_ADD_INT;62026203alu.dst.sel = tmp1;6204alu.dst.chan = 1;6205alu.dst.write = 1;62066207alu.src[0].sel = tmp0;6208alu.src[0].chan = 0;6209alu.src[1].sel = tmp0;6210alu.src[1].chan = 3;62116212alu.last = 1;6213if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6214return r;62156216/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */6217memset(&alu, 0, sizeof(struct r600_bytecode_alu));6218alu.op = ALU_OP3_CNDE_INT;6219alu.is_op3 = 1;62206221alu.dst.sel = tmp0;6222alu.dst.chan = 0;6223alu.dst.write = 1;62246225alu.src[0].sel = tmp0;6226alu.src[0].chan = 1;6227alu.src[1].sel = tmp1;6228alu.src[1].chan = 1;6229alu.src[2].sel = tmp1;6230alu.src[2].chan = 0;62316232alu.last = 1;6233if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6234return r;62356236/* 10. 
tmp0.z = hi(tmp0.x * src1) = q */6237memset(&alu, 0, sizeof(struct r600_bytecode_alu));6238alu.op = ALU_OP2_MULHI_UINT;62396240alu.dst.sel = tmp0;6241alu.dst.chan = 2;6242alu.dst.write = 1;62436244alu.src[0].sel = tmp0;6245alu.src[0].chan = 0;62466247if (signed_op) {6248alu.src[1].sel = tmp2;6249alu.src[1].chan = 0;6250} else {6251r600_bytecode_src(&alu.src[1], &ctx->src[0], i);6252}62536254if ((r = emit_mul_int_op(ctx->bc, &alu)))6255return r;62566257/* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */6258memset(&alu, 0, sizeof(struct r600_bytecode_alu));6259alu.op = ALU_OP2_MULLO_UINT;62606261alu.dst.sel = tmp0;6262alu.dst.chan = 1;6263alu.dst.write = 1;62646265if (signed_op) {6266alu.src[0].sel = tmp2;6267alu.src[0].chan = 1;6268} else {6269r600_bytecode_src(&alu.src[0], &ctx->src[1], i);6270}62716272alu.src[1].sel = tmp0;6273alu.src[1].chan = 2;62746275if ((r = emit_mul_int_op(ctx->bc, &alu)))6276return r;62776278/* 12. tmp0.w = src1 - tmp0.y = r */6279memset(&alu, 0, sizeof(struct r600_bytecode_alu));6280alu.op = ALU_OP2_SUB_INT;62816282alu.dst.sel = tmp0;6283alu.dst.chan = 3;6284alu.dst.write = 1;62856286if (signed_op) {6287alu.src[0].sel = tmp2;6288alu.src[0].chan = 0;6289} else {6290r600_bytecode_src(&alu.src[0], &ctx->src[0], i);6291}62926293alu.src[1].sel = tmp0;6294alu.src[1].chan = 1;62956296alu.last = 1;6297if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6298return r;62996300/* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */6301memset(&alu, 0, sizeof(struct r600_bytecode_alu));6302alu.op = ALU_OP2_SETGE_UINT;63036304alu.dst.sel = tmp1;6305alu.dst.chan = 0;6306alu.dst.write = 1;63076308alu.src[0].sel = tmp0;6309alu.src[0].chan = 3;6310if (signed_op) {6311alu.src[1].sel = tmp2;6312alu.src[1].chan = 1;6313} else {6314r600_bytecode_src(&alu.src[1], &ctx->src[1], i);6315}63166317alu.last = 1;6318if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6319return r;63206321/* 14. 
tmp1.y = src1 >= tmp0.y = r >= 0 */6322memset(&alu, 0, sizeof(struct r600_bytecode_alu));6323alu.op = ALU_OP2_SETGE_UINT;63246325alu.dst.sel = tmp1;6326alu.dst.chan = 1;6327alu.dst.write = 1;63286329if (signed_op) {6330alu.src[0].sel = tmp2;6331alu.src[0].chan = 0;6332} else {6333r600_bytecode_src(&alu.src[0], &ctx->src[0], i);6334}63356336alu.src[1].sel = tmp0;6337alu.src[1].chan = 1;63386339alu.last = 1;6340if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6341return r;63426343if (mod) { /* UMOD */63446345/* 15. tmp1.z = tmp0.w - src2 = r - src2 */6346memset(&alu, 0, sizeof(struct r600_bytecode_alu));6347alu.op = ALU_OP2_SUB_INT;63486349alu.dst.sel = tmp1;6350alu.dst.chan = 2;6351alu.dst.write = 1;63526353alu.src[0].sel = tmp0;6354alu.src[0].chan = 3;63556356if (signed_op) {6357alu.src[1].sel = tmp2;6358alu.src[1].chan = 1;6359} else {6360r600_bytecode_src(&alu.src[1], &ctx->src[1], i);6361}63626363alu.last = 1;6364if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6365return r;63666367/* 16. tmp1.w = tmp0.w + src2 = r + src2 */6368memset(&alu, 0, sizeof(struct r600_bytecode_alu));6369alu.op = ALU_OP2_ADD_INT;63706371alu.dst.sel = tmp1;6372alu.dst.chan = 3;6373alu.dst.write = 1;63746375alu.src[0].sel = tmp0;6376alu.src[0].chan = 3;6377if (signed_op) {6378alu.src[1].sel = tmp2;6379alu.src[1].chan = 1;6380} else {6381r600_bytecode_src(&alu.src[1], &ctx->src[1], i);6382}63836384alu.last = 1;6385if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6386return r;63876388} else { /* UDIV */63896390/* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */6391memset(&alu, 0, sizeof(struct r600_bytecode_alu));6392alu.op = ALU_OP2_ADD_INT;63936394alu.dst.sel = tmp1;6395alu.dst.chan = 2;6396alu.dst.write = 1;63976398alu.src[0].sel = tmp0;6399alu.src[0].chan = 2;6400alu.src[1].sel = V_SQ_ALU_SRC_1_INT;64016402alu.last = 1;6403if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6404return r;64056406/* 16. 
tmp1.w = tmp0.z - 1 = q - 1 */6407memset(&alu, 0, sizeof(struct r600_bytecode_alu));6408alu.op = ALU_OP2_ADD_INT;64096410alu.dst.sel = tmp1;6411alu.dst.chan = 3;6412alu.dst.write = 1;64136414alu.src[0].sel = tmp0;6415alu.src[0].chan = 2;6416alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;64176418alu.last = 1;6419if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6420return r;64216422}64236424/* 17. tmp1.x = tmp1.x & tmp1.y */6425memset(&alu, 0, sizeof(struct r600_bytecode_alu));6426alu.op = ALU_OP2_AND_INT;64276428alu.dst.sel = tmp1;6429alu.dst.chan = 0;6430alu.dst.write = 1;64316432alu.src[0].sel = tmp1;6433alu.src[0].chan = 0;6434alu.src[1].sel = tmp1;6435alu.src[1].chan = 1;64366437alu.last = 1;6438if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6439return r;64406441/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */6442/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */6443memset(&alu, 0, sizeof(struct r600_bytecode_alu));6444alu.op = ALU_OP3_CNDE_INT;6445alu.is_op3 = 1;64466447alu.dst.sel = tmp0;6448alu.dst.chan = 2;6449alu.dst.write = 1;64506451alu.src[0].sel = tmp1;6452alu.src[0].chan = 0;6453alu.src[1].sel = tmp0;6454alu.src[1].chan = mod ? 3 : 2;6455alu.src[2].sel = tmp1;6456alu.src[2].chan = 2;64576458alu.last = 1;6459if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6460return r;64616462/* 19. tmp0.z = tmp1.y==0 ? 
tmp1.w : tmp0.z */6463memset(&alu, 0, sizeof(struct r600_bytecode_alu));6464alu.op = ALU_OP3_CNDE_INT;6465alu.is_op3 = 1;64666467if (signed_op) {6468alu.dst.sel = tmp0;6469alu.dst.chan = 2;6470alu.dst.write = 1;6471} else {6472if (tmp4 > 0) {6473alu.dst.sel = tmp4;6474alu.dst.chan = i;6475alu.dst.write = 1;6476} else {6477tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);6478}6479}64806481alu.src[0].sel = tmp1;6482alu.src[0].chan = 1;6483alu.src[1].sel = tmp1;6484alu.src[1].chan = 3;6485alu.src[2].sel = tmp0;6486alu.src[2].chan = 2;64876488alu.last = 1;6489if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6490return r;64916492if (signed_op) {64936494/* fix the sign of the result */64956496if (mod) {64976498/* tmp0.x = -tmp0.z */6499memset(&alu, 0, sizeof(struct r600_bytecode_alu));6500alu.op = ALU_OP2_SUB_INT;65016502alu.dst.sel = tmp0;6503alu.dst.chan = 0;6504alu.dst.write = 1;65056506alu.src[0].sel = V_SQ_ALU_SRC_0;6507alu.src[1].sel = tmp0;6508alu.src[1].chan = 2;65096510alu.last = 1;6511if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6512return r;65136514/* sign of the remainder is the same as the sign of src0 */6515/* tmp0.x = src0>=0 ? 
tmp0.z : tmp0.x */6516memset(&alu, 0, sizeof(struct r600_bytecode_alu));6517alu.op = ALU_OP3_CNDGE_INT;6518alu.is_op3 = 1;65196520if (tmp4 > 0) {6521alu.dst.sel = tmp4;6522alu.dst.chan = i;6523alu.dst.write = 1;6524} else {6525tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);6526}65276528r600_bytecode_src(&alu.src[0], &ctx->src[0], i);6529alu.src[1].sel = tmp0;6530alu.src[1].chan = 2;6531alu.src[2].sel = tmp0;6532alu.src[2].chan = 0;65336534alu.last = 1;6535if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6536return r;65376538} else {65396540/* tmp0.x = -tmp0.z */6541memset(&alu, 0, sizeof(struct r600_bytecode_alu));6542alu.op = ALU_OP2_SUB_INT;65436544alu.dst.sel = tmp0;6545alu.dst.chan = 0;6546alu.dst.write = 1;65476548alu.src[0].sel = V_SQ_ALU_SRC_0;6549alu.src[1].sel = tmp0;6550alu.src[1].chan = 2;65516552alu.last = 1;6553if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6554return r;65556556/* fix the quotient sign (same as the sign of src0*src1) */6557/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */6558memset(&alu, 0, sizeof(struct r600_bytecode_alu));6559alu.op = ALU_OP3_CNDGE_INT;6560alu.is_op3 = 1;65616562if (tmp4 > 0) {6563alu.dst.sel = tmp4;6564alu.dst.chan = i;6565alu.dst.write = 1;6566} else {6567tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);6568}65696570alu.src[0].sel = tmp2;6571alu.src[0].chan = 2;6572alu.src[1].sel = tmp0;6573alu.src[1].chan = 2;6574alu.src[2].sel = tmp0;6575alu.src[2].chan = 0;65766577alu.last = 1;6578if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6579return r;6580}6581}6582}65836584if (tmp4 > 0) {6585for (i = 0; i <= lasti; ++i) {6586if (!(write_mask & (1<<i)))6587continue;65886589memset(&alu, 0, sizeof(struct r600_bytecode_alu));6590alu.op = ALU_OP1_MOV;6591tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);6592alu.src[0].sel = tmp4;6593alu.src[0].chan = i;65946595if (i == lasti)6596alu.last = 1;6597if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))6598return r;6599}6600}66016602return 0;6603}66046605static int tgsi_udiv(struct r600_shader_ctx 
*ctx)6606{6607return tgsi_divmod(ctx, 0, 0);6608}66096610static int tgsi_umod(struct r600_shader_ctx *ctx)6611{6612return tgsi_divmod(ctx, 1, 0);6613}66146615static int tgsi_idiv(struct r600_shader_ctx *ctx)6616{6617return tgsi_divmod(ctx, 0, 1);6618}66196620static int tgsi_imod(struct r600_shader_ctx *ctx)6621{6622return tgsi_divmod(ctx, 1, 1);6623}662466256626static int tgsi_f2i(struct r600_shader_ctx *ctx)6627{6628struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;6629struct r600_bytecode_alu alu;6630int i, r;6631unsigned write_mask = inst->Dst[0].Register.WriteMask;6632int last_inst = tgsi_last_instruction(write_mask);66336634for (i = 0; i < 4; i++) {6635if (!(write_mask & (1<<i)))6636continue;66376638memset(&alu, 0, sizeof(struct r600_bytecode_alu));6639alu.op = ALU_OP1_TRUNC;66406641alu.dst.sel = ctx->temp_reg;6642alu.dst.chan = i;6643alu.dst.write = 1;66446645r600_bytecode_src(&alu.src[0], &ctx->src[0], i);6646if (i == last_inst)6647alu.last = 1;6648r = r600_bytecode_add_alu(ctx->bc, &alu);6649if (r)6650return r;6651}66526653for (i = 0; i < 4; i++) {6654if (!(write_mask & (1<<i)))6655continue;66566657memset(&alu, 0, sizeof(struct r600_bytecode_alu));6658alu.op = ctx->inst_info->op;66596660tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);66616662alu.src[0].sel = ctx->temp_reg;6663alu.src[0].chan = i;66646665if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)6666alu.last = 1;6667r = r600_bytecode_add_alu(ctx->bc, &alu);6668if (r)6669return r;6670}66716672return 0;6673}66746675static int tgsi_iabs(struct r600_shader_ctx *ctx)6676{6677struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;6678struct r600_bytecode_alu alu;6679int i, r;6680unsigned write_mask = inst->Dst[0].Register.WriteMask;6681int last_inst = tgsi_last_instruction(write_mask);66826683/* tmp = -src */6684for (i = 0; i < 4; i++) {6685if (!(write_mask & (1<<i)))6686continue;66876688memset(&alu, 0, sizeof(struct r600_bytecode_alu));6689alu.op = 
ALU_OP2_SUB_INT;66906691alu.dst.sel = ctx->temp_reg;6692alu.dst.chan = i;6693alu.dst.write = 1;66946695r600_bytecode_src(&alu.src[1], &ctx->src[0], i);6696alu.src[0].sel = V_SQ_ALU_SRC_0;66976698if (i == last_inst)6699alu.last = 1;6700r = r600_bytecode_add_alu(ctx->bc, &alu);6701if (r)6702return r;6703}67046705/* dst = (src >= 0 ? src : tmp) */6706for (i = 0; i < 4; i++) {6707if (!(write_mask & (1<<i)))6708continue;67096710memset(&alu, 0, sizeof(struct r600_bytecode_alu));6711alu.op = ALU_OP3_CNDGE_INT;6712alu.is_op3 = 1;6713alu.dst.write = 1;67146715tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);67166717r600_bytecode_src(&alu.src[0], &ctx->src[0], i);6718r600_bytecode_src(&alu.src[1], &ctx->src[0], i);6719alu.src[2].sel = ctx->temp_reg;6720alu.src[2].chan = i;67216722if (i == last_inst)6723alu.last = 1;6724r = r600_bytecode_add_alu(ctx->bc, &alu);6725if (r)6726return r;6727}6728return 0;6729}67306731static int tgsi_issg(struct r600_shader_ctx *ctx)6732{6733struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;6734struct r600_bytecode_alu alu;6735int i, r;6736unsigned write_mask = inst->Dst[0].Register.WriteMask;6737int last_inst = tgsi_last_instruction(write_mask);67386739/* tmp = (src >= 0 ? src : -1) */6740for (i = 0; i < 4; i++) {6741if (!(write_mask & (1<<i)))6742continue;67436744memset(&alu, 0, sizeof(struct r600_bytecode_alu));6745alu.op = ALU_OP3_CNDGE_INT;6746alu.is_op3 = 1;67476748alu.dst.sel = ctx->temp_reg;6749alu.dst.chan = i;6750alu.dst.write = 1;67516752r600_bytecode_src(&alu.src[0], &ctx->src[0], i);6753r600_bytecode_src(&alu.src[1], &ctx->src[0], i);6754alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;67556756if (i == last_inst)6757alu.last = 1;6758r = r600_bytecode_add_alu(ctx->bc, &alu);6759if (r)6760return r;6761}67626763/* dst = (tmp > 0 ? 
1 : tmp) */6764for (i = 0; i < 4; i++) {6765if (!(write_mask & (1<<i)))6766continue;67676768memset(&alu, 0, sizeof(struct r600_bytecode_alu));6769alu.op = ALU_OP3_CNDGT_INT;6770alu.is_op3 = 1;6771alu.dst.write = 1;67726773tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);67746775alu.src[0].sel = ctx->temp_reg;6776alu.src[0].chan = i;67776778alu.src[1].sel = V_SQ_ALU_SRC_1_INT;67796780alu.src[2].sel = ctx->temp_reg;6781alu.src[2].chan = i;67826783if (i == last_inst)6784alu.last = 1;6785r = r600_bytecode_add_alu(ctx->bc, &alu);6786if (r)6787return r;6788}6789return 0;6790}6791679267936794static int tgsi_ssg(struct r600_shader_ctx *ctx)6795{6796struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;6797unsigned write_mask = inst->Dst[0].Register.WriteMask;6798int last_inst = tgsi_last_instruction(write_mask);6799struct r600_bytecode_alu alu;6800int i, r;68016802/* tmp = (src > 0 ? 1 : src) */6803for (i = 0; i <= last_inst; i++) {6804if (!(write_mask & (1 << i)))6805continue;6806memset(&alu, 0, sizeof(struct r600_bytecode_alu));6807alu.op = ALU_OP3_CNDGT;6808alu.is_op3 = 1;68096810alu.dst.sel = ctx->temp_reg;6811alu.dst.chan = i;68126813r600_bytecode_src(&alu.src[0], &ctx->src[0], i);6814alu.src[1].sel = V_SQ_ALU_SRC_1;6815r600_bytecode_src(&alu.src[2], &ctx->src[0], i);68166817if (i == last_inst)6818alu.last = 1;6819r = r600_bytecode_add_alu(ctx->bc, &alu);6820if (r)6821return r;6822}68236824/* dst = (-tmp > 0 ? 
-1 : tmp) */6825for (i = 0; i <= last_inst; i++) {6826if (!(write_mask & (1 << i)))6827continue;6828memset(&alu, 0, sizeof(struct r600_bytecode_alu));6829alu.op = ALU_OP3_CNDGT;6830alu.is_op3 = 1;6831tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);68326833alu.src[0].sel = ctx->temp_reg;6834alu.src[0].chan = i;6835alu.src[0].neg = 1;68366837alu.src[1].sel = V_SQ_ALU_SRC_1;6838alu.src[1].neg = 1;68396840alu.src[2].sel = ctx->temp_reg;6841alu.src[2].chan = i;68426843if (i == last_inst)6844alu.last = 1;6845r = r600_bytecode_add_alu(ctx->bc, &alu);6846if (r)6847return r;6848}6849return 0;6850}68516852static int tgsi_bfi(struct r600_shader_ctx *ctx)6853{6854struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;6855struct r600_bytecode_alu alu;6856int i, r, t1, t2;68576858unsigned write_mask = inst->Dst[0].Register.WriteMask;6859int last_inst = tgsi_last_instruction(write_mask);68606861t1 = r600_get_temp(ctx);68626863for (i = 0; i < 4; i++) {6864if (!(write_mask & (1<<i)))6865continue;68666867memset(&alu, 0, sizeof(struct r600_bytecode_alu));6868alu.op = ALU_OP2_SETGE_INT;6869r600_bytecode_src(&alu.src[0], &ctx->src[3], i);6870alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;6871alu.src[1].value = 32;6872alu.dst.sel = ctx->temp_reg;6873alu.dst.chan = i;6874alu.dst.write = 1;6875alu.last = i == last_inst;6876r = r600_bytecode_add_alu(ctx->bc, &alu);6877if (r)6878return r;6879}68806881for (i = 0; i < 4; i++) {6882if (!(write_mask & (1<<i)))6883continue;68846885/* create mask tmp */6886memset(&alu, 0, sizeof(struct r600_bytecode_alu));6887alu.op = ALU_OP2_BFM_INT;6888alu.dst.sel = t1;6889alu.dst.chan = i;6890alu.dst.write = 1;6891alu.last = i == last_inst;68926893r600_bytecode_src(&alu.src[0], &ctx->src[3], i);6894r600_bytecode_src(&alu.src[1], &ctx->src[2], i);68956896r = r600_bytecode_add_alu(ctx->bc, &alu);6897if (r)6898return r;6899}69006901t2 = r600_get_temp(ctx);69026903for (i = 0; i < 4; i++) {6904if (!(write_mask & (1<<i)))6905continue;69066907/* shift insert 
left */6908memset(&alu, 0, sizeof(struct r600_bytecode_alu));6909alu.op = ALU_OP2_LSHL_INT;6910alu.dst.sel = t2;6911alu.dst.chan = i;6912alu.dst.write = 1;6913alu.last = i == last_inst;69146915r600_bytecode_src(&alu.src[0], &ctx->src[1], i);6916r600_bytecode_src(&alu.src[1], &ctx->src[2], i);69176918r = r600_bytecode_add_alu(ctx->bc, &alu);6919if (r)6920return r;6921}69226923for (i = 0; i < 4; i++) {6924if (!(write_mask & (1<<i)))6925continue;69266927/* actual bitfield insert */6928memset(&alu, 0, sizeof(struct r600_bytecode_alu));6929alu.op = ALU_OP3_BFI_INT;6930alu.is_op3 = 1;6931tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);6932alu.dst.chan = i;6933alu.dst.write = 1;6934alu.last = i == last_inst;69356936alu.src[0].sel = t1;6937alu.src[0].chan = i;6938alu.src[1].sel = t2;6939alu.src[1].chan = i;6940r600_bytecode_src(&alu.src[2], &ctx->src[0], i);69416942r = r600_bytecode_add_alu(ctx->bc, &alu);6943if (r)6944return r;6945}69466947for (i = 0; i < 4; i++) {6948if (!(write_mask & (1<<i)))6949continue;6950memset(&alu, 0, sizeof(struct r600_bytecode_alu));6951alu.op = ALU_OP3_CNDE_INT;6952alu.is_op3 = 1;6953alu.src[0].sel = ctx->temp_reg;6954alu.src[0].chan = i;6955r600_bytecode_src(&alu.src[2], &ctx->src[1], i);69566957tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);69586959alu.src[1].sel = alu.dst.sel;6960alu.src[1].chan = i;69616962alu.last = i == last_inst;6963r = r600_bytecode_add_alu(ctx->bc, &alu);6964if (r)6965return r;6966}6967return 0;6968}69696970static int tgsi_msb(struct r600_shader_ctx *ctx)6971{6972struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;6973struct r600_bytecode_alu alu;6974int i, r, t1, t2;69756976unsigned write_mask = inst->Dst[0].Register.WriteMask;6977int last_inst = tgsi_last_instruction(write_mask);69786979assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||6980ctx->inst_info->op == ALU_OP1_FFBH_UINT);69816982t1 = ctx->temp_reg;69836984/* bit position is indexed from lsb by TGSI, and from msb by the hardware */6985for (i = 0; i 
< 4; i++) {6986if (!(write_mask & (1<<i)))6987continue;69886989/* t1 = FFBH_INT / FFBH_UINT */6990memset(&alu, 0, sizeof(struct r600_bytecode_alu));6991alu.op = ctx->inst_info->op;6992alu.dst.sel = t1;6993alu.dst.chan = i;6994alu.dst.write = 1;6995alu.last = i == last_inst;69966997r600_bytecode_src(&alu.src[0], &ctx->src[0], i);69986999r = r600_bytecode_add_alu(ctx->bc, &alu);7000if (r)7001return r;7002}70037004t2 = r600_get_temp(ctx);70057006for (i = 0; i < 4; i++) {7007if (!(write_mask & (1<<i)))7008continue;70097010/* t2 = 31 - t1 */7011memset(&alu, 0, sizeof(struct r600_bytecode_alu));7012alu.op = ALU_OP2_SUB_INT;7013alu.dst.sel = t2;7014alu.dst.chan = i;7015alu.dst.write = 1;7016alu.last = i == last_inst;70177018alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;7019alu.src[0].value = 31;7020alu.src[1].sel = t1;7021alu.src[1].chan = i;70227023r = r600_bytecode_add_alu(ctx->bc, &alu);7024if (r)7025return r;7026}70277028for (i = 0; i < 4; i++) {7029if (!(write_mask & (1<<i)))7030continue;70317032/* result = t1 >= 0 ? 
t2 : t1 */7033memset(&alu, 0, sizeof(struct r600_bytecode_alu));7034alu.op = ALU_OP3_CNDGE_INT;7035alu.is_op3 = 1;7036tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);7037alu.dst.chan = i;7038alu.dst.write = 1;7039alu.last = i == last_inst;70407041alu.src[0].sel = t1;7042alu.src[0].chan = i;7043alu.src[1].sel = t2;7044alu.src[1].chan = i;7045alu.src[2].sel = t1;7046alu.src[2].chan = i;70477048r = r600_bytecode_add_alu(ctx->bc, &alu);7049if (r)7050return r;7051}70527053return 0;7054}70557056static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)7057{7058struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;7059struct r600_bytecode_alu alu;7060int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;7061unsigned location;7062const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;70637064assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);70657066/* Interpolators have been marked for use already by allocate_system_value_inputs */7067if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||7068inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {7069location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */7070}7071else {7072location = TGSI_INTERPOLATE_LOC_CENTROID;7073ctx->shader->input[input].uses_interpolate_at_centroid = 1;7074}70757076k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);7077if (k < 0)7078k = 0;7079interp_gpr = ctx->eg_interpolators[k].ij_index / 2;7080interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);70817082/* NOTE: currently offset is not perspective correct */7083if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||7084inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {7085int sample_gpr = -1;7086int gradientsH, gradientsV;7087struct r600_bytecode_tex tex;70887089if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {7090sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);7091}70927093gradientsH = 
r600_get_temp(ctx);7094gradientsV = r600_get_temp(ctx);7095for (i = 0; i < 2; i++) {7096memset(&tex, 0, sizeof(struct r600_bytecode_tex));7097tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;7098tex.src_gpr = interp_gpr;7099tex.src_sel_x = interp_base_chan + 0;7100tex.src_sel_y = interp_base_chan + 1;7101tex.src_sel_z = 0;7102tex.src_sel_w = 0;7103tex.dst_gpr = i == 0 ? gradientsH : gradientsV;7104tex.dst_sel_x = 0;7105tex.dst_sel_y = 1;7106tex.dst_sel_z = 7;7107tex.dst_sel_w = 7;7108tex.inst_mod = 1; // Use per pixel gradient calculation7109tex.sampler_id = 0;7110tex.resource_id = tex.sampler_id;7111r = r600_bytecode_add_tex(ctx->bc, &tex);7112if (r)7113return r;7114}71157116for (i = 0; i < 2; i++) {7117memset(&alu, 0, sizeof(struct r600_bytecode_alu));7118alu.op = ALU_OP3_MULADD;7119alu.is_op3 = 1;7120alu.src[0].sel = gradientsH;7121alu.src[0].chan = i;7122if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {7123alu.src[1].sel = sample_gpr;7124alu.src[1].chan = 2;7125}7126else {7127r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);7128}7129alu.src[2].sel = interp_gpr;7130alu.src[2].chan = interp_base_chan + i;7131alu.dst.sel = ctx->temp_reg;7132alu.dst.chan = i;7133alu.last = i == 1;71347135r = r600_bytecode_add_alu(ctx->bc, &alu);7136if (r)7137return r;7138}71397140for (i = 0; i < 2; i++) {7141memset(&alu, 0, sizeof(struct r600_bytecode_alu));7142alu.op = ALU_OP3_MULADD;7143alu.is_op3 = 1;7144alu.src[0].sel = gradientsV;7145alu.src[0].chan = i;7146if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {7147alu.src[1].sel = sample_gpr;7148alu.src[1].chan = 3;7149}7150else {7151r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);7152}7153alu.src[2].sel = ctx->temp_reg;7154alu.src[2].chan = i;7155alu.dst.sel = ctx->temp_reg;7156alu.dst.chan = i;7157alu.last = i == 1;71587159r = r600_bytecode_add_alu(ctx->bc, &alu);7160if (r)7161return r;7162}7163}71647165tmp = r600_get_temp(ctx);7166for (i = 0; i < 8; i++) {7167memset(&alu, 0, 
sizeof(struct r600_bytecode_alu));7168alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;71697170alu.dst.sel = tmp;7171if ((i > 1 && i < 6)) {7172alu.dst.write = 1;7173}7174else {7175alu.dst.write = 0;7176}7177alu.dst.chan = i % 4;71787179if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||7180inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {7181alu.src[0].sel = ctx->temp_reg;7182alu.src[0].chan = 1 - (i % 2);7183} else {7184alu.src[0].sel = interp_gpr;7185alu.src[0].chan = interp_base_chan + 1 - (i % 2);7186}7187alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;7188alu.src[1].chan = 0;71897190alu.last = i % 4 == 3;7191alu.bank_swizzle_force = SQ_ALU_VEC_210;71927193r = r600_bytecode_add_alu(ctx->bc, &alu);7194if (r)7195return r;7196}71977198// INTERP can't swizzle dst7199lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);7200for (i = 0; i <= lasti; i++) {7201if (!(inst->Dst[0].Register.WriteMask & (1 << i)))7202continue;72037204memset(&alu, 0, sizeof(struct r600_bytecode_alu));7205alu.op = ALU_OP1_MOV;7206alu.src[0].sel = tmp;7207alu.src[0].chan = ctx->src[0].swizzle[i];7208tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);7209alu.dst.write = 1;7210alu.last = i == lasti;7211r = r600_bytecode_add_alu(ctx->bc, &alu);7212if (r)7213return r;7214}72157216return 0;7217}721872197220static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)7221{7222struct r600_bytecode_alu alu;7223int i, r;72247225for (i = 0; i < 4; i++) {7226memset(&alu, 0, sizeof(struct r600_bytecode_alu));7227if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {7228alu.op = ALU_OP0_NOP;7229alu.dst.chan = i;7230} else {7231alu.op = ALU_OP1_MOV;7232tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);7233alu.src[0].sel = ctx->temp_reg;7234alu.src[0].chan = i;7235}7236if (i == 3) {7237alu.last = 1;7238}7239r = r600_bytecode_add_alu(ctx->bc, &alu);7240if (r)7241return r;7242}7243return 0;7244}72457246static int 
tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,7247unsigned writemask,7248struct r600_bytecode_alu_src *bc_src,7249const struct r600_shader_src *shader_src)7250{7251struct r600_bytecode_alu alu;7252int i, r;7253int lasti = tgsi_last_instruction(writemask);7254int temp_reg = 0;72557256r600_bytecode_src(&bc_src[0], shader_src, 0);7257r600_bytecode_src(&bc_src[1], shader_src, 1);7258r600_bytecode_src(&bc_src[2], shader_src, 2);7259r600_bytecode_src(&bc_src[3], shader_src, 3);72607261if (bc_src->abs) {7262temp_reg = r600_get_temp(ctx);72637264for (i = 0; i < lasti + 1; i++) {7265if (!(writemask & (1 << i)))7266continue;7267memset(&alu, 0, sizeof(struct r600_bytecode_alu));7268alu.op = ALU_OP1_MOV;7269alu.dst.sel = temp_reg;7270alu.dst.chan = i;7271alu.dst.write = 1;7272alu.src[0] = bc_src[i];7273if (i == lasti) {7274alu.last = 1;7275}7276r = r600_bytecode_add_alu(ctx->bc, &alu);7277if (r)7278return r;7279memset(&bc_src[i], 0, sizeof(*bc_src));7280bc_src[i].sel = temp_reg;7281bc_src[i].chan = i;7282}7283}7284return 0;7285}72867287static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)7288{7289struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;7290struct r600_bytecode_alu alu;7291struct r600_bytecode_alu_src srcs[4][4];7292int i, j, r;7293int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);7294unsigned op = ctx->inst_info->op;72957296if (op == ALU_OP3_MULADD_IEEE &&7297ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])7298op = ALU_OP3_MULADD;72997300for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {7301r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,7302srcs[j], &ctx->src[j]);7303if (r)7304return r;7305}73067307for (i = 0; i < lasti + 1; i++) {7308if (!(inst->Dst[0].Register.WriteMask & (1 << i)))7309continue;73107311memset(&alu, 0, sizeof(struct r600_bytecode_alu));7312alu.op = op;7313for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {7314alu.src[j] = srcs[j][i];7315}73167317if (dst == -1) 
{7318tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);7319} else {7320alu.dst.sel = dst;7321}7322alu.dst.chan = i;7323alu.dst.write = 1;7324alu.is_op3 = 1;7325if (i == lasti) {7326alu.last = 1;7327}7328r = r600_bytecode_add_alu(ctx->bc, &alu);7329if (r)7330return r;7331}7332return 0;7333}73347335static int tgsi_op3(struct r600_shader_ctx *ctx)7336{7337return tgsi_op3_dst(ctx, -1);7338}73397340static int tgsi_dp(struct r600_shader_ctx *ctx)7341{7342struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;7343struct r600_bytecode_alu alu;7344int i, j, r;7345unsigned op = ctx->inst_info->op;7346if (op == ALU_OP2_DOT4_IEEE &&7347ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])7348op = ALU_OP2_DOT4;73497350for (i = 0; i < 4; i++) {7351memset(&alu, 0, sizeof(struct r600_bytecode_alu));7352alu.op = op;7353for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {7354r600_bytecode_src(&alu.src[j], &ctx->src[j], i);7355}73567357tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);7358alu.dst.chan = i;7359alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;7360/* handle some special cases */7361switch (inst->Instruction.Opcode) {7362case TGSI_OPCODE_DP2:7363if (i > 1) {7364alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;7365alu.src[0].chan = alu.src[1].chan = 0;7366}7367break;7368case TGSI_OPCODE_DP3:7369if (i > 2) {7370alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;7371alu.src[0].chan = alu.src[1].chan = 0;7372}7373break;7374default:7375break;7376}7377if (i == 3) {7378alu.last = 1;7379}7380r = r600_bytecode_add_alu(ctx->bc, &alu);7381if (r)7382return r;7383}7384return 0;7385}73867387static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,7388unsigned index)7389{7390struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;7391return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&7392inst->Src[index].Register.File != TGSI_FILE_INPUT &&7393inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||7394ctx->src[index].neg || 
ctx->src[index].abs ||7395(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);7396}73977398static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,7399unsigned index)7400{7401struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;7402return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;7403}74047405static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)7406{7407struct r600_bytecode_vtx vtx;7408struct r600_bytecode_alu alu;7409struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;7410int src_gpr, r, i;7411int id = tgsi_tex_get_src_gpr(ctx, 1);7412int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE74137414src_gpr = tgsi_tex_get_src_gpr(ctx, 0);7415if (src_requires_loading) {7416for (i = 0; i < 4; i++) {7417memset(&alu, 0, sizeof(struct r600_bytecode_alu));7418alu.op = ALU_OP1_MOV;7419r600_bytecode_src(&alu.src[0], &ctx->src[0], i);7420alu.dst.sel = ctx->temp_reg;7421alu.dst.chan = i;7422if (i == 3)7423alu.last = 1;7424alu.dst.write = 1;7425r = r600_bytecode_add_alu(ctx->bc, &alu);7426if (r)7427return r;7428}7429src_gpr = ctx->temp_reg;7430}74317432memset(&vtx, 0, sizeof(vtx));7433vtx.op = FETCH_OP_VFETCH;7434vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;7435vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;7436vtx.src_gpr = src_gpr;7437vtx.mega_fetch_count = 16;7438vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;7439vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */7440vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */7441vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */7442vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
3 : 7; /* SEL_W */7443vtx.use_const_fields = 1;7444vtx.buffer_index_mode = sampler_index_mode;74457446if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))7447return r;74487449if (ctx->bc->chip_class >= EVERGREEN)7450return 0;74517452for (i = 0; i < 4; i++) {7453int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);7454if (!(inst->Dst[0].Register.WriteMask & (1 << i)))7455continue;74567457memset(&alu, 0, sizeof(struct r600_bytecode_alu));7458alu.op = ALU_OP2_AND_INT;74597460alu.dst.chan = i;7461alu.dst.sel = vtx.dst_gpr;7462alu.dst.write = 1;74637464alu.src[0].sel = vtx.dst_gpr;7465alu.src[0].chan = i;74667467alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;7468alu.src[1].sel += (id * 2);7469alu.src[1].chan = i % 4;7470alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;74717472if (i == lasti)7473alu.last = 1;7474r = r600_bytecode_add_alu(ctx->bc, &alu);7475if (r)7476return r;7477}74787479if (inst->Dst[0].Register.WriteMask & 3) {7480memset(&alu, 0, sizeof(struct r600_bytecode_alu));7481alu.op = ALU_OP2_OR_INT;74827483alu.dst.chan = 3;7484alu.dst.sel = vtx.dst_gpr;7485alu.dst.write = 1;74867487alu.src[0].sel = vtx.dst_gpr;7488alu.src[0].chan = 3;74897490alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;7491alu.src[1].chan = 0;7492alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;74937494alu.last = 1;7495r = r600_bytecode_add_alu(ctx->bc, &alu);7496if (r)7497return r;7498}7499return 0;7500}75017502static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base)7503{7504struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;7505int r;7506int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;7507int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE75087509if (ctx->bc->chip_class < EVERGREEN) {7510struct r600_bytecode_alu alu;7511memset(&alu, 0, sizeof(struct r600_bytecode_alu));7512alu.op = ALU_OP1_MOV;7513alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;7514/* r600 we have them at channel 2 of the second dword */7515alu.src[0].sel += (id * 2) + 1;7516alu.src[0].chan = 1;7517alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;7518tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);7519alu.last = 1;7520r = r600_bytecode_add_alu(ctx->bc, &alu);7521if (r)7522return r;7523return 0;7524} else {7525struct r600_bytecode_vtx vtx;7526memset(&vtx, 0, sizeof(vtx));7527vtx.op = FETCH_OP_GET_BUFFER_RESINFO;7528vtx.buffer_id = id + eg_buffer_base;7529vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;7530vtx.src_gpr = 0;7531vtx.mega_fetch_count = 16; /* no idea here really... */7532vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;7533vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */7534vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7; /* SEL_Y */7535vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7; /* SEL_Z */7536vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
4 : 7; /* SEL_W */7537vtx.data_format = FMT_32_32_32_32;7538vtx.buffer_index_mode = sampler_index_mode;75397540if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))7541return r;7542return 0;7543}7544}754575467547static int tgsi_tex(struct r600_shader_ctx *ctx)7548{7549struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;7550struct r600_bytecode_tex tex;7551struct r600_bytecode_tex grad_offs[3];7552struct r600_bytecode_alu alu;7553unsigned src_gpr;7554int r, i, j, n_grad_offs = 0;7555int opcode;7556bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&7557inst->Instruction.Opcode == TGSI_OPCODE_TXF &&7558(inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||7559inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);75607561bool txf_add_offsets = inst->Texture.NumOffsets &&7562inst->Instruction.Opcode == TGSI_OPCODE_TXF &&7563inst->Texture.Texture != TGSI_TEXTURE_BUFFER;75647565/* Texture fetch instructions can only use gprs as source.7566* Also they cannot negate the source or take the absolute value */7567const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&7568tgsi_tex_src_requires_loading(ctx, 0)) ||7569read_compressed_msaa || txf_add_offsets;75707571boolean src_loaded = FALSE;7572unsigned sampler_src_reg = 1;7573int8_t offset_x = 0, offset_y = 0, offset_z = 0;7574boolean has_txq_cube_array_z = false;7575unsigned sampler_index_mode;7576int array_index_offset_channel = -1;75777578if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&7579((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||7580inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))7581if (inst->Dst[0].Register.WriteMask & 4) {7582ctx->shader->has_txq_cube_array_z_comp = true;7583has_txq_cube_array_z = true;7584}75857586if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||7587inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||7588inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||7589inst->Instruction.Opcode == TGSI_OPCODE_TG4)7590sampler_src_reg = 
2;75917592/* TGSI moves the sampler to src reg 3 for TXD */7593if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)7594sampler_src_reg = 3;75957596sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE75977598src_gpr = tgsi_tex_get_src_gpr(ctx, 0);75997600if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {7601if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {7602if (ctx->bc->chip_class < EVERGREEN)7603ctx->shader->uses_tex_buffers = true;7604return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS);7605}7606else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {7607if (ctx->bc->chip_class < EVERGREEN)7608ctx->shader->uses_tex_buffers = true;7609return do_vtx_fetch_inst(ctx, src_requires_loading);7610}7611}76127613if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {7614int out_chan;7615/* Add perspective divide */7616if (ctx->bc->chip_class == CAYMAN) {7617out_chan = 2;7618for (i = 0; i < 3; i++) {7619memset(&alu, 0, sizeof(struct r600_bytecode_alu));7620alu.op = ALU_OP1_RECIP_IEEE;7621r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);76227623alu.dst.sel = ctx->temp_reg;7624alu.dst.chan = i;7625if (i == 2)7626alu.last = 1;7627if (out_chan == i)7628alu.dst.write = 1;7629r = r600_bytecode_add_alu(ctx->bc, &alu);7630if (r)7631return r;7632}76337634} else {7635out_chan = 3;7636memset(&alu, 0, sizeof(struct r600_bytecode_alu));7637alu.op = ALU_OP1_RECIP_IEEE;7638r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);76397640alu.dst.sel = ctx->temp_reg;7641alu.dst.chan = out_chan;7642alu.last = 1;7643alu.dst.write = 1;7644r = r600_bytecode_add_alu(ctx->bc, &alu);7645if (r)7646return r;7647}76487649for (i = 0; i < 3; i++) {7650memset(&alu, 0, sizeof(struct r600_bytecode_alu));7651alu.op = ALU_OP2_MUL;7652alu.src[0].sel = ctx->temp_reg;7653alu.src[0].chan = out_chan;7654r600_bytecode_src(&alu.src[1], &ctx->src[0], i);7655alu.dst.sel = ctx->temp_reg;7656alu.dst.chan = i;7657alu.dst.write = 1;7658r = r600_bytecode_add_alu(ctx->bc, 
&alu);7659if (r)7660return r;7661}7662memset(&alu, 0, sizeof(struct r600_bytecode_alu));7663alu.op = ALU_OP1_MOV;7664alu.src[0].sel = V_SQ_ALU_SRC_1;7665alu.src[0].chan = 0;7666alu.dst.sel = ctx->temp_reg;7667alu.dst.chan = 3;7668alu.last = 1;7669alu.dst.write = 1;7670r = r600_bytecode_add_alu(ctx->bc, &alu);7671if (r)7672return r;7673src_loaded = TRUE;7674src_gpr = ctx->temp_reg;7675}767676777678if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||7679inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||7680inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||7681inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&7682inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {76837684static const unsigned src0_swizzle[] = {2, 2, 0, 1};7685static const unsigned src1_swizzle[] = {1, 0, 2, 2};76867687/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */7688for (i = 0; i < 4; i++) {7689memset(&alu, 0, sizeof(struct r600_bytecode_alu));7690alu.op = ALU_OP2_CUBE;7691r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);7692r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);7693alu.dst.sel = ctx->temp_reg;7694alu.dst.chan = i;7695if (i == 3)7696alu.last = 1;7697alu.dst.write = 1;7698r = r600_bytecode_add_alu(ctx->bc, &alu);7699if (r)7700return r;7701}77027703/* tmp1.z = RCP_e(|tmp1.z|) */7704if (ctx->bc->chip_class == CAYMAN) {7705for (i = 0; i < 3; i++) {7706memset(&alu, 0, sizeof(struct r600_bytecode_alu));7707alu.op = ALU_OP1_RECIP_IEEE;7708alu.src[0].sel = ctx->temp_reg;7709alu.src[0].chan = 2;7710alu.src[0].abs = 1;7711alu.dst.sel = ctx->temp_reg;7712alu.dst.chan = i;7713if (i == 2)7714alu.dst.write = 1;7715if (i == 2)7716alu.last = 1;7717r = r600_bytecode_add_alu(ctx->bc, &alu);7718if (r)7719return r;7720}7721} else {7722memset(&alu, 0, sizeof(struct r600_bytecode_alu));7723alu.op = ALU_OP1_RECIP_IEEE;7724alu.src[0].sel = ctx->temp_reg;7725alu.src[0].chan = 2;7726alu.src[0].abs = 1;7727alu.dst.sel = ctx->temp_reg;7728alu.dst.chan = 2;7729alu.dst.write = 1;7730alu.last = 
1;7731r = r600_bytecode_add_alu(ctx->bc, &alu);7732if (r)7733return r;7734}77357736/* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x7737* MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x7738* muladd has no writemask, have to use another temp7739*/7740memset(&alu, 0, sizeof(struct r600_bytecode_alu));7741alu.op = ALU_OP3_MULADD;7742alu.is_op3 = 1;77437744alu.src[0].sel = ctx->temp_reg;7745alu.src[0].chan = 0;7746alu.src[1].sel = ctx->temp_reg;7747alu.src[1].chan = 2;77487749alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;7750alu.src[2].chan = 0;7751alu.src[2].value = u_bitcast_f2u(1.5f);77527753alu.dst.sel = ctx->temp_reg;7754alu.dst.chan = 0;7755alu.dst.write = 1;77567757r = r600_bytecode_add_alu(ctx->bc, &alu);7758if (r)7759return r;77607761memset(&alu, 0, sizeof(struct r600_bytecode_alu));7762alu.op = ALU_OP3_MULADD;7763alu.is_op3 = 1;77647765alu.src[0].sel = ctx->temp_reg;7766alu.src[0].chan = 1;7767alu.src[1].sel = ctx->temp_reg;7768alu.src[1].chan = 2;77697770alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;7771alu.src[2].chan = 0;7772alu.src[2].value = u_bitcast_f2u(1.5f);77737774alu.dst.sel = ctx->temp_reg;7775alu.dst.chan = 1;7776alu.dst.write = 1;77777778alu.last = 1;7779r = r600_bytecode_add_alu(ctx->bc, &alu);7780if (r)7781return r;7782/* write initial compare value into Z component7783- W src 0 for shadow cube7784- X src 1 for shadow cube array */7785if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||7786inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {7787memset(&alu, 0, sizeof(struct r600_bytecode_alu));7788alu.op = ALU_OP1_MOV;7789if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)7790r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);7791else7792r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);7793alu.dst.sel = ctx->temp_reg;7794alu.dst.chan = 2;7795alu.dst.write = 1;7796alu.last = 1;7797r = r600_bytecode_add_alu(ctx->bc, &alu);7798if (r)7799return r;7800}78017802if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||7803inst->Texture.Texture == 
TGSI_TEXTURE_SHADOWCUBE_ARRAY) {7804if (ctx->bc->chip_class >= EVERGREEN) {7805int mytmp = r600_get_temp(ctx);7806memset(&alu, 0, sizeof(struct r600_bytecode_alu));7807alu.op = ALU_OP1_MOV;7808alu.src[0].sel = ctx->temp_reg;7809alu.src[0].chan = 3;7810alu.dst.sel = mytmp;7811alu.dst.chan = 0;7812alu.dst.write = 1;7813alu.last = 1;7814r = r600_bytecode_add_alu(ctx->bc, &alu);7815if (r)7816return r;78177818/* Evaluate the array index according to floor(idx + 0.5). This7819* needs to be done before merging the face select value, because7820* otherwise the fractional part of the array index will interfere7821* with the face select value */7822memset(&alu, 0, sizeof(struct r600_bytecode_alu));7823r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);7824alu.op = ALU_OP1_RNDNE;7825alu.dst.sel = ctx->temp_reg;7826alu.dst.chan = 3;7827alu.dst.write = 1;7828alu.last = 1;7829r = r600_bytecode_add_alu(ctx->bc, &alu);7830if (r)7831return r;78327833/* Because the array slice index and the cube face index are merged7834* into one value we have to make sure the array slice index is >= 0,7835* otherwise the face selection will fail */7836memset(&alu, 0, sizeof(struct r600_bytecode_alu));7837alu.op = ALU_OP2_MAX;7838alu.src[0].sel = ctx->temp_reg;7839alu.src[0].chan = 3;7840alu.src[1].sel = V_SQ_ALU_SRC_0;7841alu.dst.sel = ctx->temp_reg;7842alu.dst.chan = 3;7843alu.dst.write = 1;7844alu.last = 1;7845r = r600_bytecode_add_alu(ctx->bc, &alu);7846if (r)7847return r;78487849/* have to multiply original layer by 8 and add to face id (temp.w) in Z */7850memset(&alu, 0, sizeof(struct r600_bytecode_alu));7851alu.op = ALU_OP3_MULADD;7852alu.is_op3 = 1;7853alu.src[0].sel = ctx->temp_reg;7854alu.src[0].chan = 3;7855alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;7856alu.src[1].chan = 0;7857alu.src[1].value = u_bitcast_f2u(8.0f);7858alu.src[2].sel = mytmp;7859alu.src[2].chan = 0;7860alu.dst.sel = ctx->temp_reg;7861alu.dst.chan = 3;7862alu.dst.write = 1;7863alu.last = 1;7864r = r600_bytecode_add_alu(ctx->bc, 
&alu);7865if (r)7866return r;7867} else if (ctx->bc->chip_class < EVERGREEN) {7868memset(&tex, 0, sizeof(struct r600_bytecode_tex));7869tex.op = FETCH_OP_SET_CUBEMAP_INDEX;7870tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);7871tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;7872tex.src_gpr = r600_get_temp(ctx);7873tex.src_sel_x = 0;7874tex.src_sel_y = 0;7875tex.src_sel_z = 0;7876tex.src_sel_w = 0;7877tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;7878tex.coord_type_x = 1;7879tex.coord_type_y = 1;7880tex.coord_type_z = 1;7881tex.coord_type_w = 1;7882memset(&alu, 0, sizeof(struct r600_bytecode_alu));7883alu.op = ALU_OP1_MOV;7884r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);7885alu.dst.sel = tex.src_gpr;7886alu.dst.chan = 0;7887alu.last = 1;7888alu.dst.write = 1;7889r = r600_bytecode_add_alu(ctx->bc, &alu);7890if (r)7891return r;78927893r = r600_bytecode_add_tex(ctx->bc, &tex);7894if (r)7895return r;7896}78977898}78997900/* for cube forms of lod and bias we need to route things */7901if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||7902inst->Instruction.Opcode == TGSI_OPCODE_TXL ||7903inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||7904inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {7905memset(&alu, 0, sizeof(struct r600_bytecode_alu));7906alu.op = ALU_OP1_MOV;7907if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||7908inst->Instruction.Opcode == TGSI_OPCODE_TXL2)7909r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);7910else7911r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);7912alu.dst.sel = ctx->temp_reg;7913alu.dst.chan = 2;7914alu.last = 1;7915alu.dst.write = 1;7916r = r600_bytecode_add_alu(ctx->bc, &alu);7917if (r)7918return r;7919}79207921src_loaded = TRUE;7922src_gpr = ctx->temp_reg;7923}79247925if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {7926int temp_h = 0, temp_v = 0;7927int start_val = 0;79287929/* if we've already loaded the src (i.e. CUBE don't reload it). 
*/7930if (src_loaded == TRUE)7931start_val = 1;7932else7933src_loaded = TRUE;7934for (i = start_val; i < 3; i++) {7935int treg = r600_get_temp(ctx);79367937if (i == 0)7938src_gpr = treg;7939else if (i == 1)7940temp_h = treg;7941else7942temp_v = treg;79437944for (j = 0; j < 4; j++) {7945memset(&alu, 0, sizeof(struct r600_bytecode_alu));7946alu.op = ALU_OP1_MOV;7947r600_bytecode_src(&alu.src[0], &ctx->src[i], j);7948alu.dst.sel = treg;7949alu.dst.chan = j;7950if (j == 3)7951alu.last = 1;7952alu.dst.write = 1;7953r = r600_bytecode_add_alu(ctx->bc, &alu);7954if (r)7955return r;7956}7957}7958for (i = 1; i < 3; i++) {7959/* set gradients h/v */7960struct r600_bytecode_tex *t = &grad_offs[n_grad_offs++];7961memset(t, 0, sizeof(struct r600_bytecode_tex));7962t->op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :7963FETCH_OP_SET_GRADIENTS_V;7964t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);7965t->sampler_index_mode = sampler_index_mode;7966t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;7967t->resource_index_mode = sampler_index_mode;79687969t->src_gpr = (i == 1) ? 
temp_h : temp_v;7970t->src_sel_x = 0;7971t->src_sel_y = 1;7972t->src_sel_z = 2;7973t->src_sel_w = 3;79747975t->dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */7976t->dst_sel_x = t->dst_sel_y = t->dst_sel_z = t->dst_sel_w = 7;7977if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {7978t->coord_type_x = 1;7979t->coord_type_y = 1;7980t->coord_type_z = 1;7981t->coord_type_w = 1;7982}7983}7984}79857986if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {7987/* Gather4 should follow the same rules as bilinear filtering, but the hardware7988* incorrectly forces nearest filtering if the texture format is integer.7989* The only effect it has on Gather4, which always returns 4 texels for7990* bilinear filtering, is that the final coordinates are off by 0.5 of7991* the texel size.7992*7993* The workaround is to subtract 0.5 from the unnormalized coordinates,7994* or (0.5 / size) from the normalized coordinates.7995*/7996if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||7997inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {7998int treg = r600_get_temp(ctx);79998000/* mov array and comparison oordinate to temp_reg if needed */8001if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||8002inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||8003inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) {8004int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 
3 : 2;8005for (i = 2; i <= end; i++) {8006memset(&alu, 0, sizeof(struct r600_bytecode_alu));8007alu.op = ALU_OP1_MOV;8008alu.dst.sel = ctx->temp_reg;8009alu.dst.chan = i;8010alu.dst.write = 1;8011alu.last = (i == end);8012r600_bytecode_src(&alu.src[0], &ctx->src[0], i);8013r = r600_bytecode_add_alu(ctx->bc, &alu);8014if (r)8015return r;8016}8017}80188019if (inst->Texture.Texture == TGSI_TEXTURE_RECT ||8020inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {8021for (i = 0; i < 2; i++) {8022memset(&alu, 0, sizeof(struct r600_bytecode_alu));8023alu.op = ALU_OP2_ADD;8024alu.dst.sel = ctx->temp_reg;8025alu.dst.chan = i;8026alu.dst.write = 1;8027alu.last = i == 1;8028if (src_loaded) {8029alu.src[0].sel = ctx->temp_reg;8030alu.src[0].chan = i;8031} else8032r600_bytecode_src(&alu.src[0], &ctx->src[0], i);8033alu.src[1].sel = V_SQ_ALU_SRC_0_5;8034alu.src[1].neg = 1;8035r = r600_bytecode_add_alu(ctx->bc, &alu);8036if (r)8037return r;8038}8039} else {8040/* execute a TXQ */8041memset(&tex, 0, sizeof(struct r600_bytecode_tex));8042tex.op = FETCH_OP_GET_TEXTURE_RESINFO;8043tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);8044tex.sampler_index_mode = sampler_index_mode;8045tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;8046tex.resource_index_mode = sampler_index_mode;8047tex.dst_gpr = treg;8048tex.src_sel_x = 4;8049tex.src_sel_y = 4;8050tex.src_sel_z = 4;8051tex.src_sel_w = 4;8052tex.dst_sel_x = 0;8053tex.dst_sel_y = 1;8054tex.dst_sel_z = 7;8055tex.dst_sel_w = 7;8056r = r600_bytecode_add_tex(ctx->bc, &tex);8057if (r)8058return r;80598060/* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */8061if (ctx->bc->chip_class == CAYMAN) {8062/* */8063for (i = 0; i < 2; i++) {8064memset(&alu, 0, sizeof(struct r600_bytecode_alu));8065alu.op = ALU_OP1_INT_TO_FLT;8066alu.dst.sel = treg;8067alu.dst.chan = i;8068alu.dst.write = 1;8069alu.src[0].sel = treg;8070alu.src[0].chan = i;8071alu.last = (i == 1) ? 
1 : 0;8072r = r600_bytecode_add_alu(ctx->bc, &alu);8073if (r)8074return r;8075}8076for (j = 0; j < 2; j++) {8077for (i = 0; i < 3; i++) {8078memset(&alu, 0, sizeof(struct r600_bytecode_alu));8079alu.op = ALU_OP1_RECIP_IEEE;8080alu.src[0].sel = treg;8081alu.src[0].chan = j;8082alu.dst.sel = treg;8083alu.dst.chan = i;8084if (i == 2)8085alu.last = 1;8086if (i == j)8087alu.dst.write = 1;8088r = r600_bytecode_add_alu(ctx->bc, &alu);8089if (r)8090return r;8091}8092}8093} else {8094for (i = 0; i < 2; i++) {8095memset(&alu, 0, sizeof(struct r600_bytecode_alu));8096alu.op = ALU_OP1_INT_TO_FLT;8097alu.dst.sel = treg;8098alu.dst.chan = i;8099alu.dst.write = 1;8100alu.src[0].sel = treg;8101alu.src[0].chan = i;8102alu.last = 1;8103r = r600_bytecode_add_alu(ctx->bc, &alu);8104if (r)8105return r;8106}8107for (i = 0; i < 2; i++) {8108memset(&alu, 0, sizeof(struct r600_bytecode_alu));8109alu.op = ALU_OP1_RECIP_IEEE;8110alu.src[0].sel = treg;8111alu.src[0].chan = i;8112alu.dst.sel = treg;8113alu.dst.chan = i;8114alu.last = 1;8115alu.dst.write = 1;8116r = r600_bytecode_add_alu(ctx->bc, &alu);8117if (r)8118return r;8119}8120}8121for (i = 0; i < 2; i++) {8122memset(&alu, 0, sizeof(struct r600_bytecode_alu));8123alu.op = ALU_OP3_MULADD;8124alu.is_op3 = 1;8125alu.dst.sel = ctx->temp_reg;8126alu.dst.chan = i;8127alu.dst.write = 1;8128alu.last = i == 1;8129alu.src[0].sel = treg;8130alu.src[0].chan = i;8131alu.src[1].sel = V_SQ_ALU_SRC_0_5;8132alu.src[1].neg = 1;8133if (src_loaded) {8134alu.src[2].sel = ctx->temp_reg;8135alu.src[2].chan = i;8136} else8137r600_bytecode_src(&alu.src[2], &ctx->src[0], i);8138r = r600_bytecode_add_alu(ctx->bc, &alu);8139if (r)8140return r;8141}8142}8143src_loaded = TRUE;8144src_gpr = ctx->temp_reg;8145}8146}81478148if (src_requires_loading && !src_loaded) {8149for (i = 0; i < 4; i++) {8150memset(&alu, 0, sizeof(struct r600_bytecode_alu));8151alu.op = ALU_OP1_MOV;8152r600_bytecode_src(&alu.src[0], &ctx->src[0], i);8153alu.dst.sel = ctx->temp_reg;8154alu.dst.chan 
= i;8155if (i == 3)8156alu.last = 1;8157alu.dst.write = 1;8158r = r600_bytecode_add_alu(ctx->bc, &alu);8159if (r)8160return r;8161}8162src_loaded = TRUE;8163src_gpr = ctx->temp_reg;8164}81658166/* get offset values */8167if (inst->Texture.NumOffsets) {8168assert(inst->Texture.NumOffsets == 1);81698170/* The texture offset feature doesn't work with the TXF instruction8171* and must be emulated by adding the offset to the texture coordinates. */8172if (txf_add_offsets) {8173const struct tgsi_texture_offset *off = inst->TexOffsets;81748175switch (inst->Texture.Texture) {8176case TGSI_TEXTURE_3D:8177memset(&alu, 0, sizeof(struct r600_bytecode_alu));8178alu.op = ALU_OP2_ADD_INT;8179alu.src[0].sel = src_gpr;8180alu.src[0].chan = 2;8181alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;8182alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];8183alu.dst.sel = src_gpr;8184alu.dst.chan = 2;8185alu.dst.write = 1;8186alu.last = 1;8187r = r600_bytecode_add_alu(ctx->bc, &alu);8188if (r)8189return r;8190FALLTHROUGH;81918192case TGSI_TEXTURE_2D:8193case TGSI_TEXTURE_SHADOW2D:8194case TGSI_TEXTURE_RECT:8195case TGSI_TEXTURE_SHADOWRECT:8196case TGSI_TEXTURE_2D_ARRAY:8197case TGSI_TEXTURE_SHADOW2D_ARRAY:8198memset(&alu, 0, sizeof(struct r600_bytecode_alu));8199alu.op = ALU_OP2_ADD_INT;8200alu.src[0].sel = src_gpr;8201alu.src[0].chan = 1;8202alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;8203alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];8204alu.dst.sel = src_gpr;8205alu.dst.chan = 1;8206alu.dst.write = 1;8207alu.last = 1;8208r = r600_bytecode_add_alu(ctx->bc, &alu);8209if (r)8210return r;8211FALLTHROUGH;82128213case TGSI_TEXTURE_1D:8214case TGSI_TEXTURE_SHADOW1D:8215case TGSI_TEXTURE_1D_ARRAY:8216case TGSI_TEXTURE_SHADOW1D_ARRAY:8217memset(&alu, 0, sizeof(struct r600_bytecode_alu));8218alu.op = ALU_OP2_ADD_INT;8219alu.src[0].sel = src_gpr;8220alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;8221alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];8222alu.dst.sel 
= src_gpr;8223alu.dst.write = 1;8224alu.last = 1;8225r = r600_bytecode_add_alu(ctx->bc, &alu);8226if (r)8227return r;8228break;8229/* texture offsets do not apply to other texture targets */8230}8231} else {8232switch (inst->Texture.Texture) {8233case TGSI_TEXTURE_3D:8234offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;8235FALLTHROUGH;8236case TGSI_TEXTURE_2D:8237case TGSI_TEXTURE_SHADOW2D:8238case TGSI_TEXTURE_RECT:8239case TGSI_TEXTURE_SHADOWRECT:8240case TGSI_TEXTURE_2D_ARRAY:8241case TGSI_TEXTURE_SHADOW2D_ARRAY:8242offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;8243FALLTHROUGH;8244case TGSI_TEXTURE_1D:8245case TGSI_TEXTURE_SHADOW1D:8246case TGSI_TEXTURE_1D_ARRAY:8247case TGSI_TEXTURE_SHADOW1D_ARRAY:8248offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;8249}8250}8251}82528253/* Obtain the sample index for reading a compressed MSAA color texture.8254* To read the FMASK, we use the ldfptr instruction, which tells us8255* where the samples are stored.8256* For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,8257* which is the identity mapping. Each nibble says which physical sample8258* should be fetched to get that sample.8259*8260* Assume src.z contains the sample index. 
It should be modified like this:8261* src.z = (ldfptr() >> (src.z * 4)) & 0xF;8262* Then fetch the texel with src.8263*/8264if (read_compressed_msaa) {8265unsigned sample_chan = 3;8266unsigned temp = r600_get_temp(ctx);8267assert(src_loaded);82688269/* temp.w = ldfptr() */8270memset(&tex, 0, sizeof(struct r600_bytecode_tex));8271tex.op = FETCH_OP_LD;8272tex.inst_mod = 1; /* to indicate this is ldfptr */8273tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);8274tex.sampler_index_mode = sampler_index_mode;8275tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;8276tex.resource_index_mode = sampler_index_mode;8277tex.src_gpr = src_gpr;8278tex.dst_gpr = temp;8279tex.dst_sel_x = 7; /* mask out these components */8280tex.dst_sel_y = 7;8281tex.dst_sel_z = 7;8282tex.dst_sel_w = 0; /* store X */8283tex.src_sel_x = 0;8284tex.src_sel_y = 1;8285tex.src_sel_z = 2;8286tex.src_sel_w = 3;8287tex.offset_x = offset_x;8288tex.offset_y = offset_y;8289tex.offset_z = offset_z;8290r = r600_bytecode_add_tex(ctx->bc, &tex);8291if (r)8292return r;82938294/* temp.x = sample_index*4 */8295memset(&alu, 0, sizeof(struct r600_bytecode_alu));8296alu.op = ALU_OP2_MULLO_INT;8297alu.src[0].sel = src_gpr;8298alu.src[0].chan = sample_chan;8299alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;8300alu.src[1].value = 4;8301alu.dst.sel = temp;8302alu.dst.chan = 0;8303alu.dst.write = 1;8304r = emit_mul_int_op(ctx->bc, &alu);8305if (r)8306return r;83078308/* sample_index = temp.w >> temp.x */8309memset(&alu, 0, sizeof(struct r600_bytecode_alu));8310alu.op = ALU_OP2_LSHR_INT;8311alu.src[0].sel = temp;8312alu.src[0].chan = 3;8313alu.src[1].sel = temp;8314alu.src[1].chan = 0;8315alu.dst.sel = src_gpr;8316alu.dst.chan = sample_chan;8317alu.dst.write = 1;8318alu.last = 1;8319r = r600_bytecode_add_alu(ctx->bc, &alu);8320if (r)8321return r;83228323/* sample_index & 0xF */8324memset(&alu, 0, sizeof(struct r600_bytecode_alu));8325alu.op = ALU_OP2_AND_INT;8326alu.src[0].sel = src_gpr;8327alu.src[0].chan = 
sample_chan;8328alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;8329alu.src[1].value = 0xF;8330alu.dst.sel = src_gpr;8331alu.dst.chan = sample_chan;8332alu.dst.write = 1;8333alu.last = 1;8334r = r600_bytecode_add_alu(ctx->bc, &alu);8335if (r)8336return r;8337#if 08338/* visualize the FMASK */8339for (i = 0; i < 4; i++) {8340memset(&alu, 0, sizeof(struct r600_bytecode_alu));8341alu.op = ALU_OP1_INT_TO_FLT;8342alu.src[0].sel = src_gpr;8343alu.src[0].chan = sample_chan;8344alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;8345alu.dst.chan = i;8346alu.dst.write = 1;8347alu.last = 1;8348r = r600_bytecode_add_alu(ctx->bc, &alu);8349if (r)8350return r;8351}8352return 0;8353#endif8354}83558356/* does this shader want a num layers from TXQ for a cube array? */8357if (has_txq_cube_array_z) {8358int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);83598360memset(&alu, 0, sizeof(struct r600_bytecode_alu));8361alu.op = ALU_OP1_MOV;83628363alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;8364if (ctx->bc->chip_class >= EVERGREEN) {8365/* with eg each dword is number of cubes */8366alu.src[0].sel += id / 4;8367alu.src[0].chan = id % 4;8368} else {8369/* r600 we have them at channel 2 of the second dword */8370alu.src[0].sel += (id * 2) + 1;8371alu.src[0].chan = 2;8372}8373alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;8374tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);8375alu.last = 1;8376r = r600_bytecode_add_alu(ctx->bc, &alu);8377if (r)8378return r;8379/* disable writemask from texture instruction */8380inst->Dst[0].Register.WriteMask &= ~4;8381}83828383opcode = ctx->inst_info->op;8384if (opcode == FETCH_OP_GATHER4 &&8385inst->TexOffsets[0].File != TGSI_FILE_NULL &&8386inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {8387struct r600_bytecode_tex *t;8388opcode = FETCH_OP_GATHER4_O;83898390/* GATHER4_O/GATHER4_C_O use offset values loaded by8391SET_TEXTURE_OFFSETS instruction. The immediate offset values8392encoded in the instruction are ignored. 
*/8393t = &grad_offs[n_grad_offs++];8394memset(t, 0, sizeof(struct r600_bytecode_tex));8395t->op = FETCH_OP_SET_TEXTURE_OFFSETS;8396t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);8397t->sampler_index_mode = sampler_index_mode;8398t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;8399t->resource_index_mode = sampler_index_mode;84008401t->src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;8402t->src_sel_x = inst->TexOffsets[0].SwizzleX;8403t->src_sel_y = inst->TexOffsets[0].SwizzleY;8404if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||8405inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)8406/* make sure array index selector is 0, this is just a safety8407* precausion because TGSI seems to emit something strange here */8408t->src_sel_z = 4;8409else8410t->src_sel_z = inst->TexOffsets[0].SwizzleZ;84118412t->src_sel_w = 4;84138414t->dst_sel_x = 7;8415t->dst_sel_y = 7;8416t->dst_sel_z = 7;8417t->dst_sel_w = 7;8418}84198420if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||8421inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||8422inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||8423inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||8424inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||8425inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||8426inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {8427switch (opcode) {8428case FETCH_OP_SAMPLE:8429opcode = FETCH_OP_SAMPLE_C;8430break;8431case FETCH_OP_SAMPLE_L:8432opcode = FETCH_OP_SAMPLE_C_L;8433break;8434case FETCH_OP_SAMPLE_LB:8435opcode = FETCH_OP_SAMPLE_C_LB;8436break;8437case FETCH_OP_SAMPLE_G:8438opcode = FETCH_OP_SAMPLE_C_G;8439break;8440/* Texture gather variants */8441case FETCH_OP_GATHER4:8442opcode = FETCH_OP_GATHER4_C;8443break;8444case FETCH_OP_GATHER4_O:8445opcode = FETCH_OP_GATHER4_C_O;8446break;8447}8448}84498450memset(&tex, 0, sizeof(struct r600_bytecode_tex));8451tex.op = opcode;84528453tex.sampler_id = tgsi_tex_get_src_gpr(ctx, 
sampler_src_reg);8454tex.sampler_index_mode = sampler_index_mode;8455tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;8456tex.resource_index_mode = sampler_index_mode;8457tex.src_gpr = src_gpr;8458tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;84598460if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||8461inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {8462tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */8463}84648465if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {8466int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];8467tex.inst_mod = texture_component_select;84688469if (ctx->bc->chip_class == CAYMAN) {8470tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;8471tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;8472tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;8473tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;8474} else {8475/* GATHER4 result order is different from TGSI TG4 */8476tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7;8477tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7;8478tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7;8479tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;8480}8481}8482else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {8483tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;8484tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;8485tex.dst_sel_z = 7;8486tex.dst_sel_w = 7;8487}8488else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {8489tex.dst_sel_x = 3;8490tex.dst_sel_y = 7;8491tex.dst_sel_z = 7;8492tex.dst_sel_w = 7;8493}8494else {8495tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;8496tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;8497tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 
2 : 7;8498tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;8499}850085018502if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {8503tex.src_sel_x = 4;8504tex.src_sel_y = 4;8505tex.src_sel_z = 4;8506tex.src_sel_w = 4;8507} else if (src_loaded) {8508tex.src_sel_x = 0;8509tex.src_sel_y = 1;8510tex.src_sel_z = 2;8511tex.src_sel_w = 3;8512} else {8513tex.src_sel_x = ctx->src[0].swizzle[0];8514tex.src_sel_y = ctx->src[0].swizzle[1];8515tex.src_sel_z = ctx->src[0].swizzle[2];8516tex.src_sel_w = ctx->src[0].swizzle[3];8517tex.src_rel = ctx->src[0].rel;8518}85198520if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||8521inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||8522inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||8523inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {8524tex.src_sel_x = 1;8525tex.src_sel_y = 0;8526tex.src_sel_z = 3;8527tex.src_sel_w = 2; /* route Z compare or Lod value into W */8528}85298530if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&8531inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {8532tex.coord_type_x = 1;8533tex.coord_type_y = 1;8534}8535tex.coord_type_z = 1;8536tex.coord_type_w = 1;85378538tex.offset_x = offset_x;8539tex.offset_y = offset_y;8540if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&8541(inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||8542inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {8543tex.offset_z = 0;8544}8545else {8546tex.offset_z = offset_z;8547}85488549/* Put the depth for comparison in W.8550* TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.8551* Some instructions expect the depth in Z. 
*/8552if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||8553inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||8554inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||8555inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&8556opcode != FETCH_OP_SAMPLE_C_L &&8557opcode != FETCH_OP_SAMPLE_C_LB) {8558tex.src_sel_w = tex.src_sel_z;8559}85608561if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||8562inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {8563if (opcode == FETCH_OP_SAMPLE_C_L ||8564opcode == FETCH_OP_SAMPLE_C_LB) {8565/* the array index is read from Y */8566tex.coord_type_y = 0;8567array_index_offset_channel = tex.src_sel_y;8568} else {8569/* the array index is read from Z */8570tex.coord_type_z = 0;8571tex.src_sel_z = tex.src_sel_y;8572array_index_offset_channel = tex.src_sel_z;8573}8574} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||8575inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {8576tex.coord_type_z = 0;8577array_index_offset_channel = tex.src_sel_z;8578} else if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||8579inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&8580(ctx->bc->chip_class >= EVERGREEN))8581/* the array index is read from Z, coordinate will be corrected elsewhere */8582tex.coord_type_z = 0;85838584/* We have array access to 1D or 2D ARRAY, the coordinates are not int ->8585* evaluate the array index */8586if (array_index_offset_channel >= 0 &&8587opcode != FETCH_OP_LD &&8588opcode != FETCH_OP_GET_TEXTURE_RESINFO) {8589memset(&alu, 0, sizeof(struct r600_bytecode_alu));8590alu.src[0].sel = tex.src_gpr;8591alu.src[0].chan = array_index_offset_channel;8592alu.src[0].rel = tex.src_rel;8593alu.op = ALU_OP1_RNDNE;8594alu.dst.sel = tex.src_gpr;8595alu.dst.chan = array_index_offset_channel;8596alu.dst.rel = tex.src_rel;8597alu.dst.write = 1;8598alu.last = 1;8599r = r600_bytecode_add_alu(ctx->bc, &alu);8600if (r)8601return r;8602}86038604/* mask unused source components */8605if (opcode == FETCH_OP_SAMPLE 
|| opcode == FETCH_OP_GATHER4) {8606switch (inst->Texture.Texture) {8607case TGSI_TEXTURE_2D:8608case TGSI_TEXTURE_RECT:8609tex.src_sel_z = 7;8610tex.src_sel_w = 7;8611break;8612case TGSI_TEXTURE_1D_ARRAY:8613tex.src_sel_y = 7;8614tex.src_sel_w = 7;8615break;8616case TGSI_TEXTURE_1D:8617tex.src_sel_y = 7;8618tex.src_sel_z = 7;8619tex.src_sel_w = 7;8620break;8621}8622}86238624/* Emit set gradient and offset instructions. */8625for (i = 0; i < n_grad_offs; ++i) {8626r = r600_bytecode_add_tex(ctx->bc, &grad_offs[i]);8627if (r)8628return r;8629}86308631r = r600_bytecode_add_tex(ctx->bc, &tex);8632if (r)8633return r;86348635/* add shadow ambient support - gallium doesn't do it yet */8636return 0;8637}86388639static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,8640struct tgsi_full_src_register *src)8641{8642unsigned i;86438644if (src->Register.Indirect) {8645for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {8646if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)8647return ctx->shader->atomics[i].hw_idx;8648}8649} else {8650uint32_t index = src->Register.Index;8651for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {8652if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)8653continue;8654if (index > ctx->shader->atomics[i].end)8655continue;8656if (index < ctx->shader->atomics[i].start)8657continue;8658uint32_t offset = (index - ctx->shader->atomics[i].start);8659return ctx->shader->atomics[i].hw_idx + offset;8660}8661}8662assert(0);8663return -1;8664}86658666static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,8667int *uav_id_p, int *uav_index_mode_p)8668{8669struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;8670int uav_id, uav_index_mode = 0;8671int r;8672bool is_cm = (ctx->bc->chip_class == CAYMAN);86738674uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);86758676if (inst->Src[0].Register.Indirect) {8677if (is_cm) {8678struct r600_bytecode_alu alu;8679memset(&alu, 0, sizeof(struct 
r600_bytecode_alu));8680alu.op = ALU_OP2_LSHL_INT;8681alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);8682alu.src[0].chan = 0;8683alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;8684alu.src[1].value = 2;8685alu.dst.sel = ctx->temp_reg;8686alu.dst.chan = 0;8687alu.dst.write = 1;8688alu.last = 1;8689r = r600_bytecode_add_alu(ctx->bc, &alu);8690if (r)8691return r;86928693r = single_alu_op2(ctx, ALU_OP2_ADD_INT,8694ctx->temp_reg, 0,8695ctx->temp_reg, 0,8696V_SQ_ALU_SRC_LITERAL, uav_id * 4);8697if (r)8698return r;8699} else8700uav_index_mode = 2;8701} else if (is_cm) {8702r = single_alu_op2(ctx, ALU_OP1_MOV,8703ctx->temp_reg, 0,8704V_SQ_ALU_SRC_LITERAL, uav_id * 4,87050, 0);8706if (r)8707return r;8708}8709*uav_id_p = uav_id;8710*uav_index_mode_p = uav_index_mode;8711return 0;8712}87138714static int tgsi_load_gds(struct r600_shader_ctx *ctx)8715{8716struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;8717int r;8718struct r600_bytecode_gds gds;8719int uav_id = 0;8720int uav_index_mode = 0;8721bool is_cm = (ctx->bc->chip_class == CAYMAN);87228723r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);8724if (r)8725return r;87268727memset(&gds, 0, sizeof(struct r600_bytecode_gds));8728gds.op = FETCH_OP_GDS_READ_RET;8729gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;8730gds.uav_id = is_cm ? 0 : uav_id;8731gds.uav_index_mode = is_cm ? 0 : uav_index_mode;8732gds.src_gpr = ctx->temp_reg;8733gds.src_sel_x = (is_cm) ? 
0 : 4;8734gds.src_sel_y = 4;8735gds.src_sel_z = 4;8736gds.dst_sel_x = 0;8737gds.dst_sel_y = 7;8738gds.dst_sel_z = 7;8739gds.dst_sel_w = 7;8740gds.src_gpr2 = 0;8741gds.alloc_consume = !is_cm;8742r = r600_bytecode_add_gds(ctx->bc, &gds);8743if (r)8744return r;87458746ctx->bc->cf_last->vpm = 1;8747return 0;8748}87498750/* this fixes up 1D arrays properly */8751static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)8752{8753struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;8754int r, i;8755struct r600_bytecode_alu alu;8756int temp_reg = r600_get_temp(ctx);87578758for (i = 0; i < 4; i++) {8759bool def_val = true, write_zero = false;8760memset(&alu, 0, sizeof(struct r600_bytecode_alu));8761alu.op = ALU_OP1_MOV;8762alu.dst.sel = temp_reg;8763alu.dst.chan = i;87648765switch (inst->Memory.Texture) {8766case TGSI_TEXTURE_BUFFER:8767case TGSI_TEXTURE_1D:8768if (i == 1 || i == 2 || i == 3) {8769write_zero = true;8770}8771break;8772case TGSI_TEXTURE_1D_ARRAY:8773if (i == 1 || i == 3)8774write_zero = true;8775else if (i == 2) {8776r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);8777def_val = false;8778}8779break;8780case TGSI_TEXTURE_2D:8781if (i == 2 || i == 3)8782write_zero = true;8783break;8784default:8785if (i == 3)8786write_zero = true;8787break;8788}87898790if (write_zero) {8791alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;8792alu.src[0].value = 0;8793} else if (def_val) {8794r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);8795}87968797if (i == 3)8798alu.last = 1;8799alu.dst.write = 1;8800r = r600_bytecode_add_alu(ctx->bc, &alu);8801if (r)8802return r;8803}8804*idx_gpr = temp_reg;8805return 0;8806}88078808static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,8809int temp_reg)8810{8811struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;8812int r;8813if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {8814int value = (ctx->literals[4 * 
inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);8815r = single_alu_op2(ctx, ALU_OP1_MOV,8816temp_reg, 0,8817V_SQ_ALU_SRC_LITERAL, value >> 2,88180, 0);8819if (r)8820return r;8821} else {8822struct r600_bytecode_alu alu;8823memset(&alu, 0, sizeof(struct r600_bytecode_alu));8824alu.op = ALU_OP2_LSHR_INT;8825r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);8826alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;8827alu.src[1].value = 2;8828alu.dst.sel = temp_reg;8829alu.dst.write = 1;8830alu.last = 1;8831r = r600_bytecode_add_alu(ctx->bc, &alu);8832if (r)8833return r;8834}8835return 0;8836}88378838static int tgsi_load_buffer(struct r600_shader_ctx *ctx)8839{8840struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;8841/* have to work out the offset into the RAT immediate return buffer */8842struct r600_bytecode_vtx vtx;8843struct r600_bytecode_cf *cf;8844int r;8845int temp_reg = r600_get_temp(ctx);8846unsigned rat_index_mode;8847unsigned base;88488849rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE8850base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];88518852r = load_buffer_coord(ctx, 1, temp_reg);8853if (r)8854return r;8855ctx->bc->cf_last->barrier = 1;8856memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));8857vtx.op = FETCH_OP_VFETCH;8858vtx.buffer_id = inst->Src[0].Register.Index + base;8859vtx.buffer_index_mode = rat_index_mode;8860vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;8861vtx.src_gpr = temp_reg;8862vtx.src_sel_x = 0;8863vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;8864vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */8865vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */8866vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */8867vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
3 : 7; /* SEL_W */8868vtx.num_format_all = 1;8869vtx.format_comp_all = 1;8870vtx.srf_mode_all = 0;88718872if (inst->Dst[0].Register.WriteMask & 8) {8873vtx.data_format = FMT_32_32_32_32;8874vtx.use_const_fields = 0;8875} else if (inst->Dst[0].Register.WriteMask & 4) {8876vtx.data_format = FMT_32_32_32;8877vtx.use_const_fields = 0;8878} else if (inst->Dst[0].Register.WriteMask & 2) {8879vtx.data_format = FMT_32_32;8880vtx.use_const_fields = 0;8881} else {8882vtx.data_format = FMT_32;8883vtx.use_const_fields = 0;8884}88858886r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);8887if (r)8888return r;8889cf = ctx->bc->cf_last;8890cf->barrier = 1;8891return 0;8892}88938894static int tgsi_load_rat(struct r600_shader_ctx *ctx)8895{8896struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;8897/* have to work out the offset into the RAT immediate return buffer */8898struct r600_bytecode_vtx vtx;8899struct r600_bytecode_cf *cf;8900int r;8901int idx_gpr;8902unsigned format, num_format, format_comp, endian;8903const struct util_format_description *desc;8904unsigned rat_index_mode;8905unsigned immed_base;89068907rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE89088909immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;8910r = load_index_src(ctx, 1, &idx_gpr);8911if (r)8912return r;89138914if (rat_index_mode)8915egcm_load_index_reg(ctx->bc, 1, false);89168917r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);8918cf = ctx->bc->cf_last;89198920cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;8921cf->rat.inst = V_RAT_INST_NOP_RTN;8922cf->rat.index_mode = rat_index_mode;8923cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;8924cf->output.gpr = ctx->thread_id_gpr;8925cf->output.index_gpr = idx_gpr;8926cf->output.comp_mask = 0xf;8927cf->output.burst_count = 1;8928cf->vpm = 1;8929cf->barrier = 1;8930cf->mark = 1;8931cf->output.elem_size = 0;89328933r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);8934cf = ctx->bc->cf_last;8935cf->barrier = 1;89368937desc = util_format_description(inst->Memory.Format);8938r600_vertex_data_type(inst->Memory.Format,8939&format, &num_format, &format_comp, &endian);8940memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));8941vtx.op = FETCH_OP_VFETCH;8942vtx.buffer_id = immed_base + inst->Src[0].Register.Index;8943vtx.buffer_index_mode = rat_index_mode;8944vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;8945vtx.src_gpr = ctx->thread_id_gpr;8946vtx.src_sel_x = 1;8947vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;8948vtx.dst_sel_x = desc->swizzle[0];8949vtx.dst_sel_y = desc->swizzle[1];8950vtx.dst_sel_z = desc->swizzle[2];8951vtx.dst_sel_w = desc->swizzle[3];8952vtx.srf_mode_all = 1;8953vtx.data_format = format;8954vtx.num_format_all = num_format;8955vtx.format_comp_all = format_comp;8956vtx.endian = endian;8957vtx.offset = 0;8958vtx.mega_fetch_count = 3;8959r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);8960if (r)8961return r;8962cf = ctx->bc->cf_last;8963cf->barrier = 1;8964return 0;8965}89668967static int tgsi_load_lds(struct r600_shader_ctx *ctx)8968{8969struct tgsi_full_instruction *inst = 
&ctx->parse.FullToken.FullInstruction;8970struct r600_bytecode_alu alu;8971int r;8972int temp_reg = r600_get_temp(ctx);89738974memset(&alu, 0, sizeof(struct r600_bytecode_alu));8975alu.op = ALU_OP1_MOV;8976r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);8977alu.dst.sel = temp_reg;8978alu.dst.write = 1;8979alu.last = 1;8980r = r600_bytecode_add_alu(ctx->bc, &alu);8981if (r)8982return r;89838984r = do_lds_fetch_values(ctx, temp_reg,8985ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);8986if (r)8987return r;8988return 0;8989}89908991static int tgsi_load(struct r600_shader_ctx *ctx)8992{8993struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;8994if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)8995return tgsi_load_rat(ctx);8996if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)8997return tgsi_load_gds(ctx);8998if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)8999return tgsi_load_buffer(ctx);9000if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)9001return tgsi_load_lds(ctx);9002return 0;9003}90049005static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)9006{9007struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9008struct r600_bytecode_cf *cf;9009int r, i;9010unsigned rat_index_mode;9011int lasti;9012int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);90139014r = load_buffer_coord(ctx, 0, treg2);9015if (r)9016return r;90179018rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE9019if (rat_index_mode)9020egcm_load_index_reg(ctx->bc, 1, false);90219022for (i = 0; i <= 3; i++) {9023struct r600_bytecode_alu alu;9024memset(&alu, 0, sizeof(struct r600_bytecode_alu));9025alu.op = ALU_OP1_MOV;9026alu.dst.sel = temp_reg;9027alu.dst.chan = i;9028alu.src[0].sel = V_SQ_ALU_SRC_0;9029alu.last = (i == 3);9030alu.dst.write = 1;9031r = r600_bytecode_add_alu(ctx->bc, &alu);9032if (r)9033return r;9034}90359036lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);9037for (i = 0; i <= lasti; i++) {9038struct r600_bytecode_alu alu;9039if (!((1 << i) & inst->Dst[0].Register.WriteMask))9040continue;90419042r = single_alu_op2(ctx, ALU_OP2_ADD_INT,9043temp_reg, 0,9044treg2, 0,9045V_SQ_ALU_SRC_LITERAL, i);9046if (r)9047return r;90489049memset(&alu, 0, sizeof(struct r600_bytecode_alu));9050alu.op = ALU_OP1_MOV;9051alu.dst.sel = ctx->temp_reg;9052alu.dst.chan = 0;90539054r600_bytecode_src(&alu.src[0], &ctx->src[1], i);9055alu.last = 1;9056alu.dst.write = 1;9057r = r600_bytecode_add_alu(ctx->bc, &alu);9058if (r)9059return r;90609061r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);9062cf = ctx->bc->cf_last;90639064cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];9065cf->rat.inst = V_RAT_INST_STORE_TYPED;9066cf->rat.index_mode = rat_index_mode;9067cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;9068cf->output.gpr = ctx->temp_reg;9069cf->output.index_gpr = temp_reg;9070cf->output.comp_mask = 1;9071cf->output.burst_count = 1;9072cf->vpm = 1;9073cf->barrier = 1;9074cf->output.elem_size = 0;9075}9076return 0;9077}90789079static int tgsi_store_rat(struct r600_shader_ctx *ctx)9080{9081struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9082struct r600_bytecode_cf *cf;9083bool src_requires_loading = false;9084int val_gpr, idx_gpr;9085int r, i;9086unsigned rat_index_mode;90879088rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE90899090r = load_index_src(ctx, 0, &idx_gpr);9091if (r)9092return r;90939094if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)9095src_requires_loading = true;90969097if (src_requires_loading) {9098struct r600_bytecode_alu alu;9099for (i = 0; i < 4; i++) {9100memset(&alu, 0, sizeof(struct r600_bytecode_alu));9101alu.op = ALU_OP1_MOV;9102alu.dst.sel = ctx->temp_reg;9103alu.dst.chan = i;91049105r600_bytecode_src(&alu.src[0], &ctx->src[1], i);9106if (i == 3)9107alu.last = 1;9108alu.dst.write = 1;9109r = r600_bytecode_add_alu(ctx->bc, &alu);9110if (r)9111return r;9112}9113val_gpr = ctx->temp_reg;9114} else9115val_gpr = tgsi_tex_get_src_gpr(ctx, 1);9116if (rat_index_mode)9117egcm_load_index_reg(ctx->bc, 1, false);91189119r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);9120cf = ctx->bc->cf_last;91219122cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;9123cf->rat.inst = V_RAT_INST_STORE_TYPED;9124cf->rat.index_mode = rat_index_mode;9125cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;9126cf->output.gpr = val_gpr;9127cf->output.index_gpr = idx_gpr;9128cf->output.comp_mask = 0xf;9129cf->output.burst_count = 1;9130cf->vpm = 1;9131cf->barrier = 1;9132cf->output.elem_size = 0;9133return 0;9134}91359136static int tgsi_store_lds(struct r600_shader_ctx *ctx)9137{9138struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9139struct r600_bytecode_alu alu;9140int r, i, lasti;9141int write_mask = inst->Dst[0].Register.WriteMask;9142int temp_reg = r600_get_temp(ctx);91439144/* LDS write */9145memset(&alu, 0, sizeof(struct r600_bytecode_alu));9146alu.op = ALU_OP1_MOV;9147r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);9148alu.dst.sel = temp_reg;9149alu.dst.write = 1;9150alu.last = 1;9151r = r600_bytecode_add_alu(ctx->bc, &alu);9152if (r)9153return r;91549155lasti = tgsi_last_instruction(write_mask);9156for (i = 1; i <= lasti; i++) {9157if (!(write_mask & (1 << i)))9158continue;9159r = 
single_alu_op2(ctx, ALU_OP2_ADD_INT,9160temp_reg, i,9161temp_reg, 0,9162V_SQ_ALU_SRC_LITERAL, 4 * i);9163if (r)9164return r;9165}9166for (i = 0; i <= lasti; i++) {9167if (!(write_mask & (1 << i)))9168continue;91699170if ((i == 0 && ((write_mask & 3) == 3)) ||9171(i == 2 && ((write_mask & 0xc) == 0xc))) {9172memset(&alu, 0, sizeof(struct r600_bytecode_alu));9173alu.op = LDS_OP3_LDS_WRITE_REL;91749175alu.src[0].sel = temp_reg;9176alu.src[0].chan = i;9177r600_bytecode_src(&alu.src[1], &ctx->src[1], i);9178r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);9179alu.last = 1;9180alu.is_lds_idx_op = true;9181alu.lds_idx = 1;9182r = r600_bytecode_add_alu(ctx->bc, &alu);9183if (r)9184return r;9185i += 1;9186continue;9187}9188memset(&alu, 0, sizeof(struct r600_bytecode_alu));9189alu.op = LDS_OP2_LDS_WRITE;91909191alu.src[0].sel = temp_reg;9192alu.src[0].chan = i;9193r600_bytecode_src(&alu.src[1], &ctx->src[1], i);91949195alu.last = 1;9196alu.is_lds_idx_op = true;91979198r = r600_bytecode_add_alu(ctx->bc, &alu);9199if (r)9200return r;9201}9202return 0;9203}92049205static int tgsi_store(struct r600_shader_ctx *ctx)9206{9207struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9208if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)9209return tgsi_store_buffer_rat(ctx);9210else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)9211return tgsi_store_lds(ctx);9212else9213return tgsi_store_rat(ctx);9214}92159216static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)9217{9218struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9219/* have to work out the offset into the RAT immediate return buffer */9220struct r600_bytecode_alu alu;9221struct r600_bytecode_vtx vtx;9222struct r600_bytecode_cf *cf;9223int r;9224int idx_gpr;9225unsigned format, num_format, format_comp, endian;9226const struct util_format_description *desc;9227unsigned rat_index_mode;9228unsigned immed_base;9229unsigned rat_base;92309231immed_base = 
R600_IMAGE_IMMED_RESOURCE_OFFSET;9232rat_base = ctx->shader->rat_base;92339234if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {9235immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];9236rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];92379238r = load_buffer_coord(ctx, 1, ctx->temp_reg);9239if (r)9240return r;9241idx_gpr = ctx->temp_reg;9242} else {9243r = load_index_src(ctx, 1, &idx_gpr);9244if (r)9245return r;9246}92479248rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE92499250if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {9251memset(&alu, 0, sizeof(struct r600_bytecode_alu));9252alu.op = ALU_OP1_MOV;9253alu.dst.sel = ctx->thread_id_gpr;9254alu.dst.chan = 0;9255alu.dst.write = 1;9256r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);9257alu.last = 1;9258r = r600_bytecode_add_alu(ctx->bc, &alu);9259if (r)9260return r;92619262memset(&alu, 0, sizeof(struct r600_bytecode_alu));9263alu.op = ALU_OP1_MOV;9264alu.dst.sel = ctx->thread_id_gpr;9265if (ctx->bc->chip_class == CAYMAN)9266alu.dst.chan = 2;9267else9268alu.dst.chan = 3;9269alu.dst.write = 1;9270r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);9271alu.last = 1;9272r = r600_bytecode_add_alu(ctx->bc, &alu);9273if (r)9274return r;9275} else {9276memset(&alu, 0, sizeof(struct r600_bytecode_alu));9277alu.op = ALU_OP1_MOV;9278alu.dst.sel = ctx->thread_id_gpr;9279alu.dst.chan = 0;9280alu.dst.write = 1;9281r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);9282alu.last = 1;9283r = r600_bytecode_add_alu(ctx->bc, &alu);9284if (r)9285return r;9286}92879288if (rat_index_mode)9289egcm_load_index_reg(ctx->bc, 1, false);9290r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);9291cf = ctx->bc->cf_last;92929293cf->rat.id = rat_base + inst->Src[0].Register.Index;9294cf->rat.inst = ctx->inst_info->op;9295cf->rat.index_mode = rat_index_mode;9296cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;9297cf->output.gpr = ctx->thread_id_gpr;9298cf->output.index_gpr = 
idx_gpr;9299cf->output.comp_mask = 0xf;9300cf->output.burst_count = 1;9301cf->vpm = 1;9302cf->barrier = 1;9303cf->mark = 1;9304cf->output.elem_size = 0;9305r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);9306cf = ctx->bc->cf_last;9307cf->barrier = 1;9308cf->cf_addr = 1;93099310memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));9311if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {9312desc = util_format_description(inst->Memory.Format);9313r600_vertex_data_type(inst->Memory.Format,9314&format, &num_format, &format_comp, &endian);9315vtx.dst_sel_x = desc->swizzle[0];9316} else {9317format = FMT_32;9318num_format = 1;9319format_comp = 0;9320endian = 0;9321vtx.dst_sel_x = 0;9322}9323vtx.op = FETCH_OP_VFETCH;9324vtx.buffer_id = immed_base + inst->Src[0].Register.Index;9325vtx.buffer_index_mode = rat_index_mode;9326vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;9327vtx.src_gpr = ctx->thread_id_gpr;9328vtx.src_sel_x = 1;9329vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;9330vtx.dst_sel_y = 7;9331vtx.dst_sel_z = 7;9332vtx.dst_sel_w = 7;9333vtx.use_const_fields = 0;9334vtx.srf_mode_all = 1;9335vtx.data_format = format;9336vtx.num_format_all = num_format;9337vtx.format_comp_all = format_comp;9338vtx.endian = endian;9339vtx.offset = 0;9340vtx.mega_fetch_count = 0xf;9341r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);9342if (r)9343return r;9344cf = ctx->bc->cf_last;9345cf->vpm = 1;9346cf->barrier = 1;9347return 0;9348}93499350static int get_gds_op(int opcode)9351{9352switch (opcode) {9353case TGSI_OPCODE_ATOMUADD:9354return FETCH_OP_GDS_ADD_RET;9355case TGSI_OPCODE_ATOMAND:9356return FETCH_OP_GDS_AND_RET;9357case TGSI_OPCODE_ATOMOR:9358return FETCH_OP_GDS_OR_RET;9359case TGSI_OPCODE_ATOMXOR:9360return FETCH_OP_GDS_XOR_RET;9361case TGSI_OPCODE_ATOMUMIN:9362return FETCH_OP_GDS_MIN_UINT_RET;9363case TGSI_OPCODE_ATOMUMAX:9364return FETCH_OP_GDS_MAX_UINT_RET;9365case TGSI_OPCODE_ATOMXCHG:9366return FETCH_OP_GDS_XCHG_RET;9367case 
TGSI_OPCODE_ATOMCAS:9368return FETCH_OP_GDS_CMP_XCHG_RET;9369default:9370return -1;9371}9372}93739374static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)9375{9376struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9377struct r600_bytecode_gds gds;9378struct r600_bytecode_alu alu;9379int gds_op = get_gds_op(inst->Instruction.Opcode);9380int r;9381int uav_id = 0;9382int uav_index_mode = 0;9383bool is_cm = (ctx->bc->chip_class == CAYMAN);93849385if (gds_op == -1) {9386fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);9387return -1;9388}93899390r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);9391if (r)9392return r;93939394if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) {9395if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) {9396int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]);9397memset(&alu, 0, sizeof(struct r600_bytecode_alu));9398alu.op = ALU_OP1_MOV;9399alu.dst.sel = ctx->temp_reg;9400alu.dst.chan = is_cm ? 2 : 1;9401alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;9402alu.src[0].value = value;9403alu.last = 1;9404alu.dst.write = 1;9405r = r600_bytecode_add_alu(ctx->bc, &alu);9406if (r)9407return r;9408} else {9409memset(&alu, 0, sizeof(struct r600_bytecode_alu));9410alu.op = ALU_OP1_MOV;9411alu.dst.sel = ctx->temp_reg;9412alu.dst.chan = is_cm ? 2 : 1;9413r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);9414alu.last = 1;9415alu.dst.write = 1;9416r = r600_bytecode_add_alu(ctx->bc, &alu);9417if (r)9418return r;9419}9420}9421if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {9422int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);9423int abs_value = abs(value);9424if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)9425gds_op = FETCH_OP_GDS_SUB_RET;9426memset(&alu, 0, sizeof(struct r600_bytecode_alu));9427alu.op = ALU_OP1_MOV;9428alu.dst.sel = ctx->temp_reg;9429alu.dst.chan = is_cm ? 
1 : 0;9430alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;9431alu.src[0].value = abs_value;9432alu.last = 1;9433alu.dst.write = 1;9434r = r600_bytecode_add_alu(ctx->bc, &alu);9435if (r)9436return r;9437} else {9438memset(&alu, 0, sizeof(struct r600_bytecode_alu));9439alu.op = ALU_OP1_MOV;9440alu.dst.sel = ctx->temp_reg;9441alu.dst.chan = is_cm ? 1 : 0;9442r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);9443alu.last = 1;9444alu.dst.write = 1;9445r = r600_bytecode_add_alu(ctx->bc, &alu);9446if (r)9447return r;9448}944994509451memset(&gds, 0, sizeof(struct r600_bytecode_gds));9452gds.op = gds_op;9453gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;9454gds.uav_id = is_cm ? 0 : uav_id;9455gds.uav_index_mode = is_cm ? 0 : uav_index_mode;9456gds.src_gpr = ctx->temp_reg;9457gds.src_gpr2 = 0;9458gds.src_sel_x = is_cm ? 0 : 4;9459gds.src_sel_y = is_cm ? 1 : 0;9460if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET)9461gds.src_sel_z = is_cm ? 2 : 1;9462else9463gds.src_sel_z = 7;9464gds.dst_sel_x = 0;9465gds.dst_sel_y = 7;9466gds.dst_sel_z = 7;9467gds.dst_sel_w = 7;9468gds.alloc_consume = !is_cm;94699470r = r600_bytecode_add_gds(ctx->bc, &gds);9471if (r)9472return r;9473ctx->bc->cf_last->vpm = 1;9474return 0;9475}94769477static int get_lds_op(int opcode)9478{9479switch (opcode) {9480case TGSI_OPCODE_ATOMUADD:9481return LDS_OP2_LDS_ADD_RET;9482case TGSI_OPCODE_ATOMAND:9483return LDS_OP2_LDS_AND_RET;9484case TGSI_OPCODE_ATOMOR:9485return LDS_OP2_LDS_OR_RET;9486case TGSI_OPCODE_ATOMXOR:9487return LDS_OP2_LDS_XOR_RET;9488case TGSI_OPCODE_ATOMUMIN:9489return LDS_OP2_LDS_MIN_UINT_RET;9490case TGSI_OPCODE_ATOMUMAX:9491return LDS_OP2_LDS_MAX_UINT_RET;9492case TGSI_OPCODE_ATOMIMIN:9493return LDS_OP2_LDS_MIN_INT_RET;9494case TGSI_OPCODE_ATOMIMAX:9495return LDS_OP2_LDS_MAX_INT_RET;9496case TGSI_OPCODE_ATOMXCHG:9497return LDS_OP2_LDS_XCHG_RET;9498case TGSI_OPCODE_ATOMCAS:9499return LDS_OP3_LDS_CMP_XCHG_RET;9500default:9501return -1;9502}9503}95049505static int 
tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)9506{9507struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9508int lds_op = get_lds_op(inst->Instruction.Opcode);9509int r;95109511struct r600_bytecode_alu alu;9512memset(&alu, 0, sizeof(struct r600_bytecode_alu));9513alu.op = lds_op;9514alu.is_lds_idx_op = true;9515alu.last = 1;9516r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);9517r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);9518if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)9519r600_bytecode_src(&alu.src[2], &ctx->src[3], 0);9520else9521alu.src[2].sel = V_SQ_ALU_SRC_0;9522r = r600_bytecode_add_alu(ctx->bc, &alu);9523if (r)9524return r;95259526/* then read from LDS_OQ_A_POP */9527memset(&alu, 0, sizeof(alu));95289529alu.op = ALU_OP1_MOV;9530alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;9531alu.src[0].chan = 0;9532tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);9533alu.dst.write = 1;9534alu.last = 1;9535r = r600_bytecode_add_alu(ctx->bc, &alu);9536if (r)9537return r;95389539return 0;9540}95419542static int tgsi_atomic_op(struct r600_shader_ctx *ctx)9543{9544struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9545if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)9546return tgsi_atomic_op_rat(ctx);9547if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)9548return tgsi_atomic_op_gds(ctx);9549if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)9550return tgsi_atomic_op_rat(ctx);9551if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)9552return tgsi_atomic_op_lds(ctx);9553return 0;9554}95559556static int tgsi_resq(struct r600_shader_ctx *ctx)9557{9558struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9559unsigned sampler_index_mode;9560struct r600_bytecode_tex tex;9561int r;9562boolean has_txq_cube_array_z = false;95639564if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||9565(inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {9566if (ctx->bc->chip_class < 
EVERGREEN)9567ctx->shader->uses_tex_buffers = true;9568unsigned eg_buffer_base = 0;9569eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET;9570if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)9571eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE];9572return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base);9573}95749575if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&9576inst->Dst[0].Register.WriteMask & 4) {9577ctx->shader->has_txq_cube_array_z_comp = true;9578has_txq_cube_array_z = true;9579}95809581sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE9582if (sampler_index_mode)9583egcm_load_index_reg(ctx->bc, 1, false);958495859586/* does this shader want a num layers from TXQ for a cube array? */9587if (has_txq_cube_array_z) {9588int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;9589struct r600_bytecode_alu alu;95909591memset(&alu, 0, sizeof(struct r600_bytecode_alu));9592alu.op = ALU_OP1_MOV;95939594alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;9595/* with eg each dword is either number of cubes */9596alu.src[0].sel += id / 4;9597alu.src[0].chan = id % 4;9598alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;9599tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);9600alu.last = 1;9601r = r600_bytecode_add_alu(ctx->bc, &alu);9602if (r)9603return r;9604/* disable writemask from texture instruction */9605inst->Dst[0].Register.WriteMask &= ~4;9606}9607memset(&tex, 0, sizeof(struct r600_bytecode_tex));9608tex.op = ctx->inst_info->op;9609tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;9610tex.sampler_index_mode = sampler_index_mode;9611tex.resource_id = tex.sampler_id;9612tex.resource_index_mode = sampler_index_mode;9613tex.src_sel_x = 4;9614tex.src_sel_y = 4;9615tex.src_sel_z = 4;9616tex.src_sel_w = 4;9617tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;9618tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 
1 : 7;9619tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;9620tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;9621tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;9622r = r600_bytecode_add_tex(ctx->bc, &tex);9623if (r)9624return r;96259626return 0;9627}96289629static int tgsi_lrp(struct r600_shader_ctx *ctx)9630{9631struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9632struct r600_bytecode_alu alu;9633unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);9634struct r600_bytecode_alu_src srcs[2][4];9635unsigned i;9636int r;96379638/* optimize if it's just an equal balance */9639if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {9640for (i = 0; i < lasti + 1; i++) {9641if (!(inst->Dst[0].Register.WriteMask & (1 << i)))9642continue;96439644memset(&alu, 0, sizeof(struct r600_bytecode_alu));9645alu.op = ALU_OP2_ADD;9646r600_bytecode_src(&alu.src[0], &ctx->src[1], i);9647r600_bytecode_src(&alu.src[1], &ctx->src[2], i);9648alu.omod = 3;9649tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);9650alu.dst.chan = i;9651if (i == lasti) {9652alu.last = 1;9653}9654r = r600_bytecode_add_alu(ctx->bc, &alu);9655if (r)9656return r;9657}9658return 0;9659}96609661/* 1 - src0 */9662for (i = 0; i < lasti + 1; i++) {9663if (!(inst->Dst[0].Register.WriteMask & (1 << i)))9664continue;96659666memset(&alu, 0, sizeof(struct r600_bytecode_alu));9667alu.op = ALU_OP2_ADD;9668alu.src[0].sel = V_SQ_ALU_SRC_1;9669alu.src[0].chan = 0;9670r600_bytecode_src(&alu.src[1], &ctx->src[0], i);9671r600_bytecode_src_toggle_neg(&alu.src[1]);9672alu.dst.sel = ctx->temp_reg;9673alu.dst.chan = i;9674if (i == lasti) {9675alu.last = 1;9676}9677alu.dst.write = 1;9678r = r600_bytecode_add_alu(ctx->bc, &alu);9679if (r)9680return r;9681}96829683/* (1 - src0) * src2 */9684for (i = 0; i < lasti + 1; i++) {9685if (!(inst->Dst[0].Register.WriteMask & (1 << i)))9686continue;96879688memset(&alu, 0, sizeof(struct 
r600_bytecode_alu));9689alu.op = ALU_OP2_MUL;9690alu.src[0].sel = ctx->temp_reg;9691alu.src[0].chan = i;9692r600_bytecode_src(&alu.src[1], &ctx->src[2], i);9693alu.dst.sel = ctx->temp_reg;9694alu.dst.chan = i;9695if (i == lasti) {9696alu.last = 1;9697}9698alu.dst.write = 1;9699r = r600_bytecode_add_alu(ctx->bc, &alu);9700if (r)9701return r;9702}97039704/* src0 * src1 + (1 - src0) * src2 */97059706for (i = 0; i < 2; i++) {9707r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,9708srcs[i], &ctx->src[i]);9709if (r)9710return r;9711}97129713for (i = 0; i < lasti + 1; i++) {9714if (!(inst->Dst[0].Register.WriteMask & (1 << i)))9715continue;97169717memset(&alu, 0, sizeof(struct r600_bytecode_alu));9718alu.op = ALU_OP3_MULADD;9719alu.is_op3 = 1;9720alu.src[0] = srcs[0][i];9721alu.src[1] = srcs[1][i];9722alu.src[2].sel = ctx->temp_reg;9723alu.src[2].chan = i;97249725tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);9726alu.dst.chan = i;9727if (i == lasti) {9728alu.last = 1;9729}9730r = r600_bytecode_add_alu(ctx->bc, &alu);9731if (r)9732return r;9733}9734return 0;9735}97369737static int tgsi_cmp(struct r600_shader_ctx *ctx)9738{9739struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9740struct r600_bytecode_alu alu;9741int i, r, j;9742int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);9743struct r600_bytecode_alu_src srcs[3][4];97449745unsigned op;97469747if (ctx->src[0].abs && ctx->src[0].neg) {9748op = ALU_OP3_CNDE;9749ctx->src[0].abs = 0;9750ctx->src[0].neg = 0;9751} else {9752op = ALU_OP3_CNDGE;9753}97549755for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {9756r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,9757srcs[j], &ctx->src[j]);9758if (r)9759return r;9760}97619762for (i = 0; i < lasti + 1; i++) {9763if (!(inst->Dst[0].Register.WriteMask & (1 << i)))9764continue;97659766memset(&alu, 0, sizeof(struct r600_bytecode_alu));9767alu.op = op;9768alu.src[0] = srcs[0][i];9769alu.src[1] = srcs[2][i];9770alu.src[2] = 
srcs[1][i];97719772tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);9773alu.dst.chan = i;9774alu.dst.write = 1;9775alu.is_op3 = 1;9776if (i == lasti)9777alu.last = 1;9778r = r600_bytecode_add_alu(ctx->bc, &alu);9779if (r)9780return r;9781}9782return 0;9783}97849785static int tgsi_ucmp(struct r600_shader_ctx *ctx)9786{9787struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9788struct r600_bytecode_alu alu;9789int i, r;9790int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);97919792for (i = 0; i < lasti + 1; i++) {9793if (!(inst->Dst[0].Register.WriteMask & (1 << i)))9794continue;97959796memset(&alu, 0, sizeof(struct r600_bytecode_alu));9797alu.op = ALU_OP3_CNDE_INT;9798r600_bytecode_src(&alu.src[0], &ctx->src[0], i);9799r600_bytecode_src(&alu.src[1], &ctx->src[2], i);9800r600_bytecode_src(&alu.src[2], &ctx->src[1], i);9801tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);9802alu.dst.chan = i;9803alu.dst.write = 1;9804alu.is_op3 = 1;9805if (i == lasti)9806alu.last = 1;9807r = r600_bytecode_add_alu(ctx->bc, &alu);9808if (r)9809return r;9810}9811return 0;9812}98139814static int tgsi_exp(struct r600_shader_ctx *ctx)9815{9816struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9817struct r600_bytecode_alu alu;9818int r;9819unsigned i;98209821/* result.x = 2^floor(src); */9822if (inst->Dst[0].Register.WriteMask & 1) {9823memset(&alu, 0, sizeof(struct r600_bytecode_alu));98249825alu.op = ALU_OP1_FLOOR;9826r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);98279828alu.dst.sel = ctx->temp_reg;9829alu.dst.chan = 0;9830alu.dst.write = 1;9831alu.last = 1;9832r = r600_bytecode_add_alu(ctx->bc, &alu);9833if (r)9834return r;98359836if (ctx->bc->chip_class == CAYMAN) {9837for (i = 0; i < 3; i++) {9838alu.op = ALU_OP1_EXP_IEEE;9839alu.src[0].sel = ctx->temp_reg;9840alu.src[0].chan = 0;98419842alu.dst.sel = ctx->temp_reg;9843alu.dst.chan = i;9844alu.dst.write = i == 0;9845alu.last = i == 2;9846r = r600_bytecode_add_alu(ctx->bc, &alu);9847if 
(r)9848return r;9849}9850} else {9851alu.op = ALU_OP1_EXP_IEEE;9852alu.src[0].sel = ctx->temp_reg;9853alu.src[0].chan = 0;98549855alu.dst.sel = ctx->temp_reg;9856alu.dst.chan = 0;9857alu.dst.write = 1;9858alu.last = 1;9859r = r600_bytecode_add_alu(ctx->bc, &alu);9860if (r)9861return r;9862}9863}98649865/* result.y = tmp - floor(tmp); */9866if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {9867memset(&alu, 0, sizeof(struct r600_bytecode_alu));98689869alu.op = ALU_OP1_FRACT;9870r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);98719872alu.dst.sel = ctx->temp_reg;9873#if 09874r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);9875if (r)9876return r;9877#endif9878alu.dst.write = 1;9879alu.dst.chan = 1;98809881alu.last = 1;98829883r = r600_bytecode_add_alu(ctx->bc, &alu);9884if (r)9885return r;9886}98879888/* result.z = RoughApprox2ToX(tmp);*/9889if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {9890if (ctx->bc->chip_class == CAYMAN) {9891for (i = 0; i < 3; i++) {9892memset(&alu, 0, sizeof(struct r600_bytecode_alu));9893alu.op = ALU_OP1_EXP_IEEE;9894r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);98959896alu.dst.sel = ctx->temp_reg;9897alu.dst.chan = i;9898if (i == 2) {9899alu.dst.write = 1;9900alu.last = 1;9901}99029903r = r600_bytecode_add_alu(ctx->bc, &alu);9904if (r)9905return r;9906}9907} else {9908memset(&alu, 0, sizeof(struct r600_bytecode_alu));9909alu.op = ALU_OP1_EXP_IEEE;9910r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);99119912alu.dst.sel = ctx->temp_reg;9913alu.dst.write = 1;9914alu.dst.chan = 2;99159916alu.last = 1;99179918r = r600_bytecode_add_alu(ctx->bc, &alu);9919if (r)9920return r;9921}9922}99239924/* result.w = 1.0;*/9925if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {9926memset(&alu, 0, sizeof(struct r600_bytecode_alu));99279928alu.op = ALU_OP1_MOV;9929alu.src[0].sel = V_SQ_ALU_SRC_1;9930alu.src[0].chan = 0;99319932alu.dst.sel = ctx->temp_reg;9933alu.dst.chan = 3;9934alu.dst.write = 1;9935alu.last = 1;9936r = r600_bytecode_add_alu(ctx->bc, 
&alu);9937if (r)9938return r;9939}9940return tgsi_helper_copy(ctx, inst);9941}99429943static int tgsi_log(struct r600_shader_ctx *ctx)9944{9945struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;9946struct r600_bytecode_alu alu;9947int r;9948unsigned i;99499950/* result.x = floor(log2(|src|)); */9951if (inst->Dst[0].Register.WriteMask & 1) {9952if (ctx->bc->chip_class == CAYMAN) {9953for (i = 0; i < 3; i++) {9954memset(&alu, 0, sizeof(struct r600_bytecode_alu));99559956alu.op = ALU_OP1_LOG_IEEE;9957r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);9958r600_bytecode_src_set_abs(&alu.src[0]);99599960alu.dst.sel = ctx->temp_reg;9961alu.dst.chan = i;9962if (i == 0)9963alu.dst.write = 1;9964if (i == 2)9965alu.last = 1;9966r = r600_bytecode_add_alu(ctx->bc, &alu);9967if (r)9968return r;9969}99709971} else {9972memset(&alu, 0, sizeof(struct r600_bytecode_alu));99739974alu.op = ALU_OP1_LOG_IEEE;9975r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);9976r600_bytecode_src_set_abs(&alu.src[0]);99779978alu.dst.sel = ctx->temp_reg;9979alu.dst.chan = 0;9980alu.dst.write = 1;9981alu.last = 1;9982r = r600_bytecode_add_alu(ctx->bc, &alu);9983if (r)9984return r;9985}99869987alu.op = ALU_OP1_FLOOR;9988alu.src[0].sel = ctx->temp_reg;9989alu.src[0].chan = 0;99909991alu.dst.sel = ctx->temp_reg;9992alu.dst.chan = 0;9993alu.dst.write = 1;9994alu.last = 1;99959996r = r600_bytecode_add_alu(ctx->bc, &alu);9997if (r)9998return r;9999}1000010001/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */10002if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {1000310004if (ctx->bc->chip_class == CAYMAN) {10005for (i = 0; i < 3; i++) {10006memset(&alu, 0, sizeof(struct r600_bytecode_alu));1000710008alu.op = ALU_OP1_LOG_IEEE;10009r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);10010r600_bytecode_src_set_abs(&alu.src[0]);1001110012alu.dst.sel = ctx->temp_reg;10013alu.dst.chan = i;10014if (i == 1)10015alu.dst.write = 1;10016if (i == 2)10017alu.last = 1;1001810019r = 
r600_bytecode_add_alu(ctx->bc, &alu);10020if (r)10021return r;10022}10023} else {10024memset(&alu, 0, sizeof(struct r600_bytecode_alu));1002510026alu.op = ALU_OP1_LOG_IEEE;10027r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);10028r600_bytecode_src_set_abs(&alu.src[0]);1002910030alu.dst.sel = ctx->temp_reg;10031alu.dst.chan = 1;10032alu.dst.write = 1;10033alu.last = 1;1003410035r = r600_bytecode_add_alu(ctx->bc, &alu);10036if (r)10037return r;10038}1003910040memset(&alu, 0, sizeof(struct r600_bytecode_alu));1004110042alu.op = ALU_OP1_FLOOR;10043alu.src[0].sel = ctx->temp_reg;10044alu.src[0].chan = 1;1004510046alu.dst.sel = ctx->temp_reg;10047alu.dst.chan = 1;10048alu.dst.write = 1;10049alu.last = 1;1005010051r = r600_bytecode_add_alu(ctx->bc, &alu);10052if (r)10053return r;1005410055if (ctx->bc->chip_class == CAYMAN) {10056for (i = 0; i < 3; i++) {10057memset(&alu, 0, sizeof(struct r600_bytecode_alu));10058alu.op = ALU_OP1_EXP_IEEE;10059alu.src[0].sel = ctx->temp_reg;10060alu.src[0].chan = 1;1006110062alu.dst.sel = ctx->temp_reg;10063alu.dst.chan = i;10064if (i == 1)10065alu.dst.write = 1;10066if (i == 2)10067alu.last = 1;1006810069r = r600_bytecode_add_alu(ctx->bc, &alu);10070if (r)10071return r;10072}10073} else {10074memset(&alu, 0, sizeof(struct r600_bytecode_alu));10075alu.op = ALU_OP1_EXP_IEEE;10076alu.src[0].sel = ctx->temp_reg;10077alu.src[0].chan = 1;1007810079alu.dst.sel = ctx->temp_reg;10080alu.dst.chan = 1;10081alu.dst.write = 1;10082alu.last = 1;1008310084r = r600_bytecode_add_alu(ctx->bc, &alu);10085if (r)10086return r;10087}1008810089if (ctx->bc->chip_class == CAYMAN) {10090for (i = 0; i < 3; i++) {10091memset(&alu, 0, sizeof(struct r600_bytecode_alu));10092alu.op = ALU_OP1_RECIP_IEEE;10093alu.src[0].sel = ctx->temp_reg;10094alu.src[0].chan = 1;1009510096alu.dst.sel = ctx->temp_reg;10097alu.dst.chan = i;10098if (i == 1)10099alu.dst.write = 1;10100if (i == 2)10101alu.last = 1;1010210103r = r600_bytecode_add_alu(ctx->bc, &alu);10104if (r)10105return 
r;10106}10107} else {10108memset(&alu, 0, sizeof(struct r600_bytecode_alu));10109alu.op = ALU_OP1_RECIP_IEEE;10110alu.src[0].sel = ctx->temp_reg;10111alu.src[0].chan = 1;1011210113alu.dst.sel = ctx->temp_reg;10114alu.dst.chan = 1;10115alu.dst.write = 1;10116alu.last = 1;1011710118r = r600_bytecode_add_alu(ctx->bc, &alu);10119if (r)10120return r;10121}1012210123memset(&alu, 0, sizeof(struct r600_bytecode_alu));1012410125alu.op = ALU_OP2_MUL;1012610127r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);10128r600_bytecode_src_set_abs(&alu.src[0]);1012910130alu.src[1].sel = ctx->temp_reg;10131alu.src[1].chan = 1;1013210133alu.dst.sel = ctx->temp_reg;10134alu.dst.chan = 1;10135alu.dst.write = 1;10136alu.last = 1;1013710138r = r600_bytecode_add_alu(ctx->bc, &alu);10139if (r)10140return r;10141}1014210143/* result.z = log2(|src|);*/10144if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {10145if (ctx->bc->chip_class == CAYMAN) {10146for (i = 0; i < 3; i++) {10147memset(&alu, 0, sizeof(struct r600_bytecode_alu));1014810149alu.op = ALU_OP1_LOG_IEEE;10150r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);10151r600_bytecode_src_set_abs(&alu.src[0]);1015210153alu.dst.sel = ctx->temp_reg;10154if (i == 2)10155alu.dst.write = 1;10156alu.dst.chan = i;10157if (i == 2)10158alu.last = 1;1015910160r = r600_bytecode_add_alu(ctx->bc, &alu);10161if (r)10162return r;10163}10164} else {10165memset(&alu, 0, sizeof(struct r600_bytecode_alu));1016610167alu.op = ALU_OP1_LOG_IEEE;10168r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);10169r600_bytecode_src_set_abs(&alu.src[0]);1017010171alu.dst.sel = ctx->temp_reg;10172alu.dst.write = 1;10173alu.dst.chan = 2;10174alu.last = 1;1017510176r = r600_bytecode_add_alu(ctx->bc, &alu);10177if (r)10178return r;10179}10180}1018110182/* result.w = 1.0; */10183if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {10184memset(&alu, 0, sizeof(struct r600_bytecode_alu));1018510186alu.op = ALU_OP1_MOV;10187alu.src[0].sel = V_SQ_ALU_SRC_1;10188alu.src[0].chan = 
0;1018910190alu.dst.sel = ctx->temp_reg;10191alu.dst.chan = 3;10192alu.dst.write = 1;10193alu.last = 1;1019410195r = r600_bytecode_add_alu(ctx->bc, &alu);10196if (r)10197return r;10198}1019910200return tgsi_helper_copy(ctx, inst);10201}1020210203static int tgsi_eg_arl(struct r600_shader_ctx *ctx)10204{10205struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;10206struct r600_bytecode_alu alu;10207int r;10208int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);10209unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);1021010211assert(inst->Dst[0].Register.Index < 3);10212memset(&alu, 0, sizeof(struct r600_bytecode_alu));1021310214switch (inst->Instruction.Opcode) {10215case TGSI_OPCODE_ARL:10216alu.op = ALU_OP1_FLT_TO_INT_FLOOR;10217break;10218case TGSI_OPCODE_ARR:10219alu.op = ALU_OP1_FLT_TO_INT;10220break;10221case TGSI_OPCODE_UARL:10222alu.op = ALU_OP1_MOV;10223break;10224default:10225assert(0);10226return -1;10227}1022810229for (i = 0; i <= lasti; ++i) {10230if (!(inst->Dst[0].Register.WriteMask & (1 << i)))10231continue;10232r600_bytecode_src(&alu.src[0], &ctx->src[0], i);10233alu.last = i == lasti;10234alu.dst.sel = reg;10235alu.dst.chan = i;10236alu.dst.write = 1;10237r = r600_bytecode_add_alu(ctx->bc, &alu);10238if (r)10239return r;10240}1024110242if (inst->Dst[0].Register.Index > 0)10243ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;10244else10245ctx->bc->ar_loaded = 0;1024610247return 0;10248}10249static int tgsi_r600_arl(struct r600_shader_ctx *ctx)10250{10251struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;10252struct r600_bytecode_alu alu;10253int r;10254int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);1025510256switch (inst->Instruction.Opcode) {10257case TGSI_OPCODE_ARL:10258memset(&alu, 0, sizeof(alu));10259alu.op = ALU_OP1_FLOOR;10260alu.dst.sel = ctx->bc->ar_reg;10261alu.dst.write = 1;10262for (i = 0; i <= lasti; ++i) {10263if 
(inst->Dst[0].Register.WriteMask & (1 << i)) {10264alu.dst.chan = i;10265r600_bytecode_src(&alu.src[0], &ctx->src[0], i);10266alu.last = i == lasti;10267if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))10268return r;10269}10270}1027110272memset(&alu, 0, sizeof(alu));10273alu.op = ALU_OP1_FLT_TO_INT;10274alu.src[0].sel = ctx->bc->ar_reg;10275alu.dst.sel = ctx->bc->ar_reg;10276alu.dst.write = 1;10277/* FLT_TO_INT is trans-only on r600/r700 */10278alu.last = TRUE;10279for (i = 0; i <= lasti; ++i) {10280alu.dst.chan = i;10281alu.src[0].chan = i;10282if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))10283return r;10284}10285break;10286case TGSI_OPCODE_ARR:10287memset(&alu, 0, sizeof(alu));10288alu.op = ALU_OP1_FLT_TO_INT;10289alu.dst.sel = ctx->bc->ar_reg;10290alu.dst.write = 1;10291/* FLT_TO_INT is trans-only on r600/r700 */10292alu.last = TRUE;10293for (i = 0; i <= lasti; ++i) {10294if (inst->Dst[0].Register.WriteMask & (1 << i)) {10295alu.dst.chan = i;10296r600_bytecode_src(&alu.src[0], &ctx->src[0], i);10297if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))10298return r;10299}10300}10301break;10302case TGSI_OPCODE_UARL:10303memset(&alu, 0, sizeof(alu));10304alu.op = ALU_OP1_MOV;10305alu.dst.sel = ctx->bc->ar_reg;10306alu.dst.write = 1;10307for (i = 0; i <= lasti; ++i) {10308if (inst->Dst[0].Register.WriteMask & (1 << i)) {10309alu.dst.chan = i;10310r600_bytecode_src(&alu.src[0], &ctx->src[0], i);10311alu.last = i == lasti;10312if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))10313return r;10314}10315}10316break;10317default:10318assert(0);10319return -1;10320}1032110322ctx->bc->ar_loaded = 0;10323return 0;10324}1032510326static int tgsi_opdst(struct r600_shader_ctx *ctx)10327{10328struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;10329struct r600_bytecode_alu alu;10330int i, r = 0;1033110332for (i = 0; i < 4; i++) {10333memset(&alu, 0, sizeof(struct r600_bytecode_alu));1033410335alu.op = ALU_OP2_MUL;10336tgsi_dst(ctx, &inst->Dst[0], i, 
&alu.dst);1033710338if (i == 0 || i == 3) {10339alu.src[0].sel = V_SQ_ALU_SRC_1;10340} else {10341r600_bytecode_src(&alu.src[0], &ctx->src[0], i);10342}1034310344if (i == 0 || i == 2) {10345alu.src[1].sel = V_SQ_ALU_SRC_1;10346} else {10347r600_bytecode_src(&alu.src[1], &ctx->src[1], i);10348}10349if (i == 3)10350alu.last = 1;10351r = r600_bytecode_add_alu(ctx->bc, &alu);10352if (r)10353return r;10354}10355return 0;10356}1035710358static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type,10359struct r600_bytecode_alu_src *src)10360{10361struct r600_bytecode_alu alu;10362int r;1036310364memset(&alu, 0, sizeof(struct r600_bytecode_alu));10365alu.op = opcode;10366alu.execute_mask = 1;10367alu.update_pred = 1;1036810369alu.dst.sel = ctx->temp_reg;10370alu.dst.write = 1;10371alu.dst.chan = 0;1037210373alu.src[0] = *src;10374alu.src[1].sel = V_SQ_ALU_SRC_0;10375alu.src[1].chan = 0;1037610377alu.last = 1;1037810379r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);10380if (r)10381return r;10382return 0;10383}1038410385static int pops(struct r600_shader_ctx *ctx, int pops)10386{10387unsigned force_pop = ctx->bc->force_add_cf;1038810389if (!force_pop) {10390int alu_pop = 3;10391if (ctx->bc->cf_last) {10392if (ctx->bc->cf_last->op == CF_OP_ALU)10393alu_pop = 0;10394else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)10395alu_pop = 1;10396}10397alu_pop += pops;10398if (alu_pop == 1) {10399ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;10400ctx->bc->force_add_cf = 1;10401} else if (alu_pop == 2) {10402ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;10403ctx->bc->force_add_cf = 1;10404} else {10405force_pop = 1;10406}10407}1040810409if (force_pop) {10410r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);10411ctx->bc->cf_last->pop_count = pops;10412ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;10413}1041410415return 0;10416}1041710418static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx,10419unsigned reason)10420{10421struct 
r600_stack_info *stack = &ctx->bc->stack;10422unsigned elements;10423int entries;1042410425unsigned entry_size = stack->entry_size;1042610427elements = (stack->loop + stack->push_wqm ) * entry_size;10428elements += stack->push;1042910430switch (ctx->bc->chip_class) {10431case R600:10432case R700:10433/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on10434* the stack must be reserved to hold the current active/continue10435* masks */10436if (reason == FC_PUSH_VPM || stack->push > 0) {10437elements += 2;10438}10439break;1044010441case CAYMAN:10442/* r9xx: any stack operation on empty stack consumes 2 additional10443* elements */10444elements += 2;1044510446FALLTHROUGH;10447/* FIXME: do the two elements added above cover the cases for the10448* r8xx+ below? */1044910450case EVERGREEN:10451/* r8xx+: 2 extra elements are not always required, but one extra10452* element must be added for each of the following cases:10453* 1. There is an ALU_ELSE_AFTER instruction at the point of greatest10454* stack usage.10455* (Currently we don't use ALU_ELSE_AFTER.)10456* 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM10457* PUSH instruction executed.10458*10459* NOTE: it seems we also need to reserve additional element in some10460* other cases, e.g. 
when we have 4 levels of PUSH_VPM in the shader,10461* then STACK_SIZE should be 2 instead of 1 */10462if (reason == FC_PUSH_VPM || stack->push > 0) {10463elements += 1;10464}10465break;1046610467default:10468assert(0);10469break;10470}1047110472/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 410473* for all chips, so we use 4 in the final formula, not the real entry_size10474* for the chip */10475entry_size = 4;1047610477entries = (elements + (entry_size - 1)) / entry_size;1047810479if (entries > stack->max_entries)10480stack->max_entries = entries;10481return elements;10482}1048310484static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)10485{10486switch(reason) {10487case FC_PUSH_VPM:10488--ctx->bc->stack.push;10489assert(ctx->bc->stack.push >= 0);10490break;10491case FC_PUSH_WQM:10492--ctx->bc->stack.push_wqm;10493assert(ctx->bc->stack.push_wqm >= 0);10494break;10495case FC_LOOP:10496--ctx->bc->stack.loop;10497assert(ctx->bc->stack.loop >= 0);10498break;10499default:10500assert(0);10501break;10502}10503}1050410505static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason)10506{10507switch (reason) {10508case FC_PUSH_VPM:10509++ctx->bc->stack.push;10510break;10511case FC_PUSH_WQM:10512++ctx->bc->stack.push_wqm;10513break;10514case FC_LOOP:10515++ctx->bc->stack.loop;10516break;10517default:10518assert(0);10519}1052010521return callstack_update_max_depth(ctx, reason);10522}1052310524static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)10525{10526struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];1052710528sp->mid = realloc((void *)sp->mid,10529sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));10530sp->mid[sp->num_mid] = ctx->bc->cf_last;10531sp->num_mid++;10532}1053310534static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)10535{10536assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));10537ctx->bc->fc_stack[ctx->bc->fc_sp].type = 
type;10538ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;10539ctx->bc->fc_sp++;10540}1054110542static void fc_poplevel(struct r600_shader_ctx *ctx)10543{10544struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];10545free(sp->mid);10546sp->mid = NULL;10547sp->num_mid = 0;10548sp->start = NULL;10549sp->type = 0;10550ctx->bc->fc_sp--;10551}1055210553#if 010554static int emit_return(struct r600_shader_ctx *ctx)10555{10556r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));10557return 0;10558}1055910560static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)10561{1056210563r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));10564ctx->bc->cf_last->pop_count = pops;10565/* XXX work out offset */10566return 0;10567}1056810569static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)10570{10571return 0;10572}1057310574static void emit_testflag(struct r600_shader_ctx *ctx)10575{1057610577}1057810579static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)10580{10581emit_testflag(ctx);10582emit_jump_to_offset(ctx, 1, 4);10583emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);10584pops(ctx, ifidx + 1);10585emit_return(ctx);10586}1058710588static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)10589{10590emit_testflag(ctx);1059110592r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);10593ctx->bc->cf_last->pop_count = 1;1059410595fc_set_mid(ctx, fc_sp);1059610597pops(ctx, 1);10598}10599#endif1060010601static int emit_if(struct r600_shader_ctx *ctx, int opcode,10602struct r600_bytecode_alu_src *src)10603{10604int alu_type = CF_OP_ALU_PUSH_BEFORE;10605bool needs_workaround = false;10606int elems = callstack_push(ctx, FC_PUSH_VPM);1060710608if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1)10609needs_workaround = true;1061010611if (ctx->bc->chip_class == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) {10612unsigned dmod1 = (elems - 1) % 
ctx->bc->stack.entry_size;10613unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;1061410615if (elems && (!dmod1 || !dmod2))10616needs_workaround = true;10617}1061810619/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by10620* LOOP_STARTxxx for nested loops may put the branch stack into a state10621* such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this10622* by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */10623if (needs_workaround) {10624r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);10625ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;10626alu_type = CF_OP_ALU;10627}1062810629emit_logic_pred(ctx, opcode, alu_type, src);1063010631r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);1063210633fc_pushlevel(ctx, FC_IF);1063410635return 0;10636}1063710638static int tgsi_if(struct r600_shader_ctx *ctx)10639{10640struct r600_bytecode_alu_src alu_src;10641r600_bytecode_src(&alu_src, &ctx->src[0], 0);1064210643return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src);10644}1064510646static int tgsi_uif(struct r600_shader_ctx *ctx)10647{10648struct r600_bytecode_alu_src alu_src;10649r600_bytecode_src(&alu_src, &ctx->src[0], 0);10650return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);10651}1065210653static int tgsi_else(struct r600_shader_ctx *ctx)10654{10655r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);10656ctx->bc->cf_last->pop_count = 1;1065710658fc_set_mid(ctx, ctx->bc->fc_sp - 1);10659ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;10660return 0;10661}1066210663static int tgsi_endif(struct r600_shader_ctx *ctx)10664{10665int offset = 2;10666pops(ctx, 1);10667if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {10668R600_ERR("if/endif unbalanced in shader\n");10669return -1;10670}1067110672/* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */10673if (ctx->bc->cf_last->eg_alu_extended)10674offset += 2;1067510676if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) 
{10677ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset;10678ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;10679} else {10680ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset;10681}10682fc_poplevel(ctx);1068310684callstack_pop(ctx, FC_PUSH_VPM);10685return 0;10686}1068710688static int tgsi_bgnloop(struct r600_shader_ctx *ctx)10689{10690/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not10691* limited to 4096 iterations, like the other LOOP_* instructions. */10692r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);1069310694fc_pushlevel(ctx, FC_LOOP);1069510696/* check stack depth */10697callstack_push(ctx, FC_LOOP);10698return 0;10699}1070010701static int tgsi_endloop(struct r600_shader_ctx *ctx)10702{10703int i;1070410705r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);1070610707if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {10708R600_ERR("loop/endloop in shader code are not paired.\n");10709return -EINVAL;10710}1071110712/* fixup loop pointers - from r600isa10713LOOP END points to CF after LOOP START,10714LOOP START point to CF after LOOP END10715BRK/CONT point to LOOP END CF10716*/10717ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;1071810719ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;1072010721for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {10722ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;10723}10724/* XXX add LOOPRET support */10725fc_poplevel(ctx);10726callstack_pop(ctx, FC_LOOP);10727return 0;10728}1072910730static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)10731{10732unsigned int fscp;1073310734for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)10735{10736if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)10737break;10738}1073910740if (fscp == 0) {10741R600_ERR("Break not inside loop/endloop pair\n");10742return 
-EINVAL;10743}1074410745r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);1074610747fc_set_mid(ctx, fscp - 1);1074810749return 0;10750}1075110752static int tgsi_gs_emit(struct r600_shader_ctx *ctx)10753{10754struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;10755int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];10756int r;1075710758if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)10759emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);1076010761r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);10762if (!r) {10763ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream10764if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)10765return emit_inc_ring_offset(ctx, stream, TRUE);10766}10767return r;10768}1076910770static int tgsi_umad(struct r600_shader_ctx *ctx)10771{10772struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;10773struct r600_bytecode_alu alu;10774int i, j, r;10775int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);1077610777/* src0 * src1 */10778for (i = 0; i < lasti + 1; i++) {10779if (!(inst->Dst[0].Register.WriteMask & (1 << i)))10780continue;1078110782memset(&alu, 0, sizeof(struct r600_bytecode_alu));1078310784alu.dst.chan = i;10785alu.dst.sel = ctx->temp_reg;10786alu.dst.write = 1;1078710788alu.op = ALU_OP2_MULLO_UINT;10789for (j = 0; j < 2; j++) {10790r600_bytecode_src(&alu.src[j], &ctx->src[j], i);10791}1079210793alu.last = 1;10794r = emit_mul_int_op(ctx->bc, &alu);10795if (r)10796return r;10797}107981079910800for (i = 0; i < lasti + 1; i++) {10801if (!(inst->Dst[0].Register.WriteMask & (1 << i)))10802continue;1080310804memset(&alu, 0, sizeof(struct r600_bytecode_alu));10805tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);1080610807alu.op = ALU_OP2_ADD_INT;1080810809alu.src[0].sel = ctx->temp_reg;10810alu.src[0].chan = i;1081110812r600_bytecode_src(&alu.src[1], &ctx->src[2], i);10813if (i == lasti) 
{10814alu.last = 1;10815}10816r = r600_bytecode_add_alu(ctx->bc, &alu);10817if (r)10818return r;10819}10820return 0;10821}1082210823static int tgsi_pk2h(struct r600_shader_ctx *ctx)10824{10825struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;10826struct r600_bytecode_alu alu;10827int r, i;10828int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);1082910830/* temp.xy = f32_to_f16(src) */10831memset(&alu, 0, sizeof(struct r600_bytecode_alu));10832alu.op = ALU_OP1_FLT32_TO_FLT16;10833alu.dst.chan = 0;10834alu.dst.sel = ctx->temp_reg;10835alu.dst.write = 1;10836r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);10837r = r600_bytecode_add_alu(ctx->bc, &alu);10838if (r)10839return r;10840alu.dst.chan = 1;10841r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);10842alu.last = 1;10843r = r600_bytecode_add_alu(ctx->bc, &alu);10844if (r)10845return r;1084610847/* dst.x = temp.y * 0x10000 + temp.x */10848for (i = 0; i < lasti + 1; i++) {10849if (!(inst->Dst[0].Register.WriteMask & (1 << i)))10850continue;1085110852memset(&alu, 0, sizeof(struct r600_bytecode_alu));10853alu.op = ALU_OP3_MULADD_UINT24;10854alu.is_op3 = 1;10855tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);10856alu.last = i == lasti;10857alu.src[0].sel = ctx->temp_reg;10858alu.src[0].chan = 1;10859alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;10860alu.src[1].value = 0x10000;10861alu.src[2].sel = ctx->temp_reg;10862alu.src[2].chan = 0;10863r = r600_bytecode_add_alu(ctx->bc, &alu);10864if (r)10865return r;10866}1086710868return 0;10869}1087010871static int tgsi_up2h(struct r600_shader_ctx *ctx)10872{10873struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;10874struct r600_bytecode_alu alu;10875int r, i;10876int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);1087710878/* temp.x = src.x */10879/* note: no need to mask out the high bits */10880memset(&alu, 0, sizeof(struct r600_bytecode_alu));10881alu.op = ALU_OP1_MOV;10882alu.dst.chan = 0;10883alu.dst.sel = 
ctx->temp_reg;10884alu.dst.write = 1;10885r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);10886r = r600_bytecode_add_alu(ctx->bc, &alu);10887if (r)10888return r;1088910890/* temp.y = src.x >> 16 */10891memset(&alu, 0, sizeof(struct r600_bytecode_alu));10892alu.op = ALU_OP2_LSHR_INT;10893alu.dst.chan = 1;10894alu.dst.sel = ctx->temp_reg;10895alu.dst.write = 1;10896r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);10897alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;10898alu.src[1].value = 16;10899alu.last = 1;10900r = r600_bytecode_add_alu(ctx->bc, &alu);10901if (r)10902return r;1090310904/* dst.wz = dst.xy = f16_to_f32(temp.xy) */10905for (i = 0; i < lasti + 1; i++) {10906if (!(inst->Dst[0].Register.WriteMask & (1 << i)))10907continue;10908memset(&alu, 0, sizeof(struct r600_bytecode_alu));10909tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);10910alu.op = ALU_OP1_FLT16_TO_FLT32;10911alu.src[0].sel = ctx->temp_reg;10912alu.src[0].chan = i % 2;10913alu.last = i == lasti;10914r = r600_bytecode_add_alu(ctx->bc, &alu);10915if (r)10916return r;10917}1091810919return 0;10920}1092110922static int tgsi_bfe(struct r600_shader_ctx *ctx)10923{10924struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;10925struct r600_bytecode_alu alu;10926int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);10927int r, i;10928int dst = -1;1092910930if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&10931inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||10932(inst->Src[2].Register.File == inst->Dst[0].Register.File &&10933inst->Src[2].Register.Index == inst->Dst[0].Register.Index))10934dst = r600_get_temp(ctx);1093510936r = tgsi_op3_dst(ctx, dst);10937if (r)10938return r;1093910940for (i = 0; i < lasti + 1; i++) {10941memset(&alu, 0, sizeof(struct r600_bytecode_alu));10942alu.op = ALU_OP2_SETGE_INT;10943r600_bytecode_src(&alu.src[0], &ctx->src[2], i);10944alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;10945alu.src[1].value = 32;10946alu.dst.sel = 
ctx->temp_reg;10947alu.dst.chan = i;10948alu.dst.write = 1;10949if (i == lasti)10950alu.last = 1;10951r = r600_bytecode_add_alu(ctx->bc, &alu);10952if (r)10953return r;10954}1095510956for (i = 0; i < lasti + 1; i++) {10957memset(&alu, 0, sizeof(struct r600_bytecode_alu));10958alu.op = ALU_OP3_CNDE_INT;10959alu.is_op3 = 1;10960alu.src[0].sel = ctx->temp_reg;10961alu.src[0].chan = i;1096210963tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);10964if (dst != -1)10965alu.src[1].sel = dst;10966else10967alu.src[1].sel = alu.dst.sel;10968alu.src[1].chan = i;10969r600_bytecode_src(&alu.src[2], &ctx->src[0], i);10970alu.dst.write = 1;10971if (i == lasti)10972alu.last = 1;10973r = r600_bytecode_add_alu(ctx->bc, &alu);10974if (r)10975return r;10976}1097710978return 0;10979}1098010981static int tgsi_clock(struct r600_shader_ctx *ctx)10982{10983struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;10984struct r600_bytecode_alu alu;10985int r;1098610987memset(&alu, 0, sizeof(struct r600_bytecode_alu));10988alu.op = ALU_OP1_MOV;10989tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);10990alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;10991r = r600_bytecode_add_alu(ctx->bc, &alu);10992if (r)10993return r;10994memset(&alu, 0, sizeof(struct r600_bytecode_alu));10995alu.op = ALU_OP1_MOV;10996tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);10997alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;10998alu.last = 1;10999r = r600_bytecode_add_alu(ctx->bc, &alu);11000if (r)11001return r;11002return 0;11003}1100411005static int emit_u64add(struct r600_shader_ctx *ctx, int op,11006int treg,11007int src0_sel, int src0_chan,11008int src1_sel, int src1_chan)11009{11010struct r600_bytecode_alu alu;11011int r;11012int opc;1101311014if (op == ALU_OP2_ADD_INT)11015opc = ALU_OP2_ADDC_UINT;11016else11017opc = ALU_OP2_SUBB_UINT;1101811019memset(&alu, 0, sizeof(struct r600_bytecode_alu));11020alu.op = op; ;11021alu.dst.sel = treg;11022alu.dst.chan = 0;11023alu.dst.write = 1;11024alu.src[0].sel = 
src0_sel;11025alu.src[0].chan = src0_chan + 0;11026alu.src[1].sel = src1_sel;11027alu.src[1].chan = src1_chan + 0;11028alu.src[1].neg = 0;11029r = r600_bytecode_add_alu(ctx->bc, &alu);11030if (r)11031return r;1103211033memset(&alu, 0, sizeof(struct r600_bytecode_alu));11034alu.op = op;11035alu.dst.sel = treg;11036alu.dst.chan = 1;11037alu.dst.write = 1;11038alu.src[0].sel = src0_sel;11039alu.src[0].chan = src0_chan + 1;11040alu.src[1].sel = src1_sel;11041alu.src[1].chan = src1_chan + 1;11042alu.src[1].neg = 0;11043r = r600_bytecode_add_alu(ctx->bc, &alu);11044if (r)11045return r;1104611047memset(&alu, 0, sizeof(struct r600_bytecode_alu));11048alu.op = opc;11049alu.dst.sel = treg;11050alu.dst.chan = 2;11051alu.dst.write = 1;11052alu.last = 1;11053alu.src[0].sel = src0_sel;11054alu.src[0].chan = src0_chan + 0;11055alu.src[1].sel = src1_sel;11056alu.src[1].chan = src1_chan + 0;11057alu.src[1].neg = 0;11058r = r600_bytecode_add_alu(ctx->bc, &alu);11059if (r)11060return r;1106111062memset(&alu, 0, sizeof(struct r600_bytecode_alu));11063alu.op = op;11064alu.dst.sel = treg;11065alu.dst.chan = 1;11066alu.dst.write = 1;11067alu.src[0].sel = treg;11068alu.src[0].chan = 1;11069alu.src[1].sel = treg;11070alu.src[1].chan = 2;11071alu.last = 1;11072r = r600_bytecode_add_alu(ctx->bc, &alu);11073if (r)11074return r;11075return 0;11076}1107711078static int egcm_u64add(struct r600_shader_ctx *ctx)11079{11080struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;11081struct r600_bytecode_alu alu;11082int r;11083int treg = ctx->temp_reg;11084int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT;1108511086if (ctx->src[1].neg) {11087op = ALU_OP2_SUB_INT;11088opc = ALU_OP2_SUBB_UINT;11089}11090memset(&alu, 0, sizeof(struct r600_bytecode_alu));11091alu.op = op; ;11092alu.dst.sel = treg;11093alu.dst.chan = 0;11094alu.dst.write = 1;11095r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);11096r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);11097alu.src[1].neg = 0;11098r = 
r600_bytecode_add_alu(ctx->bc, &alu);11099if (r)11100return r;1110111102memset(&alu, 0, sizeof(struct r600_bytecode_alu));11103alu.op = op;11104alu.dst.sel = treg;11105alu.dst.chan = 1;11106alu.dst.write = 1;11107r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);11108r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);11109alu.src[1].neg = 0;11110r = r600_bytecode_add_alu(ctx->bc, &alu);11111if (r)11112return r;1111311114memset(&alu, 0, sizeof(struct r600_bytecode_alu));11115alu.op = opc ;11116alu.dst.sel = treg;11117alu.dst.chan = 2;11118alu.dst.write = 1;11119alu.last = 1;11120r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);11121r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);11122alu.src[1].neg = 0;11123r = r600_bytecode_add_alu(ctx->bc, &alu);11124if (r)11125return r;1112611127memset(&alu, 0, sizeof(struct r600_bytecode_alu));11128alu.op = op;11129tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);11130alu.src[0].sel = treg;11131alu.src[0].chan = 1;11132alu.src[1].sel = treg;11133alu.src[1].chan = 2;11134alu.last = 1;11135r = r600_bytecode_add_alu(ctx->bc, &alu);11136if (r)11137return r;11138memset(&alu, 0, sizeof(struct r600_bytecode_alu));11139alu.op = ALU_OP1_MOV;11140tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);11141alu.src[0].sel = treg;11142alu.src[0].chan = 0;11143alu.last = 1;11144r = r600_bytecode_add_alu(ctx->bc, &alu);11145if (r)11146return r;11147return 0;11148}111491115011151static int egcm_i64neg(struct r600_shader_ctx *ctx)11152{11153struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;11154struct r600_bytecode_alu alu;11155int r;11156int treg = ctx->temp_reg;11157const int op = ALU_OP2_SUB_INT;11158const int opc = ALU_OP2_SUBB_UINT;1115911160memset(&alu, 0, sizeof(struct r600_bytecode_alu));11161alu.op = op; ;11162alu.dst.sel = treg;11163alu.dst.chan = 0;11164alu.dst.write = 1;11165alu.src[0].sel = V_SQ_ALU_SRC_0;11166r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);11167alu.src[1].neg = 0;11168r = r600_bytecode_add_alu(ctx->bc, &alu);11169if 
(r)11170return r;1117111172memset(&alu, 0, sizeof(struct r600_bytecode_alu));11173alu.op = op;11174alu.dst.sel = treg;11175alu.dst.chan = 1;11176alu.dst.write = 1;11177alu.src[0].sel = V_SQ_ALU_SRC_0;11178r600_bytecode_src(&alu.src[1], &ctx->src[0], 1);11179alu.src[1].neg = 0;11180r = r600_bytecode_add_alu(ctx->bc, &alu);11181if (r)11182return r;1118311184memset(&alu, 0, sizeof(struct r600_bytecode_alu));11185alu.op = opc ;11186alu.dst.sel = treg;11187alu.dst.chan = 2;11188alu.dst.write = 1;11189alu.last = 1;11190alu.src[0].sel = V_SQ_ALU_SRC_0;11191r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);11192alu.src[1].neg = 0;11193r = r600_bytecode_add_alu(ctx->bc, &alu);11194if (r)11195return r;1119611197memset(&alu, 0, sizeof(struct r600_bytecode_alu));11198alu.op = op;11199tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);11200alu.src[0].sel = treg;11201alu.src[0].chan = 1;11202alu.src[1].sel = treg;11203alu.src[1].chan = 2;11204alu.last = 1;11205r = r600_bytecode_add_alu(ctx->bc, &alu);11206if (r)11207return r;11208memset(&alu, 0, sizeof(struct r600_bytecode_alu));11209alu.op = ALU_OP1_MOV;11210tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);11211alu.src[0].sel = treg;11212alu.src[0].chan = 0;11213alu.last = 1;11214r = r600_bytecode_add_alu(ctx->bc, &alu);11215if (r)11216return r;11217return 0;11218}1121911220/* result.y = mul_high a, b11221result.x = mul a,b11222result.y += a.x * b.y + a.y * b.x;11223*/11224static int egcm_u64mul(struct r600_shader_ctx *ctx)11225{11226struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;11227struct r600_bytecode_alu alu;11228int r;11229int treg = ctx->temp_reg;1123011231/* temp.x = mul_lo a.x, b.x */11232memset(&alu, 0, sizeof(struct r600_bytecode_alu));11233alu.op = ALU_OP2_MULLO_UINT;11234alu.dst.sel = treg;11235alu.dst.chan = 0;11236alu.dst.write = 1;11237r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);11238r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);11239r = emit_mul_int_op(ctx->bc, &alu);11240if (r)11241return 
r;1124211243/* temp.y = mul_hi a.x, b.x */11244memset(&alu, 0, sizeof(struct r600_bytecode_alu));11245alu.op = ALU_OP2_MULHI_UINT;11246alu.dst.sel = treg;11247alu.dst.chan = 1;11248alu.dst.write = 1;11249r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);11250r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);11251r = emit_mul_int_op(ctx->bc, &alu);11252if (r)11253return r;1125411255/* temp.z = mul a.x, b.y */11256memset(&alu, 0, sizeof(struct r600_bytecode_alu));11257alu.op = ALU_OP2_MULLO_UINT;11258alu.dst.sel = treg;11259alu.dst.chan = 2;11260alu.dst.write = 1;11261r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);11262r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);11263r = emit_mul_int_op(ctx->bc, &alu);11264if (r)11265return r;1126611267/* temp.w = mul a.y, b.x */11268memset(&alu, 0, sizeof(struct r600_bytecode_alu));11269alu.op = ALU_OP2_MULLO_UINT;11270alu.dst.sel = treg;11271alu.dst.chan = 3;11272alu.dst.write = 1;11273r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);11274r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);11275r = emit_mul_int_op(ctx->bc, &alu);11276if (r)11277return r;1127811279/* temp.z = temp.z + temp.w */11280memset(&alu, 0, sizeof(struct r600_bytecode_alu));11281alu.op = ALU_OP2_ADD_INT;11282alu.dst.sel = treg;11283alu.dst.chan = 2;11284alu.dst.write = 1;11285alu.src[0].sel = treg;11286alu.src[0].chan = 2;11287alu.src[1].sel = treg;11288alu.src[1].chan = 3;11289alu.last = 1;11290r = r600_bytecode_add_alu(ctx->bc, &alu);11291if (r)11292return r;1129311294/* temp.y = temp.y + temp.z */11295memset(&alu, 0, sizeof(struct r600_bytecode_alu));11296alu.op = ALU_OP2_ADD_INT;11297alu.dst.sel = treg;11298alu.dst.chan = 1;11299alu.dst.write = 1;11300alu.src[0].sel = treg;11301alu.src[0].chan = 1;11302alu.src[1].sel = treg;11303alu.src[1].chan = 2;11304alu.last = 1;11305r = r600_bytecode_add_alu(ctx->bc, &alu);11306if (r)11307return r;1130811309/* dst.x = temp.x */11310memset(&alu, 0, sizeof(struct r600_bytecode_alu));11311alu.op = 
ALU_OP1_MOV;11312tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);11313alu.src[0].sel = treg;11314alu.src[0].chan = 0;11315r = r600_bytecode_add_alu(ctx->bc, &alu);11316if (r)11317return r;1131811319/* dst.y = temp.y */11320memset(&alu, 0, sizeof(struct r600_bytecode_alu));11321alu.op = ALU_OP1_MOV;11322tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);11323alu.src[0].sel = treg;11324alu.src[0].chan = 1;11325alu.last = 1;11326r = r600_bytecode_add_alu(ctx->bc, &alu);11327if (r)11328return r;1132911330return 0;11331}1133211333static int emit_u64sge(struct r600_shader_ctx *ctx,11334int treg,11335int src0_sel, int src0_base_chan,11336int src1_sel, int src1_base_chan)11337{11338int r;11339/* for 64-bit sge */11340/* result = (src0.y > src1.y) || ((src0.y == src1.y) && src0.x >= src1.x)) */11341r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,11342treg, 1,11343src0_sel, src0_base_chan + 1,11344src1_sel, src1_base_chan + 1);11345if (r)11346return r;1134711348r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,11349treg, 0,11350src0_sel, src0_base_chan,11351src1_sel, src1_base_chan);11352if (r)11353return r;1135411355r = single_alu_op2(ctx, ALU_OP2_SETE_INT,11356treg, 2,11357src0_sel, src0_base_chan + 1,11358src1_sel, src1_base_chan + 1);11359if (r)11360return r;1136111362r = single_alu_op2(ctx, ALU_OP2_AND_INT,11363treg, 0,11364treg, 0,11365treg, 2);11366if (r)11367return r;1136811369r = single_alu_op2(ctx, ALU_OP2_OR_INT,11370treg, 0,11371treg, 0,11372treg, 1);11373if (r)11374return r;11375return 0;11376}1137711378/* this isn't a complete div it's just enough for qbo shader to work */11379static int egcm_u64div(struct r600_shader_ctx *ctx)11380{11381struct r600_bytecode_alu alu;11382struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;11383int r, i;11384struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;1138511386/* make sure we are dividing my a const with 0 in the high bits */11387if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)11388return 
-1;11389if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)11390return -1;11391/* make sure we are doing one division */11392if (inst->Dst[0].Register.WriteMask != 0x3)11393return -1;1139411395/* emit_if uses ctx->temp_reg so we can't */11396int treg = r600_get_temp(ctx);11397int tmp_num = r600_get_temp(ctx);11398int sub_tmp = r600_get_temp(ctx);1139911400/* tmp quot are tmp_num.zw */11401r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);11402r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);11403r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);11404r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);1140511406/* MOV tmp_num.xy, numerator */11407r = single_alu_op2(ctx, ALU_OP1_MOV,11408tmp_num, 0,11409alu_num_lo.sel, alu_num_lo.chan,114100, 0);11411if (r)11412return r;11413r = single_alu_op2(ctx, ALU_OP1_MOV,11414tmp_num, 1,11415alu_num_hi.sel, alu_num_hi.chan,114160, 0);11417if (r)11418return r;1141911420r = single_alu_op2(ctx, ALU_OP1_MOV,11421tmp_num, 2,11422V_SQ_ALU_SRC_LITERAL, 0,114230, 0);11424if (r)11425return r;1142611427r = single_alu_op2(ctx, ALU_OP1_MOV,11428tmp_num, 3,11429V_SQ_ALU_SRC_LITERAL, 0,114300, 0);11431if (r)11432return r;1143311434/* treg 0 is log2_denom */11435/* normally this gets the MSB for the denom high value11436- however we know this will always be 0 here. 
*/11437r = single_alu_op2(ctx,11438ALU_OP1_MOV,11439treg, 0,11440V_SQ_ALU_SRC_LITERAL, 32,114410, 0);11442if (r)11443return r;1144411445/* normally check demon hi for 0, but we know it is already */11446/* t0.z = num_hi >= denom_lo */11447r = single_alu_op2(ctx,11448ALU_OP2_SETGE_UINT,11449treg, 1,11450alu_num_hi.sel, alu_num_hi.chan,11451V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);11452if (r)11453return r;1145411455memset(&alu_src, 0, sizeof(alu_src));11456alu_src.sel = treg;11457alu_src.chan = 1;11458r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);11459if (r)11460return r;1146111462/* for loops in here */11463/* get msb t0.x = msb(src[1].x) first */11464int msb_lo = util_last_bit(alu_denom_lo.value);11465r = single_alu_op2(ctx, ALU_OP1_MOV,11466treg, 0,11467V_SQ_ALU_SRC_LITERAL, msb_lo,114680, 0);11469if (r)11470return r;1147111472/* unroll the asm here */11473for (i = 0; i < 31; i++) {11474r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,11475treg, 2,11476V_SQ_ALU_SRC_LITERAL, i,11477treg, 0);11478if (r)11479return r;1148011481/* we can do this on the CPU */11482uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);11483/* t0.z = tmp_num.y >= t0.z */11484r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,11485treg, 1,11486tmp_num, 1,11487V_SQ_ALU_SRC_LITERAL, denom_lo_shl);11488if (r)11489return r;1149011491r = single_alu_op2(ctx, ALU_OP2_AND_INT,11492treg, 1,11493treg, 1,11494treg, 2);11495if (r)11496return r;1149711498memset(&alu_src, 0, sizeof(alu_src));11499alu_src.sel = treg;11500alu_src.chan = 1;11501r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);11502if (r)11503return r;1150411505r = single_alu_op2(ctx, ALU_OP2_SUB_INT,11506tmp_num, 1,11507tmp_num, 1,11508V_SQ_ALU_SRC_LITERAL, denom_lo_shl);11509if (r)11510return r;1151111512r = single_alu_op2(ctx, ALU_OP2_OR_INT,11513tmp_num, 3,11514tmp_num, 3,11515V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));11516if (r)11517return r;1151811519r = tgsi_endif(ctx);11520if (r)11521return r;11522}1152311524/* log2_denom is always <= 31, 
so manually peel the last loop11525* iteration.11526*/11527r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,11528treg, 1,11529tmp_num, 1,11530V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);11531if (r)11532return r;1153311534memset(&alu_src, 0, sizeof(alu_src));11535alu_src.sel = treg;11536alu_src.chan = 1;11537r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);11538if (r)11539return r;1154011541r = single_alu_op2(ctx, ALU_OP2_SUB_INT,11542tmp_num, 1,11543tmp_num, 1,11544V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);11545if (r)11546return r;1154711548r = single_alu_op2(ctx, ALU_OP2_OR_INT,11549tmp_num, 3,11550tmp_num, 3,11551V_SQ_ALU_SRC_LITERAL, 1U);11552if (r)11553return r;11554r = tgsi_endif(ctx);11555if (r)11556return r;1155711558r = tgsi_endif(ctx);11559if (r)11560return r;1156111562/* onto the second loop to unroll */11563for (i = 0; i < 31; i++) {11564r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,11565treg, 1,11566V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),11567treg, 0);11568if (r)11569return r;1157011571uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);11572r = single_alu_op2(ctx, ALU_OP1_MOV,11573treg, 2,11574V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),115750, 0);11576if (r)11577return r;1157811579r = single_alu_op2(ctx, ALU_OP1_MOV,11580treg, 3,11581V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),115820, 0);11583if (r)11584return r;1158511586r = emit_u64sge(ctx, sub_tmp,11587tmp_num, 0,11588treg, 2);11589if (r)11590return r;1159111592r = single_alu_op2(ctx, ALU_OP2_AND_INT,11593treg, 1,11594treg, 1,11595sub_tmp, 0);11596if (r)11597return r;1159811599memset(&alu_src, 0, sizeof(alu_src));11600alu_src.sel = treg;11601alu_src.chan = 1;11602r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);11603if (r)11604return r;116051160611607r = emit_u64add(ctx, ALU_OP2_SUB_INT,11608sub_tmp,11609tmp_num, 0,11610treg, 2);11611if (r)11612return r;1161311614r = single_alu_op2(ctx, ALU_OP1_MOV,11615tmp_num, 0,11616sub_tmp, 0,116170, 0);11618if (r)11619return r;1162011621r = 
single_alu_op2(ctx, ALU_OP1_MOV,11622tmp_num, 1,11623sub_tmp, 1,116240, 0);11625if (r)11626return r;1162711628r = single_alu_op2(ctx, ALU_OP2_OR_INT,11629tmp_num, 2,11630tmp_num, 2,11631V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));11632if (r)11633return r;1163411635r = tgsi_endif(ctx);11636if (r)11637return r;11638}1163911640/* log2_denom is always <= 63, so manually peel the last loop11641* iteration.11642*/11643uint64_t denom_shl = (uint64_t)alu_denom_lo.value;11644r = single_alu_op2(ctx, ALU_OP1_MOV,11645treg, 2,11646V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),116470, 0);11648if (r)11649return r;1165011651r = single_alu_op2(ctx, ALU_OP1_MOV,11652treg, 3,11653V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),116540, 0);11655if (r)11656return r;1165711658r = emit_u64sge(ctx, sub_tmp,11659tmp_num, 0,11660treg, 2);11661if (r)11662return r;1166311664memset(&alu_src, 0, sizeof(alu_src));11665alu_src.sel = sub_tmp;11666alu_src.chan = 0;11667r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);11668if (r)11669return r;1167011671r = emit_u64add(ctx, ALU_OP2_SUB_INT,11672sub_tmp,11673tmp_num, 0,11674treg, 2);11675if (r)11676return r;1167711678r = single_alu_op2(ctx, ALU_OP2_OR_INT,11679tmp_num, 2,11680tmp_num, 2,11681V_SQ_ALU_SRC_LITERAL, 1U);11682if (r)11683return r;11684r = tgsi_endif(ctx);11685if (r)11686return r;1168711688memset(&alu, 0, sizeof(struct r600_bytecode_alu));11689alu.op = ALU_OP1_MOV;11690tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);11691alu.src[0].sel = tmp_num;11692alu.src[0].chan = 2;11693r = r600_bytecode_add_alu(ctx->bc, &alu);11694if (r)11695return r;1169611697memset(&alu, 0, sizeof(struct r600_bytecode_alu));11698alu.op = ALU_OP1_MOV;11699tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);11700alu.src[0].sel = tmp_num;11701alu.src[0].chan = 3;11702alu.last = 1;11703r = r600_bytecode_add_alu(ctx->bc, &alu);11704if (r)11705return r;11706return 0;11707}1170811709static int egcm_u64sne(struct r600_shader_ctx *ctx)11710{11711struct tgsi_full_instruction *inst = 
&ctx->parse.FullToken.FullInstruction;11712struct r600_bytecode_alu alu;11713int r;11714int treg = ctx->temp_reg;1171511716memset(&alu, 0, sizeof(struct r600_bytecode_alu));11717alu.op = ALU_OP2_SETNE_INT;11718alu.dst.sel = treg;11719alu.dst.chan = 0;11720alu.dst.write = 1;11721r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);11722r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);11723r = r600_bytecode_add_alu(ctx->bc, &alu);11724if (r)11725return r;1172611727memset(&alu, 0, sizeof(struct r600_bytecode_alu));11728alu.op = ALU_OP2_SETNE_INT;11729alu.dst.sel = treg;11730alu.dst.chan = 1;11731alu.dst.write = 1;11732r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);11733r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);11734alu.last = 1;11735r = r600_bytecode_add_alu(ctx->bc, &alu);11736if (r)11737return r;1173811739memset(&alu, 0, sizeof(struct r600_bytecode_alu));11740alu.op = ALU_OP2_OR_INT;11741tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);11742alu.src[0].sel = treg;11743alu.src[0].chan = 0;11744alu.src[1].sel = treg;11745alu.src[1].chan = 1;11746alu.last = 1;11747r = r600_bytecode_add_alu(ctx->bc, &alu);11748if (r)11749return r;11750return 0;11751}1175211753static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {11754[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_r600_arl},11755[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},11756[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},1175711758[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},1175911760[TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},11761[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},11762[TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},11763[TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},11764[TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},11765[TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},11766[TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},11767[TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},11768/* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN 
*/
	[TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2 },
	[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2 },
	[TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap },
	[TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2 },
	[TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3 },
	[TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp },
	[TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate },
	[21] = { ALU_OP0_NOP, tgsi_unsupported },
	[22] = { ALU_OP0_NOP, tgsi_unsupported },
	[23] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2 },
	[25] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2 },
	[TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2 },
	[TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate },
	[TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate },
	[TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow },
	[31] = { ALU_OP0_NOP, tgsi_unsupported },
	[32] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_unsupported },
	[34] = { ALU_OP0_NOP, tgsi_unsupported },
	[35] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig },
	[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex },
	[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex },
	[TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill }, /* unconditional kill */
	[TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported },
	[44] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2 },
	[46] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2 },
	[TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig },
	[TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap },
	[TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2 },
	[51] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex },
	[TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex },
	[TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex },
	[TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported },
	[59] = { ALU_OP0_NOP, tgsi_unsupported },
	[60] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_r600_arl },
	[62] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg },
	[TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp },
	[67] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex },
	[69] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp },
	[TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex },
	[TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont },
	[TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if },
	[TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif },
	[76] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else },
	[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif },
	[TGSI_OPCODE_DDX_FINE] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_DDY_FINE] = { ALU_OP0_NOP, tgsi_unsupported },
	[81] = { ALU_OP0_NOP, tgsi_unsupported },
	[82] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2 },
	[TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans },
	[TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2 },
	[TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2 },
	[TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2_trans },
	[88] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2 },
	[TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2 },
	[TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod },
	[TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2 },
	[93] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex },
	[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex },
	[TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont },
	[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit },
	[TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit },
	[TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop },
	[TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop },
	[TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported },
	[103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex },
	[TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex },
	[TGSI_OPCODE_RESQ] = { ALU_OP0_NOP, tgsi_unsupported },
	[106] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2 },
	[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2 },
	[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap },
	[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap },
	[TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_unsupported },
	[113] = { ALU_OP0_NOP, tgsi_unsupported },
	[114] = { ALU_OP0_NOP, tgsi_unsupported },
	[115] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill }, /* conditional kill */
	[TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end }, /* aka HALT */
	[TGSI_OPCODE_DFMA] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2_trans },
	[TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv },
	[TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2 },
	[TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2 },
	[TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg },
	[TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2 },
	[TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2_trans },
	[TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap },
	[TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans },
	[TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans },
	[TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2 },
	[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv },
	[TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad },
	[TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2 },
	[TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2 },
	[TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod },
	[TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans },
	[TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2 },
	[TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2 },
	[TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2_trans },
	[TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap },
	[TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2_swap },
	[TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported },
	/* the SAMPLE* family below is given a literal 0 opcode and is routed
	 * to tgsi_unsupported */
	[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported },
	[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported },
	[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported },
	[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported },
	[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported },
	[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported },
	[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported },
	[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported },
	[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported },
	[TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported },
	[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported },
	[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported },
	[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_r600_arl },
	[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp },
	[TGSI_OPCODE_IABS] = { 0, tgsi_iabs },
	[TGSI_OPCODE_ISSG] = { 0, tgsi_issg },
	[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported },
	[163] = { ALU_OP0_NOP, tgsi_unsupported },
	[164] = { ALU_OP0_NOP, tgsi_unsupported },
	[165] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex },
	[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex },
	[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex },
	[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans },
	[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans },
	/* the entries below name a hardware opcode but are still routed to
	 * tgsi_unsupported in this table */
	[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_unsupported },
	[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_unsupported },
	[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_unsupported },
	[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_unsupported },
	[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_unsupported },
	[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_unsupported },
	[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_unsupported },
	[TGSI_OPCODE_LSB] = {
ALU_OP1_FFBL_INT, tgsi_unsupported},11946[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_unsupported},11947[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_unsupported},11948[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_unsupported},11949[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_unsupported},11950[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_unsupported},11951[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},11952};1195311954static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {11955[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},11956[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},11957[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},11958[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},11959[TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},11960[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},11961[TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},11962[TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},11963[TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},11964[TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},11965[TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},11966[TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},11967[TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},11968[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},11969[TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},11970[TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},11971[TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},11972[TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},11973[TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},11974[TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},11975[21] = { ALU_OP0_NOP, tgsi_unsupported},11976[22] = { ALU_OP0_NOP, tgsi_unsupported},11977[23] = { ALU_OP0_NOP, tgsi_unsupported},11978[TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},11979[25] = { ALU_OP0_NOP, tgsi_unsupported},11980[TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},11981[TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},11982[TGSI_OPCODE_EX2] = { 
ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},11983[TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},11984[TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},11985[31] = { ALU_OP0_NOP, tgsi_unsupported},11986[32] = { ALU_OP0_NOP, tgsi_unsupported},11987[TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},11988[34] = { ALU_OP0_NOP, tgsi_unsupported},11989[35] = { ALU_OP0_NOP, tgsi_unsupported},11990[TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},11991[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},11992[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},11993[TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */11994[TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},11995[TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},11996[TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},11997[TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},11998[44] = { ALU_OP0_NOP, tgsi_unsupported},11999[TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},12000[46] = { ALU_OP0_NOP, tgsi_unsupported},12001[TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},12002[TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},12003[TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},12004[TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},12005[51] = { ALU_OP0_NOP, tgsi_unsupported},12006[TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},12007[TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},12008[TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},12009[TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},12010[TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},12011[TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},12012[TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},12013[59] = { ALU_OP0_NOP, tgsi_unsupported},12014[60] = { ALU_OP0_NOP, tgsi_unsupported},12015[TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},12016[62] = { ALU_OP0_NOP, tgsi_unsupported},12017[TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},12018[TGSI_OPCODE_RET] = { ALU_OP0_NOP, 
tgsi_unsupported},12019[TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},12020[TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},12021[67] = { ALU_OP0_NOP, tgsi_unsupported},12022[TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},12023[69] = { ALU_OP0_NOP, tgsi_unsupported},12024[TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},12025[TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},12026[TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},12027[TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},12028[TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},12029[TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},12030[76] = { ALU_OP0_NOP, tgsi_unsupported},12031[TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},12032[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},12033[TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},12034[TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},12035[82] = { ALU_OP0_NOP, tgsi_unsupported},12036[TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},12037[TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},12038[TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},12039[TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},12040[TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},12041[88] = { ALU_OP0_NOP, tgsi_unsupported},12042[TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},12043[TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},12044[TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},12045[TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},12046[93] = { ALU_OP0_NOP, tgsi_unsupported},12047[TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},12048[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},12049[TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},12050[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},12051[TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},12052[TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},12053[TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},12054[TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, 
tgsi_endloop},12055[TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},12056[103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},12057[TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},12058[TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},12059[106] = { ALU_OP0_NOP, tgsi_unsupported},12060[TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},12061[TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},12062[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},12063[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},12064[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},12065[TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},12066[113] = { ALU_OP0_NOP, tgsi_unsupported},12067[114] = { ALU_OP0_NOP, tgsi_unsupported},12068[115] = { ALU_OP0_NOP, tgsi_unsupported},12069[TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */12070[TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */12071/* Refer below for TGSI_OPCODE_DFMA */12072[TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i},12073[TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},12074[TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},12075[TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},12076[TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},12077[TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},12078[TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},12079[TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},12080[TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i},12081[TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},12082[TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},12083[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},12084[TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},12085[TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},12086[TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},12087[TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},12088[TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},12089[TGSI_OPCODE_USEQ] = { 
ALU_OP2_SETE_INT, tgsi_op2},12090[TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},12091[TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},12092[TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},12093[TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},12094[TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},12095[TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},12096[TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},12097[TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},12098[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},12099[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},12100[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},12101[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},12102[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},12103[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},12104[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},12105[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},12106[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},12107[TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},12108[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},12109[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},12110[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},12111[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},12112[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},12113[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},12114[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},12115[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},12116[163] = { ALU_OP0_NOP, tgsi_unsupported},12117[164] = { ALU_OP0_NOP, tgsi_unsupported},12118[165] = { ALU_OP0_NOP, tgsi_unsupported},12119[TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},12120[TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},12121[TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},12122[TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},12123[TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},12124[TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, 
tgsi_atomic_op},12125[TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},12126[TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},12127[TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},12128[TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},12129[TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},12130[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},12131[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},12132[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},12133[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},12134[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},12135[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},12136[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},12137[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},12138[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},12139[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},12140[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},12141[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},12142[TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},12143[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},12144[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},12145[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},12146[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},12147[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},12148[TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},12149[TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},12150[TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},12151[TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},12152[TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},12153[TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},12154[TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },12155[TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},12156[TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},12157[TGSI_OPCODE_DSLT] 
= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},12158[TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},12159[TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},12160[TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},12161[TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},12162[TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},12163[TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},12164[TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},12165[TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},12166[TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},12167[TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},12168[TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},12169[TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},12170[TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},12171[TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},12172[TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},12173[TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },12174[TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },12175[TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },12176[TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },12177[TGSI_OPCODE_I64NEG] = { ALU_OP0_NOP, egcm_i64neg },12178[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},12179};1218012181static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {12182[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},12183[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},12184[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},12185[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},12186[TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},12187[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},12188[TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},12189[TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},12190[TGSI_OPCODE_ADD] = { ALU_OP2_ADD, 
tgsi_op2},12191[TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},12192[TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},12193[TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},12194[TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},12195[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},12196[TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},12197[TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},12198[TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},12199[TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},12200[TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},12201[TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},12202[21] = { ALU_OP0_NOP, tgsi_unsupported},12203[22] = { ALU_OP0_NOP, tgsi_unsupported},12204[23] = { ALU_OP0_NOP, tgsi_unsupported},12205[TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},12206[25] = { ALU_OP0_NOP, tgsi_unsupported},12207[TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},12208[TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},12209[TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},12210[TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},12211[TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},12212[31] = { ALU_OP0_NOP, tgsi_unsupported},12213[32] = { ALU_OP0_NOP, tgsi_unsupported},12214[TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},12215[34] = { ALU_OP0_NOP, tgsi_unsupported},12216[35] = { ALU_OP0_NOP, tgsi_unsupported},12217[TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},12218[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},12219[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},12220[TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */12221[TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},12222[TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},12223[TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},12224[TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},12225[44] = { ALU_OP0_NOP, tgsi_unsupported},12226[TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},12227[46] = { ALU_OP0_NOP, 
tgsi_unsupported},12228[TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},12229[TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},12230[TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},12231[TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},12232[51] = { ALU_OP0_NOP, tgsi_unsupported},12233[TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},12234[TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},12235[TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},12236[TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},12237[TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},12238[TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},12239[TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},12240[59] = { ALU_OP0_NOP, tgsi_unsupported},12241[60] = { ALU_OP0_NOP, tgsi_unsupported},12242[TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},12243[62] = { ALU_OP0_NOP, tgsi_unsupported},12244[TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},12245[TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},12246[TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},12247[TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},12248[67] = { ALU_OP0_NOP, tgsi_unsupported},12249[TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},12250[69] = { ALU_OP0_NOP, tgsi_unsupported},12251[TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},12252[TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},12253[TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},12254[TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},12255[TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},12256[TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},12257[76] = { ALU_OP0_NOP, tgsi_unsupported},12258[TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},12259[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},12260[TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},12261[TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},12262[82] = { ALU_OP0_NOP, tgsi_unsupported},12263[TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},12264[TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, 
tgsi_op2},12265[TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},12266[TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},12267[TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},12268[88] = { ALU_OP0_NOP, tgsi_unsupported},12269[TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},12270[TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},12271[TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},12272[TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},12273[93] = { ALU_OP0_NOP, tgsi_unsupported},12274[TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},12275[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},12276[TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},12277[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},12278[TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},12279[TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},12280[TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},12281[TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},12282[TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},12283[103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},12284[TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},12285[TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},12286[106] = { ALU_OP0_NOP, tgsi_unsupported},12287[TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},12288[TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},12289[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},12290[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},12291[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},12292[TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},12293[113] = { ALU_OP0_NOP, tgsi_unsupported},12294[114] = { ALU_OP0_NOP, tgsi_unsupported},12295[115] = { ALU_OP0_NOP, tgsi_unsupported},12296[TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */12297[TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */12298/* Refer below for TGSI_OPCODE_DFMA */12299[TGSI_OPCODE_F2I] = { 
ALU_OP1_FLT_TO_INT, tgsi_op2},12300[TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},12301[TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},12302[TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},12303[TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},12304[TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},12305[TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},12306[TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},12307[TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},12308[TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},12309[TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},12310[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},12311[TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},12312[TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},12313[TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},12314[TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},12315[TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},12316[TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},12317[TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},12318[TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},12319[TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},12320[TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},12321[TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},12322[TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},12323[TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},12324[TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},12325[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},12326[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},12327[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},12328[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},12329[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},12330[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},12331[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},12332[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},12333[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},12334[TGSI_OPCODE_SVIEWINFO] = { 0, 
tgsi_unsupported},12335[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},12336[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},12337[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},12338[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},12339[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},12340[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},12341[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},12342[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},12343[163] = { ALU_OP0_NOP, tgsi_unsupported},12344[164] = { ALU_OP0_NOP, tgsi_unsupported},12345[165] = { ALU_OP0_NOP, tgsi_unsupported},12346[TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},12347[TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},12348[TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},12349[TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},12350[TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},12351[TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},12352[TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},12353[TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},12354[TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},12355[TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},12356[TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},12357[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},12358[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},12359[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},12360[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},12361[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},12362[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},12363[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},12364[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},12365[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},12366[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},12367[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, 
tgsi_op2},12368[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},12369[TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},12370[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},12371[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},12372[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},12373[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},12374[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},12375[TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},12376[TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},12377[TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},12378[TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},12379[TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},12380[TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},12381[TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },12382[TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},12383[TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},12384[TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},12385[TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},12386[TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},12387[TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},12388[TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},12389[TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},12390[TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},12391[TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},12392[TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},12393[TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},12394[TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},12395[TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},12396[TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},12397[TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},12398[TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, 
egcm_int_to_double},12399[TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},12400[TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },12401[TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },12402[TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },12403[TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },12404[TGSI_OPCODE_I64NEG] = { ALU_OP0_NOP, egcm_i64neg },12405[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},12406};124071240812409