Path: blob/21.2-virgl/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
4574 views
/*1* Copyright 2009 Nicolai Hähnle <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* on the rights to use, copy, modify, merge, publish, distribute, sub7* license, and/or sell copies of the Software, and to permit persons to whom8* the Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL17* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,18* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR19* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE20* USE OR OTHER DEALINGS IN THE SOFTWARE. */2122#include "radeon_compiler.h"2324#include <stdio.h>2526#include "r300_reg.h"2728#include "radeon_compiler_util.h"29#include "radeon_dataflow.h"30#include "radeon_program.h"31#include "radeon_program_alu.h"32#include "radeon_swizzle.h"33#include "radeon_emulate_branches.h"34#include "radeon_emulate_loops.h"35#include "radeon_remove_constants.h"3637#include "util/compiler.h"3839/*40* Take an already-setup and valid source then swizzle it appropriately to41* obtain a constant ZERO or ONE source.42*/43#define __CONST(x, y) \44(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \45t_swizzle(y), \46t_swizzle(y), \47t_swizzle(y), \48t_swizzle(y), \49t_src_class(vpi->SrcReg[x].File), \50RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4))515253static unsigned long t_dst_mask(unsigned int mask)54{55/* RC_MASK_* is equivalent to VSF_FLAG_* */56return mask & RC_MASK_XYZW;57}5859static unsigned long t_dst_class(rc_register_file file)60{61switch (file) {62default:63fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);64FALLTHROUGH;65case RC_FILE_TEMPORARY:66return PVS_DST_REG_TEMPORARY;67case RC_FILE_OUTPUT:68return PVS_DST_REG_OUT;69case RC_FILE_ADDRESS:70return PVS_DST_REG_A0;71}72}7374static unsigned long t_dst_index(struct r300_vertex_program_code *vp,75struct rc_dst_register *dst)76{77if (dst->File == RC_FILE_OUTPUT)78return vp->outputs[dst->Index];7980return dst->Index;81}8283static unsigned long t_src_class(rc_register_file file)84{85switch (file) {86default:87fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);88FALLTHROUGH;89case RC_FILE_NONE:90case RC_FILE_TEMPORARY:91return PVS_SRC_REG_TEMPORARY;92case RC_FILE_INPUT:93return PVS_SRC_REG_INPUT;94case RC_FILE_CONSTANT:95return PVS_SRC_REG_CONSTANT;96}97}9899static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)100{101unsigned long aclass = t_src_class(a.File);102unsigned long bclass = t_src_class(b.File);103104if (aclass != bclass)105return 0;106if (aclass == PVS_SRC_REG_TEMPORARY)107return 0;108109if (a.RelAddr || b.RelAddr)110return 1;111if (a.Index != b.Index)112return 1;113114return 0;115}116117static inline unsigned long t_swizzle(unsigned int swizzle)118{119/* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */120return swizzle;121}122123static unsigned long t_src_index(struct r300_vertex_program_code *vp,124struct rc_src_register *src)125{126if (src->File == RC_FILE_INPUT) {127assert(vp->inputs[src->Index] != -1);128return vp->inputs[src->Index];129} else {130if (src->Index < 0) {131fprintf(stderr,132"negative offsets for indirect addressing do not work.\n");133return 0;134}135return src->Index;136}137}138139/* these two functions should probably be merged... */140141static unsigned long t_src(struct r300_vertex_program_code *vp,142struct rc_src_register *src)143{144/* src->Negate uses the RC_MASK_ flags from program_instruction.h,145* which equal our VSF_FLAGS_ values, so it's safe to just pass it here.146*/147return PVS_SRC_OPERAND(t_src_index(vp, src),148t_swizzle(GET_SWZ(src->Swizzle, 0)),149t_swizzle(GET_SWZ(src->Swizzle, 1)),150t_swizzle(GET_SWZ(src->Swizzle, 2)),151t_swizzle(GET_SWZ(src->Swizzle, 3)),152t_src_class(src->File),153src->Negate) |154(src->RelAddr << 4) | (src->Abs << 3);155}156157static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,158struct rc_src_register *src)159{160/* src->Negate uses the RC_MASK_ flags from program_instruction.h,161* which equal our VSF_FLAGS_ values, so it's safe to just pass it here.162*/163unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);164165return PVS_SRC_OPERAND(t_src_index(vp, src),166t_swizzle(swz),167t_swizzle(swz),168t_swizzle(swz),169t_swizzle(swz),170t_src_class(src->File),171src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |172(src->RelAddr << 4) | (src->Abs << 3);173}174175static int valid_dst(struct r300_vertex_program_code *vp,176struct rc_dst_register *dst)177{178if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {179return 0;180} else if (dst->File == RC_FILE_ADDRESS) {181assert(dst->Index == 0);182}183184return 1;185}186187static void ei_vector1(struct r300_vertex_program_code *vp,188unsigned int hw_opcode,189struct rc_sub_instruction *vpi,190unsigned int * inst)191{192inst[0] = PVS_OP_DST_OPERAND(hw_opcode,1930,1940,195t_dst_index(vp, &vpi->DstReg),196t_dst_mask(vpi->DstReg.WriteMask),197t_dst_class(vpi->DstReg.File),198vpi->SaturateMode == RC_SATURATE_ZERO_ONE);199inst[1] = t_src(vp, &vpi->SrcReg[0]);200inst[2] = __CONST(0, RC_SWIZZLE_ZERO);201inst[3] = __CONST(0, RC_SWIZZLE_ZERO);202}203204static void ei_vector2(struct r300_vertex_program_code *vp,205unsigned int hw_opcode,206struct rc_sub_instruction *vpi,207unsigned int * inst)208{209inst[0] = PVS_OP_DST_OPERAND(hw_opcode,2100,2110,212t_dst_index(vp, &vpi->DstReg),213t_dst_mask(vpi->DstReg.WriteMask),214t_dst_class(vpi->DstReg.File),215vpi->SaturateMode == RC_SATURATE_ZERO_ONE);216inst[1] = t_src(vp, &vpi->SrcReg[0]);217inst[2] = t_src(vp, &vpi->SrcReg[1]);218inst[3] = __CONST(1, RC_SWIZZLE_ZERO);219}220221static void ei_math1(struct r300_vertex_program_code *vp,222unsigned int hw_opcode,223struct rc_sub_instruction *vpi,224unsigned int * inst)225{226inst[0] = PVS_OP_DST_OPERAND(hw_opcode,2271,2280,229t_dst_index(vp, &vpi->DstReg),230t_dst_mask(vpi->DstReg.WriteMask),231t_dst_class(vpi->DstReg.File),232vpi->SaturateMode == RC_SATURATE_ZERO_ONE);233inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);234inst[2] = __CONST(0, RC_SWIZZLE_ZERO);235inst[3] = __CONST(0, RC_SWIZZLE_ZERO);236}237238static void ei_lit(struct r300_vertex_program_code *vp,239struct rc_sub_instruction *vpi,240unsigned int * inst)241{242//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}243244inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,2451,2460,247t_dst_index(vp, &vpi->DstReg),248t_dst_mask(vpi->DstReg.WriteMask),249t_dst_class(vpi->DstReg.File),250vpi->SaturateMode == RC_SATURATE_ZERO_ONE);251/* NOTE: Users swizzling might not work. */252inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X253t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W254PVS_SRC_SELECT_FORCE_0, // Z255t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y256t_src_class(vpi->SrcReg[0].File),257vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |258(vpi->SrcReg[0].RelAddr << 4);259inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y260t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W261PVS_SRC_SELECT_FORCE_0, // Z262t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X263t_src_class(vpi->SrcReg[0].File),264vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |265(vpi->SrcReg[0].RelAddr << 4);266inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y267t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X268PVS_SRC_SELECT_FORCE_0, // Z269t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W270t_src_class(vpi->SrcReg[0].File),271vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |272(vpi->SrcReg[0].RelAddr << 4);273}274275static void ei_mad(struct r300_vertex_program_code *vp,276struct rc_sub_instruction *vpi,277unsigned int * inst)278{279unsigned int i;280/* Remarks about hardware limitations of MAD281* (please preserve this comment, as this information is _NOT_282* in the documentation provided by AMD).283*284* As described in the documentation, MAD with three unique temporary285* source registers requires the use of the macro version.286*287* However (and this is not mentioned in the documentation), apparently288* the macro version is _NOT_ a full superset of the normal version.289* In particular, the macro version does not always work when relative290* addressing is used in the source operands.291*292* This limitation caused incorrect rendering in Sauerbraten's OpenGL293* assembly shader path when using medium quality animations294* (i.e. animations with matrix blending instead of quaternion blending).295*296* Unfortunately, I (nha) have been unable to extract a Piglit regression297* test for this issue - for some reason, it is possible to have vertex298* programs whose prefix is *exactly* the same as the prefix of the299* offending program in Sauerbraten up to the offending instruction300* without causing any trouble.301*302* Bottom line: Only use the macro version only when really necessary;303* according to AMD docs, this should improve performance by one clock304* as a nice side bonus.305*/306if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&307vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&308vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&309vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&310vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&311vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {312inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,3130,3141,315t_dst_index(vp, &vpi->DstReg),316t_dst_mask(vpi->DstReg.WriteMask),317t_dst_class(vpi->DstReg.File),318vpi->SaturateMode == RC_SATURATE_ZERO_ONE);319} else {320inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,3210,3220,323t_dst_index(vp, &vpi->DstReg),324t_dst_mask(vpi->DstReg.WriteMask),325t_dst_class(vpi->DstReg.File),326vpi->SaturateMode == RC_SATURATE_ZERO_ONE);327328/* Arguments with constant swizzles still count as a unique329* temporary, so we should make sure these arguments share a330* register index with one of the other arguments. */331for (i = 0; i < 3; i++) {332unsigned int j;333if (vpi->SrcReg[i].File != RC_FILE_NONE)334continue;335336for (j = 0; j < 3; j++) {337if (i != j) {338vpi->SrcReg[i].Index =339vpi->SrcReg[j].Index;340break;341}342}343}344}345inst[1] = t_src(vp, &vpi->SrcReg[0]);346inst[2] = t_src(vp, &vpi->SrcReg[1]);347inst[3] = t_src(vp, &vpi->SrcReg[2]);348}349350static void ei_pow(struct r300_vertex_program_code *vp,351struct rc_sub_instruction *vpi,352unsigned int * inst)353{354inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,3551,3560,357t_dst_index(vp, &vpi->DstReg),358t_dst_mask(vpi->DstReg.WriteMask),359t_dst_class(vpi->DstReg.File),360vpi->SaturateMode == RC_SATURATE_ZERO_ONE);361inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);362inst[2] = __CONST(0, RC_SWIZZLE_ZERO);363inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);364}365366static void translate_vertex_program(struct radeon_compiler *c, void *user)367{368struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;369struct rc_instruction *rci;370371unsigned loops[R500_PVS_MAX_LOOP_DEPTH];372unsigned loop_depth = 0;373374compiler->code->pos_end = 0; /* Not supported yet */375compiler->code->length = 0;376compiler->code->num_temporaries = 0;377378compiler->SetHwInputOutput(compiler);379380for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {381struct rc_sub_instruction *vpi = &rci->U.I;382unsigned int *inst = compiler->code->body.d + compiler->code->length;383const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);384385/* Skip instructions writing to non-existing destination */386if (!valid_dst(compiler->code, &vpi->DstReg))387continue;388389if (info->HasDstReg) {390/* Neither is Saturate. */391if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {392rc_error(&compiler->Base, "Vertex program does not support the Saturate "393"modifier (yet).\n");394}395}396397if (compiler->code->length >= c->max_alu_insts * 4) {398rc_error(&compiler->Base, "Vertex program has too many instructions\n");399return;400}401402assert(compiler->Base.is_r500 ||403(vpi->Opcode != RC_OPCODE_SEQ &&404vpi->Opcode != RC_OPCODE_SNE));405406switch (vpi->Opcode) {407case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;408case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;409case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;410case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;411case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;412case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;413case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;414case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;415case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;416case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;417case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;418case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;419case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;420case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;421case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;422case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;423case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;424case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;425case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;426case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;427case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;428case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;429case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;430case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;431case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;432case RC_OPCODE_BGNLOOP:433{434if ((!compiler->Base.is_r500435&& loop_depth >= R300_VS_MAX_LOOP_DEPTH)436|| loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {437rc_error(&compiler->Base,438"Loops are nested too deep.");439return;440}441loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;442break;443}444case RC_OPCODE_ENDLOOP:445{446unsigned int act_addr;447unsigned int last_addr;448unsigned int ret_addr;449450ret_addr = loops[--loop_depth];451act_addr = ret_addr - 1;452last_addr = (compiler->code->length / 4) - 1;453454if (loop_depth >= R300_VS_MAX_FC_OPS) {455rc_error(&compiler->Base,456"Too many flow control instructions.");457return;458}459if (compiler->Base.is_r500) {460compiler->code->fc_op_addrs.r500461[compiler->code->num_fc_ops].lw =462R500_PVS_FC_ACT_ADRS(act_addr)463| R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)464;465compiler->code->fc_op_addrs.r500466[compiler->code->num_fc_ops].uw =467R500_PVS_FC_LAST_INST(last_addr)468| R500_PVS_FC_RTN_INST(ret_addr)469;470} else {471compiler->code->fc_op_addrs.r300472[compiler->code->num_fc_ops] =473R300_PVS_FC_ACT_ADRS(act_addr)474| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)475| R300_PVS_FC_LAST_INST(last_addr)476| R300_PVS_FC_RTN_INST(ret_addr)477;478}479compiler->code->fc_loop_index[compiler->code->num_fc_ops] =480R300_PVS_FC_LOOP_INIT_VAL(0x0)481| R300_PVS_FC_LOOP_STEP_VAL(0x1)482;483compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(484compiler->code->num_fc_ops);485compiler->code->num_fc_ops++;486487break;488}489490case RC_ME_PRED_SET_CLR:491ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);492break;493494case RC_ME_PRED_SET_INV:495ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);496break;497498case RC_ME_PRED_SET_POP:499ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);500break;501502case RC_ME_PRED_SET_RESTORE:503ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);504break;505506case RC_ME_PRED_SEQ:507ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);508break;509510case RC_ME_PRED_SNEQ:511ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);512break;513514case RC_VE_PRED_SNEQ_PUSH:515ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,516vpi, inst);517break;518519default:520rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);521return;522}523524if (vpi->DstReg.Pred != RC_PRED_DISABLED) {525inst[0] |= (PVS_DST_PRED_ENABLE_MASK526<< PVS_DST_PRED_ENABLE_SHIFT);527if (vpi->DstReg.Pred == RC_PRED_SET) {528inst[0] |= (PVS_DST_PRED_SENSE_MASK529<< PVS_DST_PRED_SENSE_SHIFT);530}531}532533/* Update the number of temporaries. */534if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&535vpi->DstReg.Index >= compiler->code->num_temporaries)536compiler->code->num_temporaries = vpi->DstReg.Index + 1;537538for (unsigned i = 0; i < info->NumSrcRegs; i++)539if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&540vpi->SrcReg[i].Index >= compiler->code->num_temporaries)541compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;542543if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {544rc_error(&compiler->Base, "Too many temporaries.\n");545return;546}547548compiler->code->length += 4;549550if (compiler->Base.Error)551return;552}553}554555struct temporary_allocation {556unsigned int Allocated:1;557unsigned int HwTemp:15;558struct rc_instruction * LastRead;559};560561static void allocate_temporary_registers(struct radeon_compiler *c, void *user)562{563struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;564struct rc_instruction *inst;565struct rc_instruction *end_loop = NULL;566unsigned int num_orig_temps = 0;567char hwtemps[RC_REGISTER_MAX_INDEX];568struct temporary_allocation * ta;569unsigned int i, j;570571memset(hwtemps, 0, sizeof(hwtemps));572573rc_recompute_ips(c);574575/* Pass 1: Count original temporaries. */576for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {577const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);578579for (i = 0; i < opcode->NumSrcRegs; ++i) {580if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {581if (inst->U.I.SrcReg[i].Index >= num_orig_temps)582num_orig_temps = inst->U.I.SrcReg[i].Index + 1;583}584}585586if (opcode->HasDstReg) {587if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {588if (inst->U.I.DstReg.Index >= num_orig_temps)589num_orig_temps = inst->U.I.DstReg.Index + 1;590}591}592}593594ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,595sizeof(struct temporary_allocation) * num_orig_temps);596memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);597598/* Pass 2: Determine original temporary lifetimes */599for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {600const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);601/* Instructions inside of loops need to use the ENDLOOP602* instruction as their LastRead. */603if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {604int endloops = 1;605struct rc_instruction * ptr;606for(ptr = inst->Next;607ptr != &compiler->Base.Program.Instructions;608ptr = ptr->Next){609if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {610endloops++;611} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {612endloops--;613if (endloops <= 0) {614end_loop = ptr;615break;616}617}618}619}620621if (inst == end_loop) {622end_loop = NULL;623continue;624}625626for (i = 0; i < opcode->NumSrcRegs; ++i) {627if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {628ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;629}630}631}632633/* Pass 3: Register allocation */634for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {635const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);636637for (i = 0; i < opcode->NumSrcRegs; ++i) {638if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {639unsigned int orig = inst->U.I.SrcReg[i].Index;640inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;641642if (ta[orig].Allocated && inst == ta[orig].LastRead)643hwtemps[ta[orig].HwTemp] = 0;644}645}646647if (opcode->HasDstReg) {648if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {649unsigned int orig = inst->U.I.DstReg.Index;650651if (!ta[orig].Allocated) {652for(j = 0; j < c->max_temp_regs; ++j) {653if (!hwtemps[j])654break;655}656ta[orig].Allocated = 1;657ta[orig].HwTemp = j;658hwtemps[ta[orig].HwTemp] = 1;659}660661inst->U.I.DstReg.Index = ta[orig].HwTemp;662}663}664}665}666667/**668* R3xx-R4xx vertex engine does not support the Absolute source operand modifier669* and the Saturate opcode modifier. Only Absolute is currently transformed.670*/671static int transform_nonnative_modifiers(672struct radeon_compiler *c,673struct rc_instruction *inst,674void* unused)675{676const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);677unsigned i;678679/* Transform ABS(a) to MAX(a, -a). */680for (i = 0; i < opcode->NumSrcRegs; i++) {681if (inst->U.I.SrcReg[i].Abs) {682struct rc_instruction *new_inst;683unsigned temp;684685inst->U.I.SrcReg[i].Abs = 0;686687temp = rc_find_free_temporary(c);688689new_inst = rc_insert_new_instruction(c, inst->Prev);690new_inst->U.I.Opcode = RC_OPCODE_MAX;691new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;692new_inst->U.I.DstReg.Index = temp;693new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];694new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];695new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;696697memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));698inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;699inst->U.I.SrcReg[i].Index = temp;700inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;701}702}703return 1;704}705706/**707* Vertex engine cannot read two inputs or two constants at the same time.708* Introduce intermediate MOVs to temporary registers to account for this.709*/710static int transform_source_conflicts(711struct radeon_compiler *c,712struct rc_instruction* inst,713void* unused)714{715const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);716717if (opcode->NumSrcRegs == 3) {718if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])719|| t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {720int tmpreg = rc_find_free_temporary(c);721struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);722inst_mov->U.I.Opcode = RC_OPCODE_MOV;723inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;724inst_mov->U.I.DstReg.Index = tmpreg;725inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];726727reset_srcreg(&inst->U.I.SrcReg[2]);728inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;729inst->U.I.SrcReg[2].Index = tmpreg;730}731}732733if (opcode->NumSrcRegs >= 2) {734if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {735int tmpreg = rc_find_free_temporary(c);736struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);737inst_mov->U.I.Opcode = RC_OPCODE_MOV;738inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;739inst_mov->U.I.DstReg.Index = tmpreg;740inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];741742reset_srcreg(&inst->U.I.SrcReg[1]);743inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;744inst->U.I.SrcReg[1].Index = tmpreg;745}746}747748return 1;749}750751static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)752{753struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;754int i;755756for(i = 0; i < 32; ++i) {757if ((compiler->RequiredOutputs & (1 << i)) &&758!(compiler->Base.Program.OutputsWritten & (1 << i))) {759struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);760inst->U.I.Opcode = RC_OPCODE_MOV;761762inst->U.I.DstReg.File = RC_FILE_OUTPUT;763inst->U.I.DstReg.Index = i;764inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;765766inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;767inst->U.I.SrcReg[0].Index = 0;768inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;769770compiler->Base.Program.OutputsWritten |= 1 << i;771}772}773}774775static void dataflow_outputs_mark_used(void * userdata, void * data,776void (*callback)(void *, unsigned int, unsigned int))777{778struct r300_vertex_program_compiler * c = userdata;779int i;780781for(i = 0; i < 32; ++i) {782if (c->RequiredOutputs & (1 << i))783callback(data, i, RC_MASK_XYZW);784}785}786787static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)788{789(void) opcode;790(void) reg;791792return 1;793}794795static void transform_negative_addressing(struct r300_vertex_program_compiler *c,796struct rc_instruction *arl,797struct rc_instruction *end,798int min_offset)799{800struct rc_instruction *inst, *add;801unsigned const_swizzle;802803/* Transform ARL/ARR */804add = rc_insert_new_instruction(&c->Base, arl->Prev);805add->U.I.Opcode = RC_OPCODE_ADD;806add->U.I.DstReg.File = RC_FILE_TEMPORARY;807add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);808add->U.I.DstReg.WriteMask = RC_MASK_X;809add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];810add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;811add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,812min_offset, &const_swizzle);813add->U.I.SrcReg[1].Swizzle = const_swizzle;814815arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;816arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;817arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;818819/* Rewrite offsets up to and excluding inst. */820for (inst = arl->Next; inst != end; inst = inst->Next) {821const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);822823for (unsigned i = 0; i < opcode->NumSrcRegs; i++)824if (inst->U.I.SrcReg[i].RelAddr)825inst->U.I.SrcReg[i].Index -= min_offset;826}827}828829static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)830{831struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;832struct rc_instruction *inst, *lastARL = NULL;833int min_offset = 0;834835for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {836const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);837838if (inst->U.I.Opcode == RC_OPCODE_ARL || inst->U.I.Opcode == RC_OPCODE_ARR) {839if (lastARL != NULL && min_offset < 0)840transform_negative_addressing(c, lastARL, inst, min_offset);841842lastARL = inst;843min_offset = 0;844continue;845}846847for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {848if (inst->U.I.SrcReg[i].RelAddr &&849inst->U.I.SrcReg[i].Index < 0) {850/* ARL must precede any indirect addressing. */851if (!lastARL) {852rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL/ARR.");853return;854}855856if (inst->U.I.SrcReg[i].Index < min_offset)857min_offset = inst->U.I.SrcReg[i].Index;858}859}860}861862if (lastARL != NULL && min_offset < 0)863transform_negative_addressing(c, lastARL, inst, min_offset);864}865866struct rc_swizzle_caps r300_vertprog_swizzle_caps = {867.IsNative = &swizzle_is_native,868.Split = 0 /* should never be called */869};870871void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)872{873int is_r500 = c->Base.is_r500;874int opt = !c->Base.disable_optimizations;875876/* Lists of instruction transformations. */877struct radeon_program_transformation alu_rewrite_r500[] = {878{ &r300_transform_vertex_alu, 0 },879{ &r300_transform_trig_scale_vertex, 0 },880{ 0, 0 }881};882883struct radeon_program_transformation alu_rewrite_r300[] = {884{ &r300_transform_vertex_alu, 0 },885{ &r300_transform_trig_simple, 0 },886{ 0, 0 }887};888889/* Note: These passes have to be done seperately from ALU rewrite,890* otherwise non-native ALU instructions with source conflits891* or non-native modifiers will not be treated properly.892*/893struct radeon_program_transformation emulate_modifiers[] = {894{ &transform_nonnative_modifiers, 0 },895{ 0, 0 }896};897898struct radeon_program_transformation resolve_src_conflicts[] = {899{ &transform_source_conflicts, 0 },900{ 0, 0 }901};902903/* List of compiler passes. */904struct radeon_compiler_pass vs_list[] = {905/* NAME DUMP PREDICATE FUNCTION PARAM */906{"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL},907{"emulate branches", 1, !is_r500, rc_emulate_branches, NULL},908{"emulate negative addressing", 1, 1, rc_emulate_negative_addressing, NULL},909{"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500},910{"native rewrite", 1, !is_r500, rc_local_transform, alu_rewrite_r300},911{"emulate modifiers", 1, !is_r500, rc_local_transform, emulate_modifiers},912{"deadcode", 1, opt, rc_dataflow_deadcode, dataflow_outputs_mark_used},913{"dataflow optimize", 1, opt, rc_optimize, NULL},914/* This pass must be done after optimizations. */915{"source conflict resolve", 1, 1, rc_local_transform, resolve_src_conflicts},916{"register allocation", 1, opt, allocate_temporary_registers, NULL},917{"dead constants", 1, 1, rc_remove_unused_constants, &c->code->constants_remap_table},918{"lower control flow opcodes", 1, is_r500, rc_vert_fc, NULL},919{"final code validation", 0, 1, rc_validate_final_shader, NULL},920{"machine code generation", 0, 1, translate_vertex_program, NULL},921{"dump machine code", 0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump, NULL},922{NULL, 0, 0, NULL, NULL}923};924925c->Base.type = RC_VERTEX_PROGRAM;926c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;927928rc_run_compiler(&c->Base, vs_list);929930c->code->InputsRead = c->Base.Program.InputsRead;931c->code->OutputsWritten = c->Base.Program.OutputsWritten;932rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);933}934935936