Path: blob/21.2-virgl/src/gallium/drivers/r300/compiler/radeon_program_alu.c
4574 views
/*1* Copyright (C) 2008 Nicolai Haehnle.2*3* All Rights Reserved.4*5* Permission is hereby granted, free of charge, to any person obtaining6* a copy of this software and associated documentation files (the7* "Software"), to deal in the Software without restriction, including8* without limitation the rights to use, copy, modify, merge, publish,9* distribute, sublicense, and/or sell copies of the Software, and to10* permit persons to whom the Software is furnished to do so, subject to11* the following conditions:12*13* The above copyright notice and this permission notice (including the14* next paragraph) shall be included in all copies or substantial15* portions of the Software.16*17* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,18* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF19* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.20* IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE21* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION22* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION23* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.24*25*/2627/**28* @file29*30* Shareable transformations that transform "special" ALU instructions31* into ALU instructions that are supported by hardware.32*33*/3435#include "radeon_program_alu.h"3637#include "radeon_compiler.h"38#include "radeon_compiler_util.h"394041static struct rc_instruction *emit1(42struct radeon_compiler * c, struct rc_instruction * after,43rc_opcode Opcode, struct rc_sub_instruction * base,44struct rc_dst_register DstReg, struct rc_src_register SrcReg)45{46struct rc_instruction *fpi = rc_insert_new_instruction(c, after);4748if (base) {49memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));50}5152fpi->U.I.Opcode = Opcode;53fpi->U.I.DstReg = DstReg;54fpi->U.I.SrcReg[0] = SrcReg;55return fpi;56}5758static struct rc_instruction *emit2(59struct radeon_compiler * c, struct rc_instruction * after,60rc_opcode Opcode, struct rc_sub_instruction * base,61struct rc_dst_register DstReg,62struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)63{64struct rc_instruction *fpi = rc_insert_new_instruction(c, after);6566if (base) {67memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));68}6970fpi->U.I.Opcode = Opcode;71fpi->U.I.DstReg = DstReg;72fpi->U.I.SrcReg[0] = SrcReg0;73fpi->U.I.SrcReg[1] = SrcReg1;74return fpi;75}7677static struct rc_instruction *emit3(78struct radeon_compiler * c, struct rc_instruction * after,79rc_opcode Opcode, struct rc_sub_instruction * base,80struct rc_dst_register DstReg,81struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,82struct rc_src_register SrcReg2)83{84struct rc_instruction *fpi = rc_insert_new_instruction(c, after);8586if (base) {87memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));88}8990fpi->U.I.Opcode = Opcode;91fpi->U.I.DstReg = DstReg;92fpi->U.I.SrcReg[0] = SrcReg0;93fpi->U.I.SrcReg[1] = SrcReg1;94fpi->U.I.SrcReg[2] = SrcReg2;95return fpi;96}9798static struct rc_dst_register dstregtmpmask(int index, int mask)99{100struct rc_dst_register dst = {0, 0, 0};101dst.File = RC_FILE_TEMPORARY;102dst.Index = index;103dst.WriteMask = mask;104return dst;105}106107static const struct rc_src_register builtin_zero = {108.File = RC_FILE_NONE,109.Index = 0,110.Swizzle = RC_SWIZZLE_0000111};112static const struct rc_src_register builtin_one = {113.File = RC_FILE_NONE,114.Index = 0,115.Swizzle = RC_SWIZZLE_1111116};117118static const struct rc_src_register builtin_half = {119.File = RC_FILE_NONE,120.Index = 0,121.Swizzle = RC_SWIZZLE_HHHH122};123124static const struct rc_src_register srcreg_undefined = {125.File = RC_FILE_NONE,126.Index = 0,127.Swizzle = RC_SWIZZLE_XYZW128};129130static struct rc_src_register srcreg(int file, int index)131{132struct rc_src_register src = srcreg_undefined;133src.File = file;134src.Index = index;135return src;136}137138static struct rc_src_register srcregswz(int file, int index, int swz)139{140struct rc_src_register src = srcreg_undefined;141src.File = file;142src.Index = index;143src.Swizzle = swz;144return src;145}146147static struct rc_src_register absolute(struct rc_src_register reg)148{149struct rc_src_register newreg = reg;150newreg.Abs = 1;151newreg.Negate = RC_MASK_NONE;152return newreg;153}154155static struct rc_src_register negate(struct rc_src_register reg)156{157struct rc_src_register newreg = reg;158newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;159return newreg;160}161162static struct rc_src_register swizzle(struct rc_src_register reg,163rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)164{165struct rc_src_register swizzled = reg;166swizzled.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);167return swizzled;168}169170static struct rc_src_register swizzle_smear(struct rc_src_register reg,171rc_swizzle x)172{173return swizzle(reg, x, x, x, x);174}175176static struct rc_src_register swizzle_xxxx(struct rc_src_register reg)177{178return swizzle_smear(reg, RC_SWIZZLE_X);179}180181static struct rc_src_register swizzle_yyyy(struct rc_src_register reg)182{183return swizzle_smear(reg, RC_SWIZZLE_Y);184}185186static struct rc_src_register swizzle_zzzz(struct rc_src_register reg)187{188return swizzle_smear(reg, RC_SWIZZLE_Z);189}190191static struct rc_src_register swizzle_wwww(struct rc_src_register reg)192{193return swizzle_smear(reg, RC_SWIZZLE_W);194}195196static int is_dst_safe_to_reuse(struct rc_instruction *inst)197{198const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);199unsigned i;200201assert(info->HasDstReg);202203if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)204return 0;205206for (i = 0; i < info->NumSrcRegs; i++) {207if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&208inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)209return 0;210}211212return 1;213}214215static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,216struct rc_instruction *inst)217{218unsigned tmp;219220if (is_dst_safe_to_reuse(inst))221tmp = inst->U.I.DstReg.Index;222else223tmp = rc_find_free_temporary(c);224225return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);226}227228static void transform_ABS(struct radeon_compiler* c,229struct rc_instruction* inst)230{231struct rc_src_register src = inst->U.I.SrcReg[0];232src.Abs = 1;233src.Negate = RC_MASK_NONE;234emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src);235rc_remove_instruction(inst);236}237238static void transform_CEIL(struct radeon_compiler* c,239struct rc_instruction* inst)240{241/* Assuming:242* ceil(x) = -floor(-x)243*244* After inlining floor:245* ceil(x) = -(-x-frac(-x))246*247* After simplification:248* ceil(x) = x+frac(-x)249*/250251struct rc_dst_register dst = try_to_reuse_dst(c, inst);252emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));253emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,254inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));255rc_remove_instruction(inst);256}257258static void transform_CLAMP(struct radeon_compiler *c,259struct rc_instruction *inst)260{261/* CLAMP dst, src, min, max262* into:263* MIN tmp, src, max264* MAX dst, tmp, min265*/266struct rc_dst_register dst = try_to_reuse_dst(c, inst);267emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,268inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);269emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg,270srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);271rc_remove_instruction(inst);272}273274static void transform_DP2(struct radeon_compiler* c,275struct rc_instruction* inst)276{277struct rc_src_register src0 = inst->U.I.SrcReg[0];278struct rc_src_register src1 = inst->U.I.SrcReg[1];279src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);280src0.Swizzle &= ~(63 << (3 * 2));281src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));282src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);283src1.Swizzle &= ~(63 << (3 * 2));284src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));285emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);286rc_remove_instruction(inst);287}288289static void transform_DPH(struct radeon_compiler* c,290struct rc_instruction* inst)291{292struct rc_src_register src0 = inst->U.I.SrcReg[0];293src0.Negate &= ~RC_MASK_W;294src0.Swizzle &= ~(7 << (3 * 3));295src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);296emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);297rc_remove_instruction(inst);298}299300/**301* [1, src0.y*src1.y, src0.z, src1.w]302* So basically MUL with lotsa swizzling.303*/304static void transform_DST(struct radeon_compiler* c,305struct rc_instruction* inst)306{307emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,308swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),309swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));310rc_remove_instruction(inst);311}312313static void transform_FLR(struct radeon_compiler* c,314struct rc_instruction* inst)315{316struct rc_dst_register dst = try_to_reuse_dst(c, inst);317emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);318emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,319inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));320rc_remove_instruction(inst);321}322323static void transform_TRUNC(struct radeon_compiler* c,324struct rc_instruction* inst)325{326/* Definition of trunc:327* trunc(x) = (abs(x) - fract(abs(x))) * sgn(x)328*329* The multiplication by sgn(x) can be simplified using CMP:330* y * sgn(x) = (x < 0 ? -y : y)331*/332struct rc_dst_register dst = try_to_reuse_dst(c, inst);333emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, absolute(inst->U.I.SrcReg[0]));334emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, absolute(inst->U.I.SrcReg[0]),335negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));336emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, inst->U.I.SrcReg[0],337negate(srcreg(RC_FILE_TEMPORARY, dst.Index)), srcreg(RC_FILE_TEMPORARY, dst.Index));338rc_remove_instruction(inst);339}340341/**342* Definition of LIT (from ARB_fragment_program):343*344* tmp = VectorLoad(op0);345* if (tmp.x < 0) tmp.x = 0;346* if (tmp.y < 0) tmp.y = 0;347* if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);348* else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;349* result.x = 1.0;350* result.y = tmp.x;351* result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;352* result.w = 1.0;353*354* The longest path of computation is the one leading to result.z,355* consisting of 5 operations. This implementation of LIT takes356* 5 slots, if the subsequent optimization passes are clever enough357* to pair instructions correctly.358*/359static void transform_LIT(struct radeon_compiler* c,360struct rc_instruction* inst)361{362unsigned int constant;363unsigned int constant_swizzle;364unsigned int temp;365struct rc_src_register srctemp;366367constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);368369if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {370struct rc_instruction * inst_mov;371372inst_mov = emit1(c, inst,373RC_OPCODE_MOV, 0, inst->U.I.DstReg,374srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));375376inst->U.I.DstReg.File = RC_FILE_TEMPORARY;377inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;378inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;379}380381temp = inst->U.I.DstReg.Index;382srctemp = srcreg(RC_FILE_TEMPORARY, temp);383384/* tmp.x = max(0.0, Src.x); */385/* tmp.y = max(0.0, Src.y); */386/* tmp.w = clamp(Src.z, -128+eps, 128-eps); */387emit2(c, inst->Prev, RC_OPCODE_MAX, 0,388dstregtmpmask(temp, RC_MASK_XYW),389inst->U.I.SrcReg[0],390swizzle(srcreg(RC_FILE_CONSTANT, constant),391RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));392emit2(c, inst->Prev, RC_OPCODE_MIN, 0,393dstregtmpmask(temp, RC_MASK_Z),394swizzle_wwww(srctemp),395negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));396397/* tmp.w = Pow(tmp.y, tmp.w) */398emit1(c, inst->Prev, RC_OPCODE_LG2, 0,399dstregtmpmask(temp, RC_MASK_W),400swizzle_yyyy(srctemp));401emit2(c, inst->Prev, RC_OPCODE_MUL, 0,402dstregtmpmask(temp, RC_MASK_W),403swizzle_wwww(srctemp),404swizzle_zzzz(srctemp));405emit1(c, inst->Prev, RC_OPCODE_EX2, 0,406dstregtmpmask(temp, RC_MASK_W),407swizzle_wwww(srctemp));408409/* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */410emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,411dstregtmpmask(temp, RC_MASK_Z),412negate(swizzle_xxxx(srctemp)),413swizzle_wwww(srctemp),414builtin_zero);415416/* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */417emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,418dstregtmpmask(temp, RC_MASK_XYW),419swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));420421rc_remove_instruction(inst);422}423424static void transform_LRP(struct radeon_compiler* c,425struct rc_instruction* inst)426{427struct rc_dst_register dst = try_to_reuse_dst(c, inst);428429emit2(c, inst->Prev, RC_OPCODE_ADD, 0,430dst,431inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));432emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I,433inst->U.I.DstReg,434inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);435436rc_remove_instruction(inst);437}438439static void transform_POW(struct radeon_compiler* c,440struct rc_instruction* inst)441{442struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);443struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);444tempdst.WriteMask = RC_MASK_W;445tempsrc.Swizzle = RC_SWIZZLE_WWWW;446447emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));448emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));449emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);450451rc_remove_instruction(inst);452}453454/* dst = ROUND(src) :455* add = src + .5456* frac = FRC(add)457* dst = add - frac458*459* According to the GLSL spec, the implementor can decide which way to round460* when the fraction is .5. We round down for .5.461*462*/463static void transform_ROUND(struct radeon_compiler* c,464struct rc_instruction* inst)465{466unsigned int mask = inst->U.I.DstReg.WriteMask;467unsigned int frac_index, add_index;468struct rc_dst_register frac_dst, add_dst;469struct rc_src_register frac_src, add_src;470471/* add = src + .5 */472add_index = rc_find_free_temporary(c);473add_dst = dstregtmpmask(add_index, mask);474emit2(c, inst->Prev, RC_OPCODE_ADD, 0, add_dst, inst->U.I.SrcReg[0],475builtin_half);476add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);477478479/* frac = FRC(add) */480frac_index = rc_find_free_temporary(c);481frac_dst = dstregtmpmask(frac_index, mask);482emit1(c, inst->Prev, RC_OPCODE_FRC, 0, frac_dst, add_src);483frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);484485/* dst = add - frac */486emit2(c, inst->Prev, RC_OPCODE_ADD, 0, inst->U.I.DstReg,487add_src, negate(frac_src));488rc_remove_instruction(inst);489}490491static void transform_RSQ(struct radeon_compiler* c,492struct rc_instruction* inst)493{494inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);495}496497static void transform_SEQ(struct radeon_compiler* c,498struct rc_instruction* inst)499{500struct rc_dst_register dst = try_to_reuse_dst(c, inst);501502emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));503emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,504negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);505506rc_remove_instruction(inst);507}508509static void transform_SFL(struct radeon_compiler* c,510struct rc_instruction* inst)511{512emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero);513rc_remove_instruction(inst);514}515516static void transform_SGE(struct radeon_compiler* c,517struct rc_instruction* inst)518{519struct rc_dst_register dst = try_to_reuse_dst(c, inst);520521emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));522emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,523srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);524525rc_remove_instruction(inst);526}527528static void transform_SGT(struct radeon_compiler* c,529struct rc_instruction* inst)530{531struct rc_dst_register dst = try_to_reuse_dst(c, inst);532533emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);534emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,535srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);536537rc_remove_instruction(inst);538}539540static void transform_SLE(struct radeon_compiler* c,541struct rc_instruction* inst)542{543struct rc_dst_register dst = try_to_reuse_dst(c, inst);544545emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);546emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,547srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);548549rc_remove_instruction(inst);550}551552static void transform_SLT(struct radeon_compiler* c,553struct rc_instruction* inst)554{555struct rc_dst_register dst = try_to_reuse_dst(c, inst);556557emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));558emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,559srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);560561rc_remove_instruction(inst);562}563564static void transform_SNE(struct radeon_compiler* c,565struct rc_instruction* inst)566{567struct rc_dst_register dst = try_to_reuse_dst(c, inst);568569emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));570emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,571negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);572573rc_remove_instruction(inst);574}575576static void transform_SSG(struct radeon_compiler* c,577struct rc_instruction* inst)578{579/* result = sign(x)580*581* CMP tmp0, -x, 1, 0582* CMP tmp1, x, 1, 0583* ADD result, tmp0, -tmp1;584*/585struct rc_dst_register dst0;586unsigned tmp1;587588/* 0 < x */589dst0 = try_to_reuse_dst(c, inst);590emit3(c, inst->Prev, RC_OPCODE_CMP, 0,591dst0,592negate(inst->U.I.SrcReg[0]),593builtin_one,594builtin_zero);595596/* x < 0 */597tmp1 = rc_find_free_temporary(c);598emit3(c, inst->Prev, RC_OPCODE_CMP, 0,599dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),600inst->U.I.SrcReg[0],601builtin_one,602builtin_zero);603604/* Either both are zero, or one of them is one and the other is zero. */605/* result = tmp0 - tmp1 */606emit2(c, inst->Prev, RC_OPCODE_ADD, 0,607inst->U.I.DstReg,608srcreg(RC_FILE_TEMPORARY, dst0.Index),609negate(srcreg(RC_FILE_TEMPORARY, tmp1)));610611rc_remove_instruction(inst);612}613614static void transform_SUB(struct radeon_compiler* c,615struct rc_instruction* inst)616{617inst->U.I.Opcode = RC_OPCODE_ADD;618inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);619}620621static void transform_SWZ(struct radeon_compiler* c,622struct rc_instruction* inst)623{624inst->U.I.Opcode = RC_OPCODE_MOV;625}626627static void transform_XPD(struct radeon_compiler* c,628struct rc_instruction* inst)629{630struct rc_dst_register dst = try_to_reuse_dst(c, inst);631632emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,633swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),634swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));635emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg,636swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),637swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),638negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));639640rc_remove_instruction(inst);641}642643644/**645* Can be used as a transformation for @ref radeonClauseLocalTransform,646* no userData necessary.647*648* Eliminates the following ALU instructions:649* ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD650* using:651* MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP652*653* Transforms RSQ to Radeon's native RSQ by explicitly setting654* absolute value.655*656* @note should be applicable to R300 and R500 fragment programs.657*/658int radeonTransformALU(659struct radeon_compiler * c,660struct rc_instruction* inst,661void* unused)662{663switch(inst->U.I.Opcode) {664case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;665case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;666case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;667case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;668case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;669case RC_OPCODE_DST: transform_DST(c, inst); return 1;670case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;671case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;672case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;673case RC_OPCODE_POW: transform_POW(c, inst); return 1;674case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;675case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;676case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;677case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;678case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;679case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;680case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;681case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;682case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;683case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;684case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;685case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;686case RC_OPCODE_TRUNC: transform_TRUNC(c, inst); return 1;687case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;688default:689return 0;690}691}692693694static void transform_r300_vertex_ABS(struct radeon_compiler* c,695struct rc_instruction* inst)696{697/* Note: r500 can take absolute values, but r300 cannot. */698inst->U.I.Opcode = RC_OPCODE_MAX;699inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];700inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;701}702703static void transform_r300_vertex_CMP(struct radeon_compiler* c,704struct rc_instruction* inst)705{706/* There is no decent CMP available, so let's rig one up.707* CMP is defined as dst = src0 < 0.0 ? src1 : src2708* The following sequence consumes zero to two temps and two extra slots709* (the second temp and the second slot is consumed by transform_LRP),710* but should be equivalent:711*712* SLT tmp0, src0, 0.0713* LRP dst, tmp0, src1, src2714*715* Yes, I know, I'm a mad scientist. ~ C. & M. */716struct rc_dst_register dst = try_to_reuse_dst(c, inst);717718/* SLT tmp0, src0, 0.0 */719emit2(c, inst->Prev, RC_OPCODE_SLT, 0,720dst,721inst->U.I.SrcReg[0], builtin_zero);722723/* LRP dst, tmp0, src1, src2 */724transform_LRP(c,725emit3(c, inst->Prev, RC_OPCODE_LRP, 0,726inst->U.I.DstReg,727srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]));728729rc_remove_instruction(inst);730}731732static void transform_r300_vertex_DP2(struct radeon_compiler* c,733struct rc_instruction* inst)734{735struct rc_instruction *next_inst = inst->Next;736transform_DP2(c, inst);737next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;738}739740static void transform_r300_vertex_DP3(struct radeon_compiler* c,741struct rc_instruction* inst)742{743struct rc_src_register src0 = inst->U.I.SrcReg[0];744struct rc_src_register src1 = inst->U.I.SrcReg[1];745src0.Negate &= ~RC_MASK_W;746src0.Swizzle &= ~(7 << (3 * 3));747src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);748src1.Negate &= ~RC_MASK_W;749src1.Swizzle &= ~(7 << (3 * 3));750src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);751emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);752rc_remove_instruction(inst);753}754755static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,756struct rc_instruction* inst)757{758struct rc_dst_register dst = try_to_reuse_dst(c, inst);759unsigned constant_swizzle;760int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,7610.0000000000000000001,762&constant_swizzle);763764/* MOV dst, src */765dst.WriteMask = RC_MASK_XYZW;766emit1(c, inst->Prev, RC_OPCODE_MOV, 0,767dst,768inst->U.I.SrcReg[0]);769770/* MAX dst.y, src, 0.00...001 */771emit2(c, inst->Prev, RC_OPCODE_MAX, 0,772dstregtmpmask(dst.Index, RC_MASK_Y),773srcreg(RC_FILE_TEMPORARY, dst.Index),774srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));775776inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);777}778779static void transform_r300_vertex_SEQ(struct radeon_compiler *c,780struct rc_instruction *inst)781{782/* x = y <==> x >= y && y >= x */783int tmp = rc_find_free_temporary(c);784785/* x <= y */786emit2(c, inst->Prev, RC_OPCODE_SGE, 0,787dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),788inst->U.I.SrcReg[0],789inst->U.I.SrcReg[1]);790791/* y <= x */792emit2(c, inst->Prev, RC_OPCODE_SGE, 0,793inst->U.I.DstReg,794inst->U.I.SrcReg[1],795inst->U.I.SrcReg[0]);796797/* x && y = x * y */798emit2(c, inst->Prev, RC_OPCODE_MUL, 0,799inst->U.I.DstReg,800srcreg(RC_FILE_TEMPORARY, tmp),801srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));802803rc_remove_instruction(inst);804}805806static void transform_r300_vertex_SNE(struct radeon_compiler *c,807struct rc_instruction *inst)808{809/* x != y <==> x < y || y < x */810int tmp = rc_find_free_temporary(c);811812/* x < y */813emit2(c, inst->Prev, RC_OPCODE_SLT, 0,814dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),815inst->U.I.SrcReg[0],816inst->U.I.SrcReg[1]);817818/* y < x */819emit2(c, inst->Prev, RC_OPCODE_SLT, 0,820inst->U.I.DstReg,821inst->U.I.SrcReg[1],822inst->U.I.SrcReg[0]);823824/* x || y = max(x, y) */825emit2(c, inst->Prev, RC_OPCODE_MAX, 0,826inst->U.I.DstReg,827srcreg(RC_FILE_TEMPORARY, tmp),828srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));829830rc_remove_instruction(inst);831}832833static void transform_r300_vertex_SGT(struct radeon_compiler* c,834struct rc_instruction* inst)835{836/* x > y <==> -x < -y */837inst->U.I.Opcode = RC_OPCODE_SLT;838inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;839inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;840}841842static void transform_r300_vertex_SLE(struct radeon_compiler* c,843struct rc_instruction* inst)844{845/* x <= y <==> -x >= -y */846inst->U.I.Opcode = RC_OPCODE_SGE;847inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;848inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;849}850851static void transform_r300_vertex_SSG(struct radeon_compiler* c,852struct rc_instruction* inst)853{854/* result = sign(x)855*856* SLT tmp0, 0, x;857* SLT tmp1, x, 0;858* ADD result, tmp0, -tmp1;859*/860struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);861unsigned tmp1;862863/* 0 < x */864dst0 = try_to_reuse_dst(c, inst);865emit2(c, inst->Prev, RC_OPCODE_SLT, 0,866dst0,867builtin_zero,868inst->U.I.SrcReg[0]);869870/* x < 0 */871tmp1 = rc_find_free_temporary(c);872emit2(c, inst->Prev, RC_OPCODE_SLT, 0,873dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),874inst->U.I.SrcReg[0],875builtin_zero);876877/* Either both are zero, or one of them is one and the other is zero. */878/* result = tmp0 - tmp1 */879emit2(c, inst->Prev, RC_OPCODE_ADD, 0,880inst->U.I.DstReg,881srcreg(RC_FILE_TEMPORARY, dst0.Index),882negate(srcreg(RC_FILE_TEMPORARY, tmp1)));883884rc_remove_instruction(inst);885}886887static void transform_vertex_TRUNC(struct radeon_compiler* c,888struct rc_instruction* inst)889{890struct rc_instruction *next = inst->Next;891892/* next->Prev is removed after each transformation and replaced893* by a new instruction. */894transform_TRUNC(c, next->Prev);895transform_r300_vertex_CMP(c, next->Prev);896}897898/**899* For use with rc_local_transform, this transforms non-native ALU900* instructions of the r300 up to r500 vertex engine.901*/902int r300_transform_vertex_alu(903struct radeon_compiler * c,904struct rc_instruction* inst,905void* unused)906{907switch(inst->U.I.Opcode) {908case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;909case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;910case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;911case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;912case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;913case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;914case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;915case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;916case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;917case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;918case RC_OPCODE_SEQ:919if (!c->is_r500) {920transform_r300_vertex_SEQ(c, inst);921return 1;922}923return 0;924case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;925case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1;926case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1;927case RC_OPCODE_SNE:928if (!c->is_r500) {929transform_r300_vertex_SNE(c, inst);930return 1;931}932return 0;933case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;934case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;935case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;936case RC_OPCODE_TRUNC: transform_vertex_TRUNC(c, inst); return 1;937case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;938default:939return 0;940}941}942943static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)944{945static const float SinCosConsts[2][4] = {946{9471.273239545, /* 4/PI */948-0.405284735, /* -4/(PI*PI) */9493.141592654, /* PI */9500.2225 /* weight */951},952{9530.75,9540.5,9550.159154943, /* 1/(2*PI) */9566.283185307 /* 2*PI */957}958};959int i;960961for(i = 0; i < 2; ++i)962constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);963}964965/**966* Approximate sin(x), where x is clamped to (-pi/2, pi/2).967*968* MUL tmp.xy, src, { 4/PI, -4/(PI^2) }969* MAD tmp.x, tmp.y, |src|, tmp.x970* MAD tmp.y, tmp.x, |tmp.x|, -tmp.x971* MAD dest, tmp.y, weight, tmp.x972*/973static void sin_approx(974struct radeon_compiler* c, struct rc_instruction * inst,975struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)976{977unsigned int tempreg = rc_find_free_temporary(c);978979emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),980swizzle_xxxx(src),981srcreg(RC_FILE_CONSTANT, constants[0]));982emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),983swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),984absolute(swizzle_xxxx(src)),985swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));986emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),987swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),988absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),989negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));990emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,991swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),992swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),993swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));994}995996/**997* Translate the trigonometric functions COS, SIN, and SCS998* using only the basic instructions999* MOV, ADD, MUL, MAD, FRC1000*/1001int r300_transform_trig_simple(struct radeon_compiler* c,1002struct rc_instruction* inst,1003void* unused)1004{1005unsigned int constants[2];1006unsigned int tempreg;10071008if (inst->U.I.Opcode != RC_OPCODE_COS &&1009inst->U.I.Opcode != RC_OPCODE_SIN &&1010inst->U.I.Opcode != RC_OPCODE_SCS)1011return 0;10121013tempreg = rc_find_free_temporary(c);10141015sincos_constants(c, constants);10161017if (inst->U.I.Opcode == RC_OPCODE_COS) {1018/* MAD tmp.x, src, 1/(2*PI), 0.75 */1019/* FRC tmp.x, tmp.x */1020/* MAD tmp.z, tmp.x, 2*PI, -PI */1021emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),1022swizzle_xxxx(inst->U.I.SrcReg[0]),1023swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),1024swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));1025emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),1026swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));1027emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),1028swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),1029swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),1030negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));10311032sin_approx(c, inst, inst->U.I.DstReg,1033swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),1034constants);1035} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {1036emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),1037swizzle_xxxx(inst->U.I.SrcReg[0]),1038swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),1039swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));1040emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),1041swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));1042emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),1043swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),1044swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),1045negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));10461047sin_approx(c, inst, inst->U.I.DstReg,1048swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),1049constants);1050} else {1051struct rc_dst_register dst;10521053emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),1054swizzle_xxxx(inst->U.I.SrcReg[0]),1055swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),1056swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));1057emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),1058srcreg(RC_FILE_TEMPORARY, tempreg));1059emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),1060srcreg(RC_FILE_TEMPORARY, tempreg),1061swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),1062negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));10631064dst = inst->U.I.DstReg;10651066dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;1067sin_approx(c, inst, dst,1068swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),1069constants);10701071dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;1072sin_approx(c, inst, dst,1073swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),1074constants);1075}10761077rc_remove_instruction(inst);10781079return 1;1080}10811082static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,1083struct rc_instruction *inst,1084unsigned srctmp)1085{1086if (inst->U.I.Opcode == RC_OPCODE_COS) {1087emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,1088srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));1089} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {1090emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,1091inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));1092} else if (inst->U.I.Opcode == RC_OPCODE_SCS) {1093struct rc_dst_register moddst = inst->U.I.DstReg;10941095if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {1096moddst.WriteMask = RC_MASK_X;1097emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst,1098srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));1099}1100if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {1101moddst.WriteMask = RC_MASK_Y;1102emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst,1103srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));1104}1105}11061107rc_remove_instruction(inst);1108}110911101111/**1112* Transform the trigonometric functions COS, SIN, and SCS1113* to include pre-scaling by 1/(2*PI) and taking the fractional1114* part, so that the input to COS and SIN is always in the range [0,1).1115* SCS is replaced by one COS and one SIN instruction.1116*1117* @warning This transformation implicitly changes the semantics of SIN and COS!1118*/1119int radeonTransformTrigScale(struct radeon_compiler* c,1120struct rc_instruction* inst,1121void* unused)1122{1123static const float RCP_2PI = 0.15915494309189535;1124unsigned int temp;1125unsigned int constant;1126unsigned int constant_swizzle;11271128if (inst->U.I.Opcode != RC_OPCODE_COS &&1129inst->U.I.Opcode != RC_OPCODE_SIN &&1130inst->U.I.Opcode != RC_OPCODE_SCS)1131return 0;11321133temp = rc_find_free_temporary(c);1134constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);11351136emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),1137swizzle_xxxx(inst->U.I.SrcReg[0]),1138srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));1139emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),1140srcreg(RC_FILE_TEMPORARY, temp));11411142r300_transform_SIN_COS_SCS(c, inst, temp);1143return 1;1144}11451146/**1147* Transform the trigonometric functions COS, SIN, and SCS1148* so that the input to COS and SIN is always in the range [-PI, PI].1149* SCS is replaced by one COS and one SIN instruction.1150*/1151int r300_transform_trig_scale_vertex(struct radeon_compiler *c,1152struct rc_instruction *inst,1153void *unused)1154{1155static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};1156unsigned int temp;1157unsigned int constant;11581159if (inst->U.I.Opcode != RC_OPCODE_COS &&1160inst->U.I.Opcode != RC_OPCODE_SIN &&1161inst->U.I.Opcode != RC_OPCODE_SCS)1162return 0;11631164/* Repeat x in the range [-PI, PI]:1165*1166* repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI1167*/11681169temp = rc_find_free_temporary(c);1170constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons);11711172emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),1173swizzle_xxxx(inst->U.I.SrcReg[0]),1174srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX),1175srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY));1176emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),1177srcreg(RC_FILE_TEMPORARY, temp));1178emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),1179srcreg(RC_FILE_TEMPORARY, temp),1180srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ),1181srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW));11821183r300_transform_SIN_COS_SCS(c, inst, temp);1184return 1;1185}11861187/**1188* Rewrite DDX/DDY instructions to properly work with r5xx shaders.1189* The r5xx MDH/MDV instruction provides per-quad partial derivatives.1190* It takes the form A*B+C. A and C are set by setting src0. B should be -1.1191*1192* @warning This explicitly changes the form of DDX and DDY!1193*/11941195int radeonTransformDeriv(struct radeon_compiler* c,1196struct rc_instruction* inst,1197void* unused)1198{1199if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)1200return 0;12011202inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;1203inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;12041205return 1;1206}12071208/**1209* IF Temp[0].x -> IF Temp[0].x1210* ... -> ...1211* KILL -> KIL -abs(Temp[0].x)1212* ... -> ...1213* ENDIF -> ENDIF1214*1215* === OR ===1216*1217* IF Temp[0].x -\1218* KILL - > KIL -abs(Temp[0].x)1219* ENDIF -/1220*1221* === OR ===1222*1223* IF Temp[0].x -> IF Temp[0].x1224* ... -> ...1225* ELSE -> ELSE1226* ... -> ...1227* KILL -> KIL -abs(Temp[0].x)1228* ... -> ...1229* ENDIF -> ENDIF1230*1231* === OR ===1232*1233* KILL -> KIL -none.11111234*1235* This needs to be done in its own pass, because it might modify the1236* instructions before and after KILL.1237*/1238void rc_transform_KILL(struct radeon_compiler * c, void *user)1239{1240struct rc_instruction * inst;1241for (inst = c->Program.Instructions.Next;1242inst != &c->Program.Instructions; inst = inst->Next) {1243struct rc_instruction * if_inst;1244unsigned in_if = 0;12451246if (inst->U.I.Opcode != RC_OPCODE_KILP)1247continue;12481249for (if_inst = inst->Prev; if_inst != &c->Program.Instructions;1250if_inst = if_inst->Prev) {12511252if (if_inst->U.I.Opcode == RC_OPCODE_IF) {1253in_if = 1;1254break;1255}1256}12571258inst->U.I.Opcode = RC_OPCODE_KIL;12591260if (!in_if) {1261inst->U.I.SrcReg[0] = negate(builtin_one);1262} else {1263/* This should work even if the KILP is inside the ELSE1264* block, because -0.0 is considered negative. */1265inst->U.I.SrcReg[0] =1266negate(absolute(if_inst->U.I.SrcReg[0]));12671268if (inst->Prev->U.I.Opcode != RC_OPCODE_IF1269&& inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {12701271/* Optimize the special case:1272* IF Temp[0].x1273* KILP1274* ENDIF1275*/12761277/* Remove IF */1278rc_remove_instruction(inst->Prev);1279/* Remove ENDIF */1280rc_remove_instruction(inst->Next);1281}1282}1283}1284}12851286int rc_force_output_alpha_to_one(struct radeon_compiler *c,1287struct rc_instruction *inst, void *data)1288{1289struct r300_fragment_program_compiler *fragc = (struct r300_fragment_program_compiler*)c;1290const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);1291unsigned tmp;12921293if (!info->HasDstReg || inst->U.I.DstReg.File != RC_FILE_OUTPUT ||1294inst->U.I.DstReg.Index == fragc->OutputDepth)1295return 1;12961297tmp = rc_find_free_temporary(c);12981299/* Insert MOV after inst, set alpha to 1. */1300emit1(c, inst, RC_OPCODE_MOV, 0, inst->U.I.DstReg,1301srcregswz(RC_FILE_TEMPORARY, tmp, RC_SWIZZLE_XYZ1));13021303/* Re-route the destination of inst to the source of mov. */1304inst->U.I.DstReg.File = RC_FILE_TEMPORARY;1305inst->U.I.DstReg.Index = tmp;13061307/* Move the saturate output modifier to the MOV instruction1308* (for better copy propagation). */1309inst->Next->U.I.SaturateMode = inst->U.I.SaturateMode;1310inst->U.I.SaturateMode = RC_SATURATE_NONE;1311return 1;1312}131313141315