Path: blob/21.2-virgl/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c
4574 views
/*1* Copyright (C) 2009 Nicolai Haehnle.2*3* All Rights Reserved.4*5* Permission is hereby granted, free of charge, to any person obtaining6* a copy of this software and associated documentation files (the7* "Software"), to deal in the Software without restriction, including8* without limitation the rights to use, copy, modify, merge, publish,9* distribute, sublicense, and/or sell copies of the Software, and to10* permit persons to whom the Software is furnished to do so, subject to11* the following conditions:12*13* The above copyright notice and this permission notice (including the14* next paragraph) shall be included in all copies or substantial15* portions of the Software.16*17* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,18* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF19* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.20* IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE21* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION22* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION23* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.24*25*/2627#include "radeon_program_pair.h"2829#include <stdio.h>3031#include "radeon_compiler.h"32#include "radeon_compiler_util.h"33#include "radeon_dataflow.h"34#include "radeon_list.h"35#include "radeon_variable.h"3637#include "util/u_debug.h"3839#define VERBOSE 04041#define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)4243struct schedule_instruction {44struct rc_instruction * Instruction;4546/** Next instruction in the linked list of ready instructions. */47struct schedule_instruction *NextReady;4849/** Values that this instruction reads and writes */50struct reg_value * WriteValues[4];51struct reg_value * ReadValues[12];52unsigned int NumWriteValues:3;53unsigned int NumReadValues:4;5455/**56* Number of (read and write) dependencies that must be resolved before57* this instruction can be scheduled.58*/59unsigned int NumDependencies:5;6061/** List of all readers (see rc_get_readers() for the definition of62* "all readers"), even those outside the basic block this instruction63* lives in. */64struct rc_reader_data GlobalReaders;6566/** If the scheduler has paired an RGB and an Alpha instruction together,67* PairedInst references the alpha instruction's dependency information.68*/69struct schedule_instruction * PairedInst;7071/** This scheduler uses the value of Score to determine which72* instruction to schedule. Instructions with a higher value of Score73* will be scheduled first. */74int Score;7576/** The number of components that read from a TEX instruction. */77unsigned TexReadCount;7879/** For TEX instructions a list of readers */80struct rc_list * TexReaders;81};828384/**85* Used to keep track of which instructions read a value.86*/87struct reg_value_reader {88struct schedule_instruction *Reader;89struct reg_value_reader *Next;90};9192/**93* Used to keep track which values are stored in each component of a94* RC_FILE_TEMPORARY.95*/96struct reg_value {97struct schedule_instruction * Writer;9899/**100* Unordered linked list of instructions that read from this value.101* When this value becomes available, we increase all readers'102* dependency count.103*/104struct reg_value_reader *Readers;105106/**107* Number of readers of this value. This is decremented each time108* a reader of the value is committed.109* When the reader count reaches zero, the dependency count110* of the instruction writing \ref Next is decremented.111*/112unsigned int NumReaders;113114struct reg_value *Next; /**< Pointer to the next value to be written to the same register */115};116117struct register_state {118struct reg_value * Values[4];119};120121struct remap_reg {122struct rc_instruction * Inst;123unsigned int OldIndex:(RC_REGISTER_INDEX_BITS+1);124unsigned int OldSwizzle:3;125unsigned int NewIndex:(RC_REGISTER_INDEX_BITS+1);126unsigned int NewSwizzle:3;127unsigned int OnlyTexReads:1;128struct remap_reg * Next;129};130131struct schedule_state {132struct radeon_compiler * C;133struct schedule_instruction * Current;134/** Array of the previous writers of Current's destination register135* indexed by channel. */136struct schedule_instruction * PrevWriter[4];137138struct register_state Temporary[RC_REGISTER_MAX_INDEX];139140/**141* Linked lists of instructions that can be scheduled right now,142* based on which ALU/TEX resources they require.143*/144/*@{*/145struct schedule_instruction *ReadyFullALU;146struct schedule_instruction *ReadyRGB;147struct schedule_instruction *ReadyAlpha;148struct schedule_instruction *ReadyTEX;149/*@}*/150struct rc_list *PendingTEX;151152void (*CalcScore)(struct schedule_instruction *);153long max_tex_group;154unsigned PrevBlockHasTex:1;155unsigned TEXCount;156unsigned Opt:1;157};158159static struct reg_value ** get_reg_valuep(struct schedule_state * s,160rc_register_file file, unsigned int index, unsigned int chan)161{162if (file != RC_FILE_TEMPORARY)163return 0;164165if (index >= RC_REGISTER_MAX_INDEX) {166rc_error(s->C, "%s: index %i out of bounds\n", __FUNCTION__, index);167return 0;168}169170return &s->Temporary[index].Values[chan];171}172173static unsigned get_tex_read_count(struct schedule_instruction * sinst)174{175unsigned tex_read_count = sinst->TexReadCount;176if (sinst->PairedInst) {177tex_read_count += sinst->PairedInst->TexReadCount;178}179return tex_read_count;180}181182#if VERBOSE183static void print_list(struct schedule_instruction * sinst)184{185struct schedule_instruction * ptr;186for (ptr = sinst; ptr; ptr=ptr->NextReady) {187unsigned tex_read_count = get_tex_read_count(ptr);188unsigned score = sinst->Score;189fprintf(stderr,"%u (%d) [%u],", ptr->Instruction->IP, score,190tex_read_count);191}192fprintf(stderr, "\n");193}194#endif195196static void remove_inst_from_list(struct schedule_instruction ** list,197struct schedule_instruction * inst)198{199struct schedule_instruction * prev = NULL;200struct schedule_instruction * list_ptr;201for (list_ptr = *list; list_ptr; prev = list_ptr,202list_ptr = list_ptr->NextReady) {203if (list_ptr == inst) {204if (prev) {205prev->NextReady = inst->NextReady;206} else {207*list = inst->NextReady;208}209inst->NextReady = NULL;210break;211}212}213}214215static void add_inst_to_list(struct schedule_instruction ** list, struct schedule_instruction * inst)216{217inst->NextReady = *list;218*list = inst;219}220221static void add_inst_to_list_score(struct schedule_instruction ** list,222struct schedule_instruction * inst)223{224struct schedule_instruction * temp;225struct schedule_instruction * prev;226if (!*list) {227*list = inst;228return;229}230temp = *list;231prev = NULL;232while(temp && inst->Score <= temp->Score) {233prev = temp;234temp = temp->NextReady;235}236237if (!prev) {238inst->NextReady = temp;239*list = inst;240} else {241prev->NextReady = inst;242inst->NextReady = temp;243}244}245246static void instruction_ready(struct schedule_state * s, struct schedule_instruction * sinst)247{248DBG("%i is now ready\n", sinst->Instruction->IP);249250/* Adding Ready TEX instructions to the end of the "Ready List" helps251* us emit TEX instructions in blocks without losing our place. */252if (sinst->Instruction->Type == RC_INSTRUCTION_NORMAL)253add_inst_to_list_score(&s->ReadyTEX, sinst);254else if (sinst->Instruction->U.P.Alpha.Opcode == RC_OPCODE_NOP)255add_inst_to_list_score(&s->ReadyRGB, sinst);256else if (sinst->Instruction->U.P.RGB.Opcode == RC_OPCODE_NOP)257add_inst_to_list_score(&s->ReadyAlpha, sinst);258else259add_inst_to_list_score(&s->ReadyFullALU, sinst);260}261262static void decrease_dependencies(struct schedule_state * s, struct schedule_instruction * sinst)263{264assert(sinst->NumDependencies > 0);265sinst->NumDependencies--;266if (!sinst->NumDependencies)267instruction_ready(s, sinst);268}269270/* These functions provide different heuristics for scheduling instructions.271* The default is calc_score_readers. */272273#if 0274275static void calc_score_zero(struct schedule_instruction * sinst)276{277sinst->Score = 0;278}279280static void calc_score_deps(struct schedule_instruction * sinst)281{282int i;283sinst->Score = 0;284for (i = 0; i < sinst->NumWriteValues; i++) {285struct reg_value * v = sinst->WriteValues[i];286if (v->NumReaders) {287struct reg_value_reader * r;288for (r = v->Readers; r; r = r->Next) {289if (r->Reader->NumDependencies == 1) {290sinst->Score += 100;291}292sinst->Score += r->Reader->NumDependencies;293}294}295}296}297298#endif299300#define NO_OUTPUT_SCORE (1 << 24)301302static void score_no_output(struct schedule_instruction * sinst)303{304assert(sinst->Instruction->Type != RC_INSTRUCTION_NORMAL);305if (!sinst->Instruction->U.P.RGB.OutputWriteMask &&306!sinst->Instruction->U.P.Alpha.OutputWriteMask) {307if (sinst->PairedInst) {308if (!sinst->PairedInst->Instruction->U.P.309RGB.OutputWriteMask310&& !sinst->PairedInst->Instruction->U.P.311Alpha.OutputWriteMask) {312sinst->Score |= NO_OUTPUT_SCORE;313}314315} else {316sinst->Score |= NO_OUTPUT_SCORE;317}318}319}320321#define PAIRED_SCORE (1 << 16)322323static void calc_score_r300(struct schedule_instruction * sinst)324{325unsigned src_idx;326327if (sinst->Instruction->Type == RC_INSTRUCTION_NORMAL) {328sinst->Score = 0;329return;330}331332score_no_output(sinst);333334if (sinst->PairedInst) {335sinst->Score |= PAIRED_SCORE;336return;337}338339for (src_idx = 0; src_idx < 4; src_idx++) {340sinst->Score += sinst->Instruction->U.P.RGB.Src[src_idx].Used +341sinst->Instruction->U.P.Alpha.Src[src_idx].Used;342}343}344345#define NO_READ_TEX_SCORE (1 << 16)346347static void calc_score_readers(struct schedule_instruction * sinst)348{349if (sinst->Instruction->Type == RC_INSTRUCTION_NORMAL) {350sinst->Score = 0;351} else {352sinst->Score = sinst->NumReadValues;353if (sinst->PairedInst) {354sinst->Score += sinst->PairedInst->NumReadValues;355}356if (get_tex_read_count(sinst) == 0) {357sinst->Score |= NO_READ_TEX_SCORE;358}359score_no_output(sinst);360}361}362363/**364* This function decreases the dependencies of the next instruction that365* wants to write to each of sinst's read values.366*/367static void commit_update_reads(struct schedule_state * s,368struct schedule_instruction * sinst){369unsigned int i;370for(i = 0; i < sinst->NumReadValues; ++i) {371struct reg_value * v = sinst->ReadValues[i];372assert(v->NumReaders > 0);373v->NumReaders--;374if (!v->NumReaders) {375if (v->Next) {376decrease_dependencies(s, v->Next->Writer);377}378}379}380if (sinst->PairedInst) {381commit_update_reads(s, sinst->PairedInst);382}383}384385static void commit_update_writes(struct schedule_state * s,386struct schedule_instruction * sinst){387unsigned int i;388for(i = 0; i < sinst->NumWriteValues; ++i) {389struct reg_value * v = sinst->WriteValues[i];390if (v->NumReaders) {391for(struct reg_value_reader * r = v->Readers; r; r = r->Next) {392decrease_dependencies(s, r->Reader);393}394} else {395/* This happens in instruction sequences of the type396* OP r.x, ...;397* OP r.x, r.x, ...;398* See also the subtlety in how instructions that both399* read and write the same register are scanned.400*/401if (v->Next)402decrease_dependencies(s, v->Next->Writer);403}404}405if (sinst->PairedInst) {406commit_update_writes(s, sinst->PairedInst);407}408}409410static void notify_sem_wait(struct schedule_state *s)411{412struct rc_list * pend_ptr;413for (pend_ptr = s->PendingTEX; pend_ptr; pend_ptr = pend_ptr->Next) {414struct rc_list * read_ptr;415struct schedule_instruction * pending = pend_ptr->Item;416for (read_ptr = pending->TexReaders; read_ptr;417read_ptr = read_ptr->Next) {418struct schedule_instruction * reader = read_ptr->Item;419reader->TexReadCount--;420}421}422s->PendingTEX = NULL;423}424425static void commit_alu_instruction(struct schedule_state * s, struct schedule_instruction * sinst)426{427DBG("%i: commit score = %d\n", sinst->Instruction->IP, sinst->Score);428429commit_update_reads(s, sinst);430431commit_update_writes(s, sinst);432433if (get_tex_read_count(sinst) > 0) {434sinst->Instruction->U.P.SemWait = 1;435notify_sem_wait(s);436}437}438439/**440* Emit all ready texture instructions in a single block.441*442* Emit as a single block to (hopefully) sample many textures in parallel,443* and to avoid hardware indirections on R300.444*/445static void emit_all_tex(struct schedule_state * s, struct rc_instruction * before)446{447struct schedule_instruction *readytex;448struct rc_instruction * inst_begin;449450assert(s->ReadyTEX);451notify_sem_wait(s);452453/* Node marker for R300 */454inst_begin = rc_insert_new_instruction(s->C, before->Prev);455inst_begin->U.I.Opcode = RC_OPCODE_BEGIN_TEX;456457/* Link texture instructions back in */458readytex = s->ReadyTEX;459while(readytex) {460rc_insert_instruction(before->Prev, readytex->Instruction);461DBG("%i: commit TEX reads\n", readytex->Instruction->IP);462463/* All of the TEX instructions in the same TEX block have464* their source registers read from before any of the465* instructions in that block write to their destination466* registers. This means that when we commit a TEX467* instruction, any other TEX instruction that wants to write468* to one of the committed instruction's source register can be469* marked as ready and should be emitted in the same TEX470* block. This prevents the following sequence from being471* emitted in two different TEX blocks:472* 0: TEX temp[0].xyz, temp[1].xy__, 2D[0];473* 1: TEX temp[1].xyz, temp[2].xy__, 2D[0];474*/475commit_update_reads(s, readytex);476readytex = readytex->NextReady;477}478readytex = s->ReadyTEX;479s->ReadyTEX = 0;480while(readytex){481DBG("%i: commit TEX writes\n", readytex->Instruction->IP);482commit_update_writes(s, readytex);483/* Set semaphore bits for last TEX instruction in the block */484if (!readytex->NextReady) {485readytex->Instruction->U.I.TexSemAcquire = 1;486readytex->Instruction->U.I.TexSemWait = 1;487}488rc_list_add(&s->PendingTEX, rc_list(&s->C->Pool, readytex));489readytex = readytex->NextReady;490}491}492493/* This is a helper function for destructive_merge_instructions(). It helps494* merge presubtract sources from two instructions and makes sure the495* presubtract sources end up in the correct spot. This function assumes that496* dst_full is an rgb instruction, meaning that it has a vector instruction(rgb)497* but no scalar instruction (alpha).498* @return 0 if merging the presubtract sources fails.499* @retrun 1 if merging the presubtract sources succeeds.500*/501static int merge_presub_sources(502struct rc_pair_instruction * dst_full,503struct rc_pair_sub_instruction src,504unsigned int type)505{506unsigned int srcp_src, srcp_regs, is_rgb, is_alpha;507struct rc_pair_sub_instruction * dst_sub;508const struct rc_opcode_info * info;509510assert(dst_full->Alpha.Opcode == RC_OPCODE_NOP);511512switch(type) {513case RC_SOURCE_RGB:514is_rgb = 1;515is_alpha = 0;516dst_sub = &dst_full->RGB;517break;518case RC_SOURCE_ALPHA:519is_rgb = 0;520is_alpha = 1;521dst_sub = &dst_full->Alpha;522break;523default:524assert(0);525return 0;526}527528info = rc_get_opcode_info(dst_full->RGB.Opcode);529530if (dst_sub->Src[RC_PAIR_PRESUB_SRC].Used)531return 0;532533srcp_regs = rc_presubtract_src_reg_count(534src.Src[RC_PAIR_PRESUB_SRC].Index);535for(srcp_src = 0; srcp_src < srcp_regs; srcp_src++) {536unsigned int arg;537int free_source;538unsigned int one_way = 0;539struct rc_pair_instruction_source srcp = src.Src[srcp_src];540struct rc_pair_instruction_source temp;541542free_source = rc_pair_alloc_source(dst_full, is_rgb, is_alpha,543srcp.File, srcp.Index);544545/* If free_source < 0 then there are no free source546* slots. */547if (free_source < 0)548return 0;549550temp = dst_sub->Src[srcp_src];551dst_sub->Src[srcp_src] = dst_sub->Src[free_source];552553/* srcp needs src0 and src1 to be the same */554if (free_source < srcp_src) {555if (!temp.Used)556continue;557free_source = rc_pair_alloc_source(dst_full, is_rgb,558is_alpha, temp.File, temp.Index);559if (free_source < 0)560return 0;561one_way = 1;562} else {563dst_sub->Src[free_source] = temp;564}565566/* If free_source == srcp_src, then the presubtract567* source is already in the correct place. */568if (free_source == srcp_src)569continue;570571/* Shuffle the sources, so we can put the572* presubtract source in the correct place. */573for(arg = 0; arg < info->NumSrcRegs; arg++) {574/*If this arg does not read from an rgb source,575* do nothing. */576if (!(rc_source_type_swz(dst_full->RGB.Arg[arg].Swizzle)577& type)) {578continue;579}580581if (dst_full->RGB.Arg[arg].Source == srcp_src)582dst_full->RGB.Arg[arg].Source = free_source;583/* We need to do this just in case register584* is one of the sources already, but in the585* wrong spot. */586else if(dst_full->RGB.Arg[arg].Source == free_source587&& !one_way) {588dst_full->RGB.Arg[arg].Source = srcp_src;589}590}591}592return 1;593}594595596/* This function assumes that rgb.Alpha and alpha.RGB are unused */597static int destructive_merge_instructions(598struct rc_pair_instruction * rgb,599struct rc_pair_instruction * alpha)600{601const struct rc_opcode_info * opcode;602603assert(rgb->Alpha.Opcode == RC_OPCODE_NOP);604assert(alpha->RGB.Opcode == RC_OPCODE_NOP);605606/* Presubtract registers need to be merged first so that registers607* needed by the presubtract operation can be placed in src0 and/or608* src1. */609610/* Merge the rgb presubtract registers. */611if (alpha->RGB.Src[RC_PAIR_PRESUB_SRC].Used) {612if (!merge_presub_sources(rgb, alpha->RGB, RC_SOURCE_RGB)) {613return 0;614}615}616/* Merge the alpha presubtract registers */617if (alpha->Alpha.Src[RC_PAIR_PRESUB_SRC].Used) {618if(!merge_presub_sources(rgb, alpha->Alpha, RC_SOURCE_ALPHA)){619return 0;620}621}622623/* Copy alpha args into rgb */624opcode = rc_get_opcode_info(alpha->Alpha.Opcode);625626for(unsigned int arg = 0; arg < opcode->NumSrcRegs; ++arg) {627unsigned int srcrgb = 0;628unsigned int srcalpha = 0;629unsigned int oldsrc = alpha->Alpha.Arg[arg].Source;630rc_register_file file = 0;631unsigned int index = 0;632int source;633634if (GET_SWZ(alpha->Alpha.Arg[arg].Swizzle, 0) < 3) {635srcrgb = 1;636file = alpha->RGB.Src[oldsrc].File;637index = alpha->RGB.Src[oldsrc].Index;638} else if (GET_SWZ(alpha->Alpha.Arg[arg].Swizzle, 0) < 4) {639srcalpha = 1;640file = alpha->Alpha.Src[oldsrc].File;641index = alpha->Alpha.Src[oldsrc].Index;642}643644source = rc_pair_alloc_source(rgb, srcrgb, srcalpha, file, index);645if (source < 0)646return 0;647648rgb->Alpha.Arg[arg].Source = source;649rgb->Alpha.Arg[arg].Swizzle = alpha->Alpha.Arg[arg].Swizzle;650rgb->Alpha.Arg[arg].Abs = alpha->Alpha.Arg[arg].Abs;651rgb->Alpha.Arg[arg].Negate = alpha->Alpha.Arg[arg].Negate;652}653654/* Copy alpha opcode into rgb */655rgb->Alpha.Opcode = alpha->Alpha.Opcode;656rgb->Alpha.DestIndex = alpha->Alpha.DestIndex;657rgb->Alpha.WriteMask = alpha->Alpha.WriteMask;658rgb->Alpha.OutputWriteMask = alpha->Alpha.OutputWriteMask;659rgb->Alpha.DepthWriteMask = alpha->Alpha.DepthWriteMask;660rgb->Alpha.Saturate = alpha->Alpha.Saturate;661rgb->Alpha.Omod = alpha->Alpha.Omod;662663/* Merge ALU result writing */664if (alpha->WriteALUResult) {665if (rgb->WriteALUResult)666return 0;667668rgb->WriteALUResult = alpha->WriteALUResult;669rgb->ALUResultCompare = alpha->ALUResultCompare;670}671672/* Copy SemWait */673rgb->SemWait |= alpha->SemWait;674675return 1;676}677678/**679* Try to merge the given instructions into the rgb instructions.680*681* Return true on success; on failure, return false, and keep682* the instructions untouched.683*/684static int merge_instructions(struct rc_pair_instruction * rgb, struct rc_pair_instruction * alpha)685{686struct rc_pair_instruction backup;687688/*Instructions can't write output registers and ALU result at the689* same time. */690if ((rgb->WriteALUResult && alpha->Alpha.OutputWriteMask)691|| (rgb->RGB.OutputWriteMask && alpha->WriteALUResult)) {692return 0;693}694695/* Writing output registers in the middle of shaders is slow, so696* we don't want to pair output writes with temp writes. */697if ((rgb->RGB.OutputWriteMask && !alpha->Alpha.OutputWriteMask)698|| (!rgb->RGB.OutputWriteMask && alpha->Alpha.OutputWriteMask)) {699return 0;700}701702memcpy(&backup, rgb, sizeof(struct rc_pair_instruction));703704if (destructive_merge_instructions(rgb, alpha))705return 1;706707memcpy(rgb, &backup, sizeof(struct rc_pair_instruction));708return 0;709}710711static void presub_nop(struct rc_instruction * emitted) {712int prev_rgb_index, prev_alpha_index, i, num_src;713714/* We don't need a nop if the previous instruction is a TEX. */715if (emitted->Prev->Type != RC_INSTRUCTION_PAIR) {716return;717}718if (emitted->Prev->U.P.RGB.WriteMask)719prev_rgb_index = emitted->Prev->U.P.RGB.DestIndex;720else721prev_rgb_index = -1;722if (emitted->Prev->U.P.Alpha.WriteMask)723prev_alpha_index = emitted->Prev->U.P.Alpha.DestIndex;724else725prev_alpha_index = 1;726727/* Check the previous rgb instruction */728if (emitted->U.P.RGB.Src[RC_PAIR_PRESUB_SRC].Used) {729num_src = rc_presubtract_src_reg_count(730emitted->U.P.RGB.Src[RC_PAIR_PRESUB_SRC].Index);731for (i = 0; i < num_src; i++) {732unsigned int index = emitted->U.P.RGB.Src[i].Index;733if (emitted->U.P.RGB.Src[i].File == RC_FILE_TEMPORARY734&& (index == prev_rgb_index735|| index == prev_alpha_index)) {736emitted->Prev->U.P.Nop = 1;737return;738}739}740}741742/* Check the previous alpha instruction. */743if (!emitted->U.P.Alpha.Src[RC_PAIR_PRESUB_SRC].Used)744return;745746num_src = rc_presubtract_src_reg_count(747emitted->U.P.Alpha.Src[RC_PAIR_PRESUB_SRC].Index);748for (i = 0; i < num_src; i++) {749unsigned int index = emitted->U.P.Alpha.Src[i].Index;750if(emitted->U.P.Alpha.Src[i].File == RC_FILE_TEMPORARY751&& (index == prev_rgb_index || index == prev_alpha_index)) {752emitted->Prev->U.P.Nop = 1;753return;754}755}756}757758static void rgb_to_alpha_remap (759struct rc_instruction * inst,760struct rc_pair_instruction_arg * arg,761rc_register_file old_file,762rc_swizzle old_swz,763unsigned int new_index)764{765int new_src_index;766unsigned int i;767768for (i = 0; i < 3; i++) {769if (get_swz(arg->Swizzle, i) == old_swz) {770SET_SWZ(arg->Swizzle, i, RC_SWIZZLE_W);771}772}773new_src_index = rc_pair_alloc_source(&inst->U.P, 0, 1,774old_file, new_index);775/* This conversion is not possible, we must have made a mistake in776* is_rgb_to_alpha_possible. */777if (new_src_index < 0) {778assert(0);779return;780}781782arg->Source = new_src_index;783}784785static int can_remap(unsigned int opcode)786{787switch(opcode) {788case RC_OPCODE_DDX:789case RC_OPCODE_DDY:790return 0;791default:792return 1;793}794}795796static int can_convert_opcode_to_alpha(unsigned int opcode)797{798switch(opcode) {799case RC_OPCODE_DDX:800case RC_OPCODE_DDY:801case RC_OPCODE_DP2:802case RC_OPCODE_DP3:803case RC_OPCODE_DP4:804case RC_OPCODE_DPH:805return 0;806default:807return 1;808}809}810811static void is_rgb_to_alpha_possible(812void * userdata,813struct rc_instruction * inst,814struct rc_pair_instruction_arg * arg,815struct rc_pair_instruction_source * src)816{817unsigned int read_chan = RC_SWIZZLE_UNUSED;818unsigned int alpha_sources = 0;819unsigned int i;820struct rc_reader_data * reader_data = userdata;821822if (!can_remap(inst->U.P.RGB.Opcode)823|| !can_remap(inst->U.P.Alpha.Opcode)) {824reader_data->Abort = 1;825return;826}827828if (!src)829return;830831/* XXX There are some cases where we can still do the conversion if832* a reader reads from a presubtract source, but for now we'll prevent833* it. */834if (arg->Source == RC_PAIR_PRESUB_SRC) {835reader_data->Abort = 1;836return;837}838839/* Make sure the source only reads the register component that we840* are going to be convering from. It is OK if the instruction uses841* this component more than once.842* XXX If the index we will be converting to is the same as the843* current index, then it is OK to read from more than one component.844*/845for (i = 0; i < 3; i++) {846rc_swizzle swz = get_swz(arg->Swizzle, i);847switch(swz) {848case RC_SWIZZLE_X:849case RC_SWIZZLE_Y:850case RC_SWIZZLE_Z:851case RC_SWIZZLE_W:852if (read_chan == RC_SWIZZLE_UNUSED) {853read_chan = swz;854} else if (read_chan != swz) {855reader_data->Abort = 1;856return;857}858break;859default:860break;861}862}863864/* Make sure there are enough alpha sources.865* XXX If we know what register all the readers are going866* to be remapped to, then in some situations we can still do867* the substitution, even if all 3 alpha sources are being used.*/868for (i = 0; i < 3; i++) {869if (inst->U.P.Alpha.Src[i].Used) {870alpha_sources++;871}872}873if (alpha_sources > 2) {874reader_data->Abort = 1;875return;876}877}878879static int convert_rgb_to_alpha(880struct schedule_state * s,881struct schedule_instruction * sched_inst)882{883struct rc_pair_instruction * pair_inst = &sched_inst->Instruction->U.P;884unsigned int old_mask = pair_inst->RGB.WriteMask;885unsigned int old_swz = rc_mask_to_swizzle(old_mask);886const struct rc_opcode_info * info =887rc_get_opcode_info(pair_inst->RGB.Opcode);888int new_index = -1;889unsigned int i;890891if (sched_inst->GlobalReaders.Abort)892return 0;893894if (!pair_inst->RGB.WriteMask)895return 0;896897if (!can_convert_opcode_to_alpha(pair_inst->RGB.Opcode)898|| !can_convert_opcode_to_alpha(pair_inst->Alpha.Opcode)) {899return 0;900}901902assert(sched_inst->NumWriteValues == 1);903904if (!sched_inst->WriteValues[0]) {905assert(0);906return 0;907}908909/* We start at the old index, because if we can reuse the same910* register and just change the swizzle then it is more likely we911* will be able to convert all the readers. */912for (i = pair_inst->RGB.DestIndex; i < RC_REGISTER_MAX_INDEX; i++) {913struct reg_value ** new_regvalp = get_reg_valuep(914s, RC_FILE_TEMPORARY, i, 3);915if (!*new_regvalp) {916struct reg_value ** old_regvalp =917get_reg_valuep(s,918RC_FILE_TEMPORARY,919pair_inst->RGB.DestIndex,920rc_mask_to_swizzle(old_mask));921new_index = i;922*new_regvalp = *old_regvalp;923*old_regvalp = NULL;924new_regvalp = get_reg_valuep(s, RC_FILE_TEMPORARY, i, 3);925break;926}927}928if (new_index < 0) {929return 0;930}931932/* If we are converting a full instruction with RC_OPCODE_REPL_ALPHA933* as the RGB opcode, then the Alpha instruction will already contain934* the correct opcode and instruction args, so we do not want to935* overwrite them.936*/937if (pair_inst->RGB.Opcode != RC_OPCODE_REPL_ALPHA) {938pair_inst->Alpha.Opcode = pair_inst->RGB.Opcode;939memcpy(pair_inst->Alpha.Arg, pair_inst->RGB.Arg,940sizeof(pair_inst->Alpha.Arg));941}942pair_inst->Alpha.DestIndex = new_index;943pair_inst->Alpha.WriteMask = RC_MASK_W;944pair_inst->Alpha.Target = pair_inst->RGB.Target;945pair_inst->Alpha.OutputWriteMask = pair_inst->RGB.OutputWriteMask;946pair_inst->Alpha.DepthWriteMask = pair_inst->RGB.DepthWriteMask;947pair_inst->Alpha.Saturate = pair_inst->RGB.Saturate;948pair_inst->Alpha.Omod = pair_inst->RGB.Omod;949/* Move the swizzles into the first chan */950for (i = 0; i < info->NumSrcRegs; i++) {951unsigned int j;952for (j = 0; j < 3; j++) {953unsigned int swz = get_swz(pair_inst->Alpha.Arg[i].Swizzle, j);954if (swz != RC_SWIZZLE_UNUSED) {955pair_inst->Alpha.Arg[i].Swizzle =956rc_init_swizzle(swz, 1);957break;958}959}960}961pair_inst->RGB.Opcode = RC_OPCODE_NOP;962pair_inst->RGB.DestIndex = 0;963pair_inst->RGB.WriteMask = 0;964pair_inst->RGB.Target = 0;965pair_inst->RGB.OutputWriteMask = 0;966pair_inst->RGB.DepthWriteMask = 0;967pair_inst->RGB.Saturate = 0;968memset(pair_inst->RGB.Arg, 0, sizeof(pair_inst->RGB.Arg));969970for(i = 0; i < sched_inst->GlobalReaders.ReaderCount; i++) {971struct rc_reader reader = sched_inst->GlobalReaders.Readers[i];972rgb_to_alpha_remap(reader.Inst, reader.U.P.Arg,973RC_FILE_TEMPORARY, old_swz, new_index);974}975return 1;976}977978static void try_convert_and_pair(979struct schedule_state *s,980struct schedule_instruction ** inst_list)981{982struct schedule_instruction * list_ptr = *inst_list;983while (list_ptr && *inst_list && (*inst_list)->NextReady) {984int paired = 0;985if (list_ptr->Instruction->U.P.Alpha.Opcode != RC_OPCODE_NOP986&& list_ptr->Instruction->U.P.RGB.Opcode987!= RC_OPCODE_REPL_ALPHA) {988goto next;989}990if (list_ptr->NumWriteValues == 1991&& convert_rgb_to_alpha(s, list_ptr)) {992993struct schedule_instruction * pair_ptr;994remove_inst_from_list(inst_list, list_ptr);995add_inst_to_list_score(&s->ReadyAlpha, list_ptr);996997for (pair_ptr = s->ReadyRGB; pair_ptr;998pair_ptr = pair_ptr->NextReady) {999if (merge_instructions(&pair_ptr->Instruction->U.P,1000&list_ptr->Instruction->U.P)) {1001remove_inst_from_list(&s->ReadyAlpha, list_ptr);1002remove_inst_from_list(&s->ReadyRGB, pair_ptr);1003pair_ptr->PairedInst = list_ptr;10041005add_inst_to_list(&s->ReadyFullALU, pair_ptr);1006list_ptr = *inst_list;1007paired = 1;1008break;1009}10101011}1012}1013if (!paired) {1014next:1015list_ptr = list_ptr->NextReady;1016}1017}1018}10191020/**1021* This function attempts to merge RGB and Alpha instructions together.1022*/1023static void pair_instructions(struct schedule_state * s)1024{1025struct schedule_instruction *rgb_ptr;1026struct schedule_instruction *alpha_ptr;10271028/* Some pairings might fail because they require too1029* many source slots; try all possible pairings if necessary */1030rgb_ptr = s->ReadyRGB;1031while(rgb_ptr) {1032struct schedule_instruction * rgb_next = rgb_ptr->NextReady;1033alpha_ptr = s->ReadyAlpha;1034while(alpha_ptr) {1035struct schedule_instruction * alpha_next = alpha_ptr->NextReady;1036if (merge_instructions(&rgb_ptr->Instruction->U.P, &alpha_ptr->Instruction->U.P)) {1037/* Remove RGB and Alpha from their ready lists.1038*/1039remove_inst_from_list(&s->ReadyRGB, rgb_ptr);1040remove_inst_from_list(&s->ReadyAlpha, alpha_ptr);1041rgb_ptr->PairedInst = alpha_ptr;1042add_inst_to_list(&s->ReadyFullALU, rgb_ptr);1043break;1044}1045alpha_ptr = alpha_next;1046}1047rgb_ptr = rgb_next;1048}10491050if (!s->Opt) {1051return;1052}10531054/* Full instructions that have RC_OPCODE_REPL_ALPHA in the RGB1055* slot can be converted into Alpha instructions. */1056try_convert_and_pair(s, &s->ReadyFullALU);10571058/* Try to convert some of the RGB instructions to Alpha and1059* try to pair it with another RGB. */1060try_convert_and_pair(s, &s->ReadyRGB);1061}10621063static void update_max_score(1064struct schedule_state * s,1065struct schedule_instruction ** list,1066int * max_score,1067struct schedule_instruction ** max_inst_out,1068struct schedule_instruction *** list_out)1069{1070struct schedule_instruction * list_ptr;1071for (list_ptr = *list; list_ptr; list_ptr = list_ptr->NextReady) {1072int score;1073s->CalcScore(list_ptr);1074score = list_ptr->Score;1075if (!*max_inst_out || score > *max_score) {1076*max_score = score;1077*max_inst_out = list_ptr;1078*list_out = list;1079}1080}1081}10821083static void emit_instruction(1084struct schedule_state * s,1085struct rc_instruction * before)1086{1087int max_score = -1;1088struct schedule_instruction * max_inst = NULL;1089struct schedule_instruction ** max_list = NULL;1090unsigned tex_count = 0;1091struct schedule_instruction * tex_ptr;10921093pair_instructions(s);1094#if VERBOSE1095fprintf(stderr, "Full:\n");1096print_list(s->ReadyFullALU);1097fprintf(stderr, "RGB:\n");1098print_list(s->ReadyRGB);1099fprintf(stderr, "Alpha:\n");1100print_list(s->ReadyAlpha);1101fprintf(stderr, "TEX:\n");1102print_list(s->ReadyTEX);1103#endif11041105for (tex_ptr = s->ReadyTEX; tex_ptr; tex_ptr = tex_ptr->NextReady) {1106if (tex_ptr->Instruction->U.I.Opcode == RC_OPCODE_KIL) {1107emit_all_tex(s, before);1108return;1109}1110tex_count++;1111}1112update_max_score(s, &s->ReadyFullALU, &max_score, &max_inst, &max_list);1113update_max_score(s, &s->ReadyRGB, &max_score, &max_inst, &max_list);1114update_max_score(s, &s->ReadyAlpha, &max_score, &max_inst, &max_list);11151116if (tex_count >= s->max_tex_group || max_score == -11117|| (s->TEXCount > 0 && tex_count == s->TEXCount)1118|| (!s->C->is_r500 && tex_count > 0 && max_score == -1)) {1119emit_all_tex(s, before);1120} else {112111221123remove_inst_from_list(max_list, max_inst);1124rc_insert_instruction(before->Prev, max_inst->Instruction);1125commit_alu_instruction(s, max_inst);11261127presub_nop(before->Prev);1128}1129}11301131static void add_tex_reader(1132struct schedule_state * s,1133struct schedule_instruction * writer,1134struct schedule_instruction * reader)1135{1136if (!writer || writer->Instruction->Type != RC_INSTRUCTION_NORMAL) {1137/*Not a TEX instructions */1138return;1139}1140reader->TexReadCount++;1141rc_list_add(&writer->TexReaders, rc_list(&s->C->Pool, reader));1142}11431144static void scan_read(void * data, struct rc_instruction * inst,1145rc_register_file file, unsigned int index, unsigned int chan)1146{1147struct schedule_state * s = data;1148struct reg_value ** v = get_reg_valuep(s, file, index, chan);1149struct reg_value_reader * reader;11501151if (!v)1152return;11531154if (*v && (*v)->Writer == s->Current) {1155/* The instruction reads and writes to a register component.1156* In this case, we only want to increment dependencies by one.1157* Why?1158* Because each instruction depends on the writers of its source1159* registers _and_ the most recent writer of its destination1160* register. In this case, the current instruction (s->Current)1161* has a dependency that both writes to one of its source1162* registers and was the most recent writer to its destination1163* register. We have already marked this dependency in1164* scan_write(), so we don't need to do it again.1165*/11661167/* We need to make sure we are adding s->Current to the1168* previous writer's list of TexReaders, if the previous writer1169* was a TEX instruction.1170*/1171add_tex_reader(s, s->PrevWriter[chan], s->Current);11721173return;1174}11751176DBG("%i: read %i[%i] chan %i\n", s->Current->Instruction->IP, file, index, chan);11771178reader = memory_pool_malloc(&s->C->Pool, sizeof(*reader));1179reader->Reader = s->Current;1180if (!*v) {1181/* In this situation, the instruction reads from a register1182* that hasn't been written to or read from in the current1183* block. */1184*v = memory_pool_malloc(&s->C->Pool, sizeof(struct reg_value));1185memset(*v, 0, sizeof(struct reg_value));1186(*v)->Readers = reader;1187} else {1188reader->Next = (*v)->Readers;1189(*v)->Readers = reader;1190/* Only update the current instruction's dependencies if the1191* register it reads from has been written to in this block. */1192if ((*v)->Writer) {1193add_tex_reader(s, (*v)->Writer, s->Current);1194s->Current->NumDependencies++;1195}1196}1197(*v)->NumReaders++;11981199if (s->Current->NumReadValues >= 12) {1200rc_error(s->C, "%s: NumReadValues overflow\n", __FUNCTION__);1201} else {1202s->Current->ReadValues[s->Current->NumReadValues++] = *v;1203}1204}12051206static void scan_write(void * data, struct rc_instruction * inst,1207rc_register_file file, unsigned int index, unsigned int chan)1208{1209struct schedule_state * s = data;1210struct reg_value ** pv = get_reg_valuep(s, file, index, chan);1211struct reg_value * newv;12121213if (!pv)1214return;12151216DBG("%i: write %i[%i] chan %i\n", s->Current->Instruction->IP, file, index, chan);12171218newv = memory_pool_malloc(&s->C->Pool, sizeof(*newv));1219memset(newv, 0, sizeof(*newv));12201221newv->Writer = s->Current;12221223if (*pv) {1224(*pv)->Next = newv;1225s->Current->NumDependencies++;1226/* Keep track of the previous writer to s->Current's destination1227* register */1228s->PrevWriter[chan] = (*pv)->Writer;1229}12301231*pv = newv;12321233if (s->Current->NumWriteValues >= 4) {1234rc_error(s->C, "%s: NumWriteValues overflow\n", __FUNCTION__);1235} else {1236s->Current->WriteValues[s->Current->NumWriteValues++] = newv;1237}1238}12391240static void is_rgb_to_alpha_possible_normal(1241void * userdata,1242struct rc_instruction * inst,1243struct rc_src_register * src)1244{1245struct rc_reader_data * reader_data = userdata;1246reader_data->Abort = 1;12471248}12491250static void schedule_block(struct schedule_state * s,1251struct rc_instruction * begin, struct rc_instruction * end)1252{1253unsigned int ip;12541255/* Scan instructions for data dependencies */1256ip = 0;1257for(struct rc_instruction * inst = begin; inst != end; inst = inst->Next) {1258s->Current = memory_pool_malloc(&s->C->Pool, sizeof(*s->Current));1259memset(s->Current, 0, sizeof(struct schedule_instruction));12601261if (inst->Type == RC_INSTRUCTION_NORMAL) {1262const struct rc_opcode_info * info =1263rc_get_opcode_info(inst->U.I.Opcode);1264if (info->HasTexture) {1265s->TEXCount++;1266}1267}12681269/* XXX: This causes SemWait to be set for all instructions in1270* a block if the previous block contained a TEX instruction.1271* We can do better here, but it will take a lot of work. */1272if (s->PrevBlockHasTex) {1273s->Current->TexReadCount = 1;1274}12751276s->Current->Instruction = inst;1277inst->IP = ip++;12781279DBG("%i: Scanning\n", inst->IP);12801281/* The order of things here is subtle and maybe slightly1282* counter-intuitive, to account for the case where an1283* instruction writes to the same register as it reads1284* from. */1285rc_for_all_writes_chan(inst, &scan_write, s);1286rc_for_all_reads_chan(inst, &scan_read, s);12871288DBG("%i: Has %i dependencies\n", inst->IP, s->Current->NumDependencies);12891290if (!s->Current->NumDependencies) {1291instruction_ready(s, s->Current);1292}12931294/* Get global readers for possible RGB->Alpha conversion. */1295s->Current->GlobalReaders.ExitOnAbort = 1;1296rc_get_readers(s->C, inst, &s->Current->GlobalReaders,1297is_rgb_to_alpha_possible_normal,1298is_rgb_to_alpha_possible, NULL);1299}13001301/* Temporarily unlink all instructions */1302begin->Prev->Next = end;1303end->Prev = begin->Prev;13041305/* Schedule instructions back */1306while(!s->C->Error &&1307(s->ReadyTEX || s->ReadyRGB || s->ReadyAlpha || s->ReadyFullALU)) {1308emit_instruction(s, end);1309}1310}13111312static int is_controlflow(struct rc_instruction * inst)1313{1314if (inst->Type == RC_INSTRUCTION_NORMAL) {1315const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);1316return opcode->IsFlowControl;1317}1318return 0;1319}13201321void rc_pair_schedule(struct radeon_compiler *cc, void *user)1322{1323struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)cc;1324struct schedule_state s;1325struct rc_instruction * inst = c->Base.Program.Instructions.Next;1326unsigned int * opt = user;13271328memset(&s, 0, sizeof(s));1329s.Opt = *opt;1330s.C = &c->Base;1331if (s.C->is_r500) {1332s.CalcScore = calc_score_readers;1333} else {1334s.CalcScore = calc_score_r300;1335}1336s.max_tex_group = debug_get_num_option("RADEON_TEX_GROUP", 8);1337while(inst != &c->Base.Program.Instructions) {1338struct rc_instruction * first;13391340if (is_controlflow(inst)) {1341inst = inst->Next;1342continue;1343}13441345first = inst;13461347while(inst != &c->Base.Program.Instructions && !is_controlflow(inst))1348inst = inst->Next;13491350DBG("Schedule one block\n");1351memset(s.Temporary, 0, sizeof(s.Temporary));1352s.TEXCount = 0;1353schedule_block(&s, first, inst);1354if (s.PendingTEX) {1355s.PrevBlockHasTex = 1;1356}1357}1358}135913601361